def __init__(self, nums=20, huber_delta=1., lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=1000,
             network_settings=[128, 128], **kwargs):
    assert nums > 0, 'assert nums > 0'
    super().__init__(**kwargs)
    assert not self.is_continuous, 'qrdqn only support discrete action space'
    self.nums = nums
    self.huber_delta = huber_delta
    self.quantiles = th.tensor(
        (2 * np.arange(self.nums) + 1) / (2.0 * self.nums)).float().to(self.device)  # [N,]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.q_net = TargetTwin(
        QrdqnDistributional(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            action_dim=self.a_dim,
                            nums=self.nums,
                            network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
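# A minimal, self-contained sketch (not this class's _train) of how the quantile
# midpoints built above are typically combined with the quantile-Huber loss in
# QR-DQN. Tensor names and shapes are illustrative assumptions, not the framework's API.
import torch as th
import torch.nn.functional as F

def quantile_huber_loss(pred_quantiles, target_quantiles, taus, delta=1.0):
    """pred_quantiles: [B, N], target_quantiles: [B, N'], taus: [N,]."""
    # pairwise TD errors between every target sample and every predicted quantile: [B, N', N]
    td = target_quantiles.unsqueeze(-1) - pred_quantiles.unsqueeze(-2)
    huber = F.huber_loss(pred_quantiles.unsqueeze(-2).expand_as(td),
                         target_quantiles.unsqueeze(-1).expand_as(td),
                         reduction='none', delta=delta)
    # asymmetric weight |tau - 1{td < 0}|, taus aligned with the predicted-quantile axis
    weight = (taus.view(1, 1, -1) - (td.detach() < 0).float()).abs()
    # mean over target samples, sum over predicted quantiles, mean over the batch
    return (weight * huber).mean(-2).sum(-1).mean()

taus = (2 * th.arange(20) + 1) / (2.0 * 20)  # same midpoints as self.quantiles above
loss = quantile_huber_loss(th.randn(8, 20), th.randn(8, 20), taus)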
def __init__(self,
             lr: float = 5.0e-4,
             eps_init: float = 1,
             eps_mid: float = 0.2,
             eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000,
             assign_interval: int = 1000,
             network_settings: List[int] = [32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'dqn only support discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.q_net = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dim,
                        network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net)
    self._trainer_modules.update(oplr=self.oplr)
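# Self-contained sketch of the epsilon-greedy rule that the exploration manager set
# up above drives: with probability eps (annealed from eps_init toward eps_mid and
# eps_final) take a random action, otherwise the argmax of the Q-values. Names here
# are illustrative, not the framework's API.
import numpy as np

def epsilon_greedy(q_values: np.ndarray, eps: float, rng=np.random) -> int:
    """q_values: [A,] action values; eps: current exploration probability."""
    if rng.uniform() < eps:
        return int(rng.randint(0, len(q_values)))  # explore
    return int(np.argmax(q_values))                # exploit

# Example: early in training eps is near eps_init=1 (mostly random actions);
# after init2mid_annealing_step it has decayed toward eps_mid=0.2.
action = epsilon_greedy(np.array([0.1, 0.5, -0.2]), eps=0.2)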
def __init__(self, lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             network_settings={'share': [128], 'v': [128], 'adv': [128]},
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'dueling double dqn only support discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.q_net = TargetTwin(
        CriticDueling(self.obs_spec,
                      rep_net_params=self._rep_net_params,
                      output_shape=self.a_dim,
                      network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
def __init__(self, envspec, v_min=-10, v_max=10, atoms=51, lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             network_settings={'share': [128], 'v': [128], 'adv': [128]},
             **kwargs):
    assert not envspec.is_continuous, 'rainbow only support discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.v_min = v_min
    self.v_max = v_max
    self.atoms = atoms
    self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
    self.z = tf.reshape(
        tf.constant([self.v_min + i * self.delta_z for i in range(self.atoms)],
                    dtype=tf.float32),
        [-1, self.atoms])  # [1, N]
    self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.RAINBOW_DUELING,
            value_net_kwargs=dict(action_dim=self.a_dim, atoms=self.atoms,
                                  network_settings=network_settings))

    self.rainbow_net = _create_net('rainbow_net', self._representation_net)
    self._representation_target_net = self._create_representation_net('_representation_target_net')
    self.rainbow_target_net = _create_net('rainbow_target_net', self._representation_target_net)
    update_target_net_weights(self.rainbow_target_net.weights, self.rainbow_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self._worker_params_dict.update(self.rainbow_net._policy_models)
    self._all_params_dict.update(self.rainbow_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def __init__(self, envspec, mode='q', lr=0.2, eps_init=1, eps_mid=0.2,
             eps_final=0.01, init2mid_annealing_step=1000, **kwargs):
    assert not envspec.is_continuous
    # NOTE: the original flattened snippet referenced undefined `s_dim`/`a_dim`;
    # assuming here that the tabular state/action sizes come from `envspec`.
    s_dim, a_dim = envspec.s_dim, envspec.a_dim
    assert not hasattr(s_dim, '__len__')
    self.mode = mode
    self.s_dim = s_dim
    self.a_dim = a_dim
    self.gamma = float(kwargs.get('gamma', 0.999))
    self.max_train_step = int(kwargs.get('max_train_step', 1000))
    self.step = 0
    self.train_step = 0
    self.n_agents = int(kwargs.get('n_agents', 0))
    if self.n_agents <= 0:
        raise ValueError('agents num must larger than zero.')
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.table = np.zeros(shape=(self.s_dim, self.a_dim))
    self.lr = lr
    self.next_a = np.zeros(self.n_agents, dtype=np.int32)
    self.mask = []
    ion()  # matplotlib interactive mode for the periodic Q-table heatmap
def __init__(self, lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=1000,
             head_num=4, network_settings=[32, 32], **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'Bootstrapped DQN only support discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.head_num = head_num
    self._probs = th.FloatTensor([1. / head_num for _ in range(head_num)])
    self.now_head = 0
    self.q_net = TargetTwin(
        CriticQvalueBootstrap(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              output_shape=self.a_dim,
                              head_num=self.head_num,
                              network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
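# Sketch of how the uniform head probabilities set up above are commonly used in
# Bootstrapped DQN: sample one head per episode and act greedily with that head's
# Q-values. This is illustrative only; the repo's own episode hook is not shown here.
import torch as th

head_num = 4
probs = th.full((head_num,), 1.0 / head_num)                     # matches self._probs above
now_head = th.distributions.Categorical(probs).sample().item()   # pick a head for the episode

q_all_heads = th.randn(head_num, 5, 3)                           # [head, B, A] dummy Q-values
greedy_actions = q_all_heads[now_head].argmax(-1)                # act with the sampled head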
def __init__(self, envspec, online_quantiles=8, target_quantiles=8,
             select_quantiles=32, quantiles_idx=64, huber_delta=1., lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             network_settings={'q_net': [128, 64], 'quantile': [128, 64], 'tile': [64]},
             **kwargs):
    assert not envspec.is_continuous, 'iqn only support discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.pi = tf.constant(np.pi)
    self.online_quantiles = online_quantiles
    self.target_quantiles = target_quantiles
    self.select_quantiles = select_quantiles
    self.quantiles_idx = quantiles_idx
    self.huber_delta = huber_delta
    self.assign_interval = assign_interval
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.IQN_NET,
            value_net_kwargs=dict(action_dim=self.a_dim,
                                  quantiles_idx=self.quantiles_idx,
                                  network_settings=network_settings))

    self.q_net = _create_net('q_net', self._representation_net)
    self._representation_target_net = self._create_representation_net('_representation_target_net')
    self.q_target_net = _create_net('q_target_net', self._representation_target_net)
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self._worker_params_dict.update(self.q_net._policy_models)
    self._all_params_dict.update(self.q_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def __init__(self, envspec, alpha=0.2, beta=0.1, ployak=0.995,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, use_epsilon=False,
             q_lr=5.0e-4, alpha_lr=5.0e-4, auto_adaption=True,
             network_settings=[32, 32], **kwargs):
    assert not envspec.is_continuous, 'maxsqn only support discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.use_epsilon = use_epsilon
    self.ployak = ployak
    self.log_alpha = alpha if not auto_adaption else tf.Variable(
        initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
    self.auto_adaption = auto_adaption
    self.target_entropy = beta * np.log(self.a_dim)

    def _create_net(name, representation_net=None):
        return DoubleValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  network_settings=network_settings))

    self.critic_net = _create_net('critic_net', self._representation_net)
    self._representation_target_net = self._create_representation_net('_representation_target_net')
    self.critic_target_net = _create_net('critic_target_net', self._representation_target_net)
    update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights)
    self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
    self.optimizer_critic, self.optimizer_alpha = map(self.init_optimizer, [self.q_lr, self.alpha_lr])
    self._worker_params_dict.update(self.critic_net._policy_models)
    self._all_params_dict.update(self.critic_net._all_models)
    self._all_params_dict.update(optimizer_critic=self.optimizer_critic,
                                 optimizer_alpha=self.optimizer_alpha)
    self._model_post_process()
def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous,
             online_quantiles=8, target_quantiles=8, select_quantiles=32,
             quantiles_idx=64, huber_delta=1., lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             hidden_units={'q_net': [128, 64], 'quantile': [128, 64], 'tile': [64]},
             **kwargs):
    assert not is_continuous, 'iqn only support discrete action space'
    super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                     visual_resolution=visual_resolution, a_dim=a_dim,
                     is_continuous=is_continuous, **kwargs)
    self.pi = tf.constant(np.pi)
    self.online_quantiles = online_quantiles
    self.target_quantiles = target_quantiles
    self.select_quantiles = select_quantiles
    self.quantiles_idx = quantiles_idx
    self.huber_delta = huber_delta
    self.assign_interval = assign_interval
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)

    def _net():
        return NetWork(self.feat_dim, self.a_dim, self.quantiles_idx, hidden_units)

    self.q_net = _net()
    self.q_target_net = _net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous,
             alpha=0.2, beta=0.1, ployak=0.995,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, use_epsilon=False,
             q_lr=5.0e-4, alpha_lr=5.0e-4, auto_adaption=True,
             hidden_units=[32, 32], **kwargs):
    assert not is_continuous, 'maxsqn only support discrete action space'
    super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                     visual_resolution=visual_resolution, a_dim=a_dim,
                     is_continuous=is_continuous, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.use_epsilon = use_epsilon
    self.ployak = ployak
    self.log_alpha = alpha if not auto_adaption else tf.Variable(
        initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
    self.auto_adaption = auto_adaption
    self.target_entropy = beta * np.log(self.a_dim)

    def _q_net():
        return Critic(self.feat_dim, self.a_dim, hidden_units)

    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights)
    self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
    self.optimizer_critic, self.optimizer_alpha = map(self.init_optimizer, [self.q_lr, self.alpha_lr])
    self.model_recorder(dict(critic_net=self.critic_net,
                             optimizer_critic=self.optimizer_critic,
                             optimizer_alpha=self.optimizer_alpha))
def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous,
             v_min=-10, v_max=10, atoms=51, lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             hidden_units={'share': [128], 'v': [128], 'adv': [128]},
             **kwargs):
    assert not is_continuous, 'rainbow only support discrete action space'
    super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                     visual_resolution=visual_resolution, a_dim=a_dim,
                     is_continuous=is_continuous, **kwargs)
    self.v_min = v_min
    self.v_max = v_max
    self.atoms = atoms
    self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
    self.z = tf.reshape(
        tf.constant([self.v_min + i * self.delta_z for i in range(self.atoms)],
                    dtype=tf.float32),
        [-1, self.atoms])  # [1, N]
    self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _net():
        return NetWork(self.feat_dim, self.a_dim, self.atoms, hidden_units)

    self.rainbow_net = _net()
    self.rainbow_target_net = _net()
    self.critic_tv = self.rainbow_net.trainable_variables + self.other_tv
    update_target_net_weights(self.rainbow_target_net.weights, self.rainbow_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.rainbow_net, optimizer=self.optimizer))
def __init__(self, envspec, nums=20, huber_delta=1., lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=1000,
             network_settings=[128, 128], **kwargs):
    assert not envspec.is_continuous, 'qrdqn only support discrete action space'
    assert nums > 0
    super().__init__(envspec=envspec, **kwargs)
    self.nums = nums
    self.huber_delta = huber_delta
    self.quantiles = tf.reshape(
        tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums), dtype=tf.float32),
        [-1, self.nums])  # [1, N]
    self.batch_quantiles = tf.tile(self.quantiles, [self.a_dim, 1])  # [1, N] => [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.QRDQN_DISTRIBUTIONAL,
            value_net_kwargs=dict(action_dim=self.a_dim, nums=self.nums,
                                  network_settings=network_settings))

    self.q_dist_net = _create_net('q_dist_net', self._representation_net)
    self._representation_target_net = self._create_representation_net('_representation_target_net')
    self.q_target_dist_net = _create_net('q_target_dist_net', self._representation_target_net)
    update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self._worker_params_dict.update(self.q_dist_net._policy_models)
    self._all_params_dict.update(self.q_dist_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous,
             nums=20, huber_delta=1., lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=1000,
             hidden_units=[128, 128], **kwargs):
    assert not is_continuous, 'qrdqn only support discrete action space'
    assert nums > 0
    super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                     visual_resolution=visual_resolution, a_dim=a_dim,
                     is_continuous=is_continuous, **kwargs)
    self.nums = nums
    self.huber_delta = huber_delta
    self.quantiles = tf.reshape(
        tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums), dtype=tf.float32),
        [-1, self.nums])  # [1, N]
    self.batch_quantiles = tf.tile(self.quantiles, [self.a_dim, 1])  # [1, N] => [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _net():
        return NetWork(self.feat_dim, self.a_dim, self.nums, hidden_units)

    self.q_dist_net = _net()
    self.q_target_dist_net = _net()
    self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_dist_net, optimizer=self.optimizer))
def __init__(self, envspec, target_k: int = 4, lr: float = 5.0e-4,
             eps_init: float = 1, eps_mid: float = 0.2, eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000, assign_interval: int = 1000,
             network_settings: List[int] = [32, 32], **kwargs):
    assert not envspec.is_continuous, 'dqn only support discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval
    self.target_k = target_k
    assert self.target_k > 0, "assert self.target_k > 0"
    self.target_nets = []
    self.current_target_idx = 0

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  network_settings=network_settings))

    self.q_net = _create_net('dqn_q_net', self._representation_net)
    for i in range(self.target_k):
        target_q_net = _create_net(
            'dqn_q_target_net' + str(i),
            self._create_representation_net('_representation_target_net' + str(i)))
        update_target_net_weights(target_q_net.weights, self.q_net.weights)
        self.target_nets.append(target_q_net)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self._worker_params_dict.update(self.q_net._policy_models)
    self._all_params_dict.update(self.q_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def __init__(self, mixer='vdn', mixer_settings={}, lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01, use_double=True,
             init2mid_annealing_step=1000, assign_interval=1000,
             network_settings={'share': [128], 'v': [128], 'adv': [128]},
             **kwargs):
    super().__init__(**kwargs)
    assert not any(list(self.is_continuouss.values())), 'VDN only support discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self._use_double = use_double
    self._mixer_type = mixer
    self._mixer_settings = mixer_settings
    self.q_nets = {}
    for id in set(self.model_ids):
        self.q_nets[id] = TargetTwin(
            CriticDueling(self.obs_specs[id],
                          rep_net_params=self._rep_net_params,
                          output_shape=self.a_dims[id],
                          network_settings=network_settings)).to(self.device)
    self.mixer = self._build_mixer()
    self.oplr = OPLR(tuple(self.q_nets.values()) + (self.mixer,), lr, **self._oplr_params)
    self._trainer_modules.update({f"model_{id}": self.q_nets[id] for id in set(self.model_ids)})
    self._trainer_modules.update(mixer=self.mixer, oplr=self.oplr)
def __init__(self, envspec, lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=1000,
             head_num=4, network_settings=[32, 32], **kwargs):
    assert not envspec.is_continuous, 'Bootstrapped DQN only support discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval
    self.head_num = head_num
    self._probs = [1. / head_num for _ in range(head_num)]
    self.now_head = 0

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_BOOTSTRAP,
            value_net_kwargs=dict(output_shape=self.a_dim, head_num=self.head_num,
                                  network_settings=network_settings))

    self.q_net = _create_net('q_net', self._representation_net)
    self._representation_target_net = self._create_representation_net('_representation_target_net')
    self.q_target_net = _create_net('q_target_net', self._representation_target_net)
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self._worker_params_dict.update(self.q_net._policy_models)
    self._all_params_dict.update(self.q_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def __init__(self, envspec, lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             network_settings={'share': [128], 'v': [128], 'adv': [128]},
             **kwargs):
    assert not envspec.is_continuous, 'dueling double dqn only support discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _create_net(name, representation_net):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_DUELING,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  network_settings=network_settings))

    self.dueling_net = _create_net('dueling_net', self._representation_net)
    self._representation_target_net = self._create_representation_net('_representation_target_net')
    self.dueling_target_net = _create_net('dueling_target_net', self._representation_target_net)
    update_target_net_weights(self.dueling_target_net.weights, self.dueling_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self._worker_params_dict.update(self.dueling_net._policy_models)
    self._all_params_dict.update(self.dueling_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous,
             lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             hidden_units={'share': [128], 'v': [128], 'adv': [128]},
             **kwargs):
    assert not is_continuous, 'dueling double dqn only support discrete action space'
    super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                     visual_resolution=visual_resolution, a_dim=a_dim,
                     is_continuous=is_continuous, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _net():
        return NetWork(self.feat_dim, self.a_dim, hidden_units)

    self.dueling_net = _net()
    self.dueling_target_net = _net()
    self.critic_tv = self.dueling_net.trainable_variables + self.other_tv
    update_target_net_weights(self.dueling_target_net.weights, self.dueling_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.dueling_net, optimizer=self.optimizer))
def __init__(self, alpha=0.2, beta=0.1, polyak=0.995,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, use_epsilon=False,
             q_lr=5.0e-4, alpha_lr=5.0e-4, auto_adaption=True,
             network_settings=[32, 32], **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'maxsqn only support discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.use_epsilon = use_epsilon
    self.polyak = polyak
    self.auto_adaption = auto_adaption
    self.target_entropy = beta * np.log(self.a_dim)
    self.critic = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dim,
                        network_settings=network_settings),
        self.polyak).to(self.device)
    self.critic2 = deepcopy(self.critic)
    self.critic_oplr = OPLR([self.critic, self.critic2], q_lr, **self._oplr_params)
    if self.auto_adaption:
        self.log_alpha = th.tensor(0., requires_grad=True).to(self.device)
        self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
        self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
    else:
        self.log_alpha = th.tensor(alpha).log().to(self.device)
    self._trainer_modules.update(critic=self.critic,
                                 critic2=self.critic2,
                                 log_alpha=self.log_alpha,
                                 critic_oplr=self.critic_oplr)
def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous,
             lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=1000,
             head_num=4, hidden_units=[32, 32], **kwargs):
    assert not is_continuous, 'Bootstrapped DQN only support discrete action space'
    super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                     visual_resolution=visual_resolution, a_dim=a_dim,
                     is_continuous=is_continuous, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval
    self.head_num = head_num
    self._probs = [1. / head_num for _ in range(head_num)]
    self.now_head = 0

    def _q_net():
        return NetWork(self.feat_dim, self.a_dim, self.head_num, hidden_units)

    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
def __init__(self,
             s_dim: Union[int, np.ndarray],
             visual_sources: Union[int, np.ndarray],
             visual_resolution: Union[List, np.ndarray],
             a_dim: Union[int, np.ndarray],
             is_continuous: Union[bool, np.ndarray],
             lr: float = 5.0e-4,
             eps_init: float = 1,
             eps_mid: float = 0.2,
             eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000,
             assign_interval: int = 1000,
             hidden_units: List[int] = [32, 32],
             **kwargs):
    assert not is_continuous, 'dqn only support discrete action space'
    super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                     visual_resolution=visual_resolution, a_dim=a_dim,
                     is_continuous=is_continuous, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _q_net():
        return NetWork(self.feat_dim, self.a_dim, hidden_units)

    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
def __init__(self, v_min=-10, v_max=10, atoms=51, lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             network_settings={'share': [128], 'v': [128], 'adv': [128]},
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'rainbow only support discrete action space'
    self._v_min = v_min
    self._v_max = v_max
    self._atoms = atoms
    self._delta_z = (self._v_max - self._v_min) / (self._atoms - 1)
    self._z = th.linspace(self._v_min, self._v_max, self._atoms).float().to(self.device)  # [N,]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.rainbow_net = TargetTwin(
        RainbowDueling(self.obs_spec,
                       rep_net_params=self._rep_net_params,
                       action_dim=self.a_dim,
                       atoms=self._atoms,
                       network_settings=network_settings)).to(self.device)
    self.rainbow_net.target.train()  # so that NoisyLinear takes effect
    self.oplr = OPLR(self.rainbow_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.rainbow_net, oplr=self.oplr)
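# Sketch of what the support vector self._z built above is for: the distributional
# head outputs a categorical distribution over the atoms, and the scalar Q-value is
# the expectation of that distribution on the support. Shapes and names below are
# illustrative dummies, independent of the class.
import torch as th

v_min, v_max, atoms = -10.0, 10.0, 51
z = th.linspace(v_min, v_max, atoms)            # [N,] same construction as self._z
probs = th.softmax(th.randn(2, 4, atoms), -1)   # [B, A, N] dummy categorical output
q = (probs * z).sum(-1)                         # [B, A] expected value per action
greedy_action = q.argmax(-1)                    # [B,]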
def __init__(self, online_quantiles=8, target_quantiles=8, select_quantiles=32,
             quantiles_idx=64, huber_delta=1., lr=5.0e-4,
             eps_init=1, eps_mid=0.2, eps_final=0.01,
             init2mid_annealing_step=1000, assign_interval=2,
             network_settings={'q_net': [128, 64], 'quantile': [128, 64], 'tile': [64]},
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'iqn only support discrete action space'
    self.online_quantiles = online_quantiles
    self.target_quantiles = target_quantiles
    self.select_quantiles = select_quantiles
    self.quantiles_idx = quantiles_idx
    self.huber_delta = huber_delta
    self.assign_interval = assign_interval
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.q_net = TargetTwin(
        IqnNet(self.obs_spec,
               rep_net_params=self._rep_net_params,
               action_dim=self.a_dim,
               quantiles_idx=self.quantiles_idx,
               network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
def __init__(self, target_k: int = 4, lr: float = 5.0e-4,
             eps_init: float = 1, eps_mid: float = 0.2, eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000, assign_interval: int = 1000,
             network_settings: List[int] = [32, 32], **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'dqn only support discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.target_k = target_k
    assert self.target_k > 0, "assert self.target_k > 0"
    self.current_target_idx = 0
    self.q_net = CriticQvalueAll(self.obs_spec,
                                 rep_net_params=self._rep_net_params,
                                 output_shape=self.a_dim,
                                 network_settings=network_settings).to(self.device)
    self.target_nets = []
    for i in range(self.target_k):
        target_q_net = deepcopy(self.q_net)
        target_q_net.eval()
        sync_params(target_q_net, self.q_net)
        self.target_nets.append(target_q_net)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
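# Sketch of the Averaged-DQN idea the K target copies above enable: the bootstrap
# target averages the Q-estimates of the last K snapshots to reduce target-value
# variance. Self-contained with dummy tensors; this class's _train body is not
# shown in this excerpt, so names here are illustrative.
import torch as th

target_k, B, A = 4, 8, 3
gamma = 0.99
q_next_per_snapshot = [th.randn(B, A) for _ in range(target_k)]   # one per target net
q_next_avg = th.stack(q_next_per_snapshot, 0).mean(0)             # [B, A] averaged estimate
reward, done = th.randn(B, 1), th.zeros(B, 1)
q_target = reward + gamma * (1 - done) * q_next_avg.max(-1, keepdim=True)[0]  # [B, 1]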
class MAXSQN(SarlOffPolicy):
    """
    https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py
    """
    policy_mode = 'off-policy'

    def __init__(self, alpha=0.2, beta=0.1, polyak=0.995,
                 eps_init=1, eps_mid=0.2, eps_final=0.01,
                 init2mid_annealing_step=1000, use_epsilon=False,
                 q_lr=5.0e-4, alpha_lr=5.0e-4, auto_adaption=True,
                 network_settings=[32, 32], **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'maxsqn only support discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.use_epsilon = use_epsilon
        self.polyak = polyak
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)
        self.critic = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.a_dim,
                            network_settings=network_settings),
            self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)
        self.critic_oplr = OPLR([self.critic, self.critic2], q_lr, **self._oplr_params)
        if self.auto_adaption:
            self.log_alpha = th.tensor(0., requires_grad=True).to(self.device)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)
        self._trainer_modules.update(critic=self.critic,
                                     critic2=self.critic2,
                                     log_alpha=self.log_alpha,
                                     critic_oplr=self.critic_oplr)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    @iton
    def select_action(self, obs):
        q = self.critic(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.critic.get_rnncs()
        if self.use_epsilon and self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            cate_dist = td.Categorical(logits=(q / self.alpha))
            mu = q.argmax(-1)  # [B,]
            actions = pi = cate_dist.sample()  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q1 = self.critic(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q2 = self.critic2(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q1_eval = (q1 * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        q2_eval = (q2 * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]

        q1_log_probs = (q1 / (self.alpha + th.finfo().eps)).log_softmax(-1)  # [T, B, A]
        q1_entropy = -(q1_log_probs.exp() * q1_log_probs).sum(-1, keepdim=True).mean()  # 1

        q1_target = self.critic.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q2_target = self.critic2.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q1_target_max = q1_target.max(-1, keepdim=True)[0]  # [T, B, 1]
        q1_target_log_probs = (q1_target / (self.alpha + th.finfo().eps)).log_softmax(-1)  # [T, B, A]
        q1_target_entropy = -(q1_target_log_probs.exp() * q1_target_log_probs).sum(-1, keepdim=True)  # [T, B, 1]

        q2_target_max = q2_target.max(-1, keepdim=True)[0]  # [T, B, 1]
        # q2_target_log_probs = q2_target.log_softmax(-1)
        # q2_target_log_max = q2_target_log_probs.max(1, keepdim=True)[0]

        q_target = th.minimum(q1_target_max, q2_target_max) + self.alpha * q1_target_entropy  # [T, B, 1]
        dc_r = n_step_return(BATCH.reward, self.gamma, BATCH.done, q_target,
                             BATCH.begin_mask).detach()  # [T, B, 1]
        td_error1 = q1_eval - dc_r  # [T, B, 1]
        td_error2 = q2_eval - dc_r  # [T, B, 1]
        q1_loss = (td_error1.square() * BATCH.get('isw', 1.0)).mean()  # 1
        q2_loss = (td_error2.square() * BATCH.get('isw', 1.0)).mean()  # 1
        loss = 0.5 * (q1_loss + q2_loss)
        self.critic_oplr.optimize(loss)

        summaries = {
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/loss': loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/q1_entropy': q1_entropy,
            'Statistics/q_min': th.minimum(q1, q2).mean(),
            'Statistics/q_mean': q1.mean(),
            'Statistics/q_max': th.maximum(q1, q2).mean()
        }
        if self.auto_adaption:
            alpha_loss = -(self.alpha * (self.target_entropy - q1_entropy).detach()).mean()
            self.alpha_oplr.optimize(alpha_loss)
            summaries.update({
                'LOSS/alpha_loss': alpha_loss,
                'LEARNING_RATE/alpha_lr': self.alpha_oplr.lr
            })
        return (td_error1 + td_error2) / 2, summaries

    def _after_train(self):
        super()._after_train()
        self.critic.sync()
        self.critic2.sync()
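# Self-contained sketch of the soft target value that MAXSQN._train assembles above:
# the clipped-double max-Q plus an alpha-weighted entropy of the Boltzmann policy
# induced by Q1 / alpha. Dummy tensors; names are illustrative, not the repo's API.
import torch as th

alpha = 0.2
q1_t, q2_t = th.randn(8, 3), th.randn(8, 3)                # [B, A] target-net Q-values
log_pi = (q1_t / alpha).log_softmax(-1)                    # Boltzmann policy from Q1
entropy = -(log_pi.exp() * log_pi).sum(-1, keepdim=True)   # [B, 1]
v_soft = th.minimum(q1_t.max(-1, keepdim=True)[0],
                    q2_t.max(-1, keepdim=True)[0]) + alpha * entropy  # [B, 1] soft value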
class C51(SarlOffPolicy):
    """
    Category 51, https://arxiv.org/abs/1707.06887
    No double, no dueling, no noisy net.
    """
    policy_mode = 'off-policy'

    def __init__(self, v_min=-10, v_max=10, atoms=51, lr=5.0e-4,
                 eps_init=1, eps_mid=0.2, eps_final=0.01,
                 init2mid_annealing_step=1000, assign_interval=1000,
                 network_settings=[128, 128], **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'c51 only support discrete action space'
        self._v_min = v_min
        self._v_max = v_max
        self._atoms = atoms
        self._delta_z = (self._v_max - self._v_min) / (self._atoms - 1)
        self._z = th.linspace(self._v_min, self._v_max, self._atoms).float().to(self.device)  # [N,]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.q_net = TargetTwin(
            C51Distributional(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              action_dim=self.a_dim,
                              atoms=self._atoms,
                              network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        feat = self.q_net(obs, rnncs=self.rnncs)  # [B, A, N]
        self.rnncs_ = self.q_net.get_rnncs()
        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            q = (self._z * feat).sum(-1)  # [B, A, N] * [N,] => [B, A]
            actions = q.argmax(-1)  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q_dist = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A, N]
        # [T, B, A, N] * [T, B, A, 1] => [T, B, A, N] => [T, B, N]
        q_dist = (q_dist * BATCH.action.unsqueeze(-1)).sum(-2)
        q_eval = (q_dist * self._z).sum(-1)  # [T, B, N] * [N,] => [T, B]

        target_q_dist = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A, N]
        # [T, B, A, N] * [1, N] => [T, B, A]
        target_q = (target_q_dist * self._z).sum(-1)
        a_ = target_q.argmax(-1)  # [T, B]
        a_onehot = F.one_hot(a_, self.a_dim).float()  # [T, B, A]
        # [T, B, A, N] * [T, B, A, 1] => [T, B, A, N] => [T, B, N]
        target_q_dist = (target_q_dist * a_onehot.unsqueeze(-1)).sum(-2)

        target = n_step_return(BATCH.reward.repeat(1, 1, self._atoms),
                               self.gamma,
                               BATCH.done.repeat(1, 1, self._atoms),
                               target_q_dist,
                               BATCH.begin_mask.repeat(1, 1, self._atoms)).detach()  # [T, B, N]
        target = target.clamp(self._v_min, self._v_max)  # [T, B, N]
        # An amazing trick for calculating the projection gracefully.
        # ref: https://github.com/ShangtongZhang/DeepRL
        # NOTE: the atoms and the shifted target values must be unsqueezed on different
        # axes so that a [T, B, N, N] cross-atom weight matrix is formed; the flattened
        # original broadcast both on the last axis, which collapses to [T, B, N, 1] and
        # loses the projection onto neighbouring atoms.
        target_dist = (1 - (target.unsqueeze(-2) - self._z.view(1, 1, -1, 1)).abs()
                       / self._delta_z).clamp(0, 1) * target_q_dist.unsqueeze(-2)  # [T, B, N, N]
        target_dist = target_dist.sum(-1)  # [T, B, N]

        _cross_entropy = -(target_dist * th.log(q_dist + th.finfo().eps)).sum(-1, keepdim=True)  # [T, B, 1]
        loss = (_cross_entropy * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(loss)
        return _cross_entropy, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
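# Self-contained check of the projection trick referenced in C51._train above: each
# Bellman-shifted atom value spreads its probability mass linearly onto the two
# nearest atoms of the fixed support, so total mass is preserved. Dummy numbers;
# independent of the class above.
import torch as th

v_min, v_max, atoms = -10.0, 10.0, 51
delta_z = (v_max - v_min) / (atoms - 1)
z = th.linspace(v_min, v_max, atoms)                            # [N,]
probs = th.softmax(th.randn(1, atoms), -1)                      # [B, N] next-state distribution
target = (1.0 + 0.99 * z).clamp(v_min, v_max).expand(1, atoms)  # [B, N] shifted atom values

# weight[b, i, j] = fraction of atom j's mass that lands on support point i
weight = (1 - (target.unsqueeze(-2) - z.view(1, -1, 1)).abs() / delta_z).clamp(0, 1)  # [B, N, N]
projected = (weight * probs.unsqueeze(-2)).sum(-1)              # [B, N]
assert th.allclose(projected.sum(-1), th.ones(1), atol=1e-5)    # mass is preserved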
class AveragedDQN(Off_Policy):
    '''
    Averaged-DQN, http://arxiv.org/abs/1611.01929
    '''

    def __init__(self, envspec, target_k: int = 4, lr: float = 5.0e-4,
                 eps_init: float = 1, eps_mid: float = 0.2, eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000, assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32], **kwargs):
        assert not envspec.is_continuous, 'dqn only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval
        self.target_k = target_k
        assert self.target_k > 0, "assert self.target_k > 0"
        self.target_nets = []
        self.current_target_idx = 0

        def _create_net(name, representation_net=None):
            return ValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
                value_net_kwargs=dict(output_shape=self.a_dim,
                                      network_settings=network_settings))

        self.q_net = _create_net('dqn_q_net', self._representation_net)
        for i in range(self.target_k):
            target_q_net = _create_net(
                'dqn_q_target_net' + str(i),
                self._create_representation_net('_representation_target_net' + str(i)))
            update_target_net_weights(target_q_net.weights, self.q_net.weights)
            self.target_nets.append(target_q_net)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self._worker_params_dict.update(self.q_net._policy_models)
        self._all_params_dict.update(self.q_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()

    def choose_action(self, obs, evaluation: bool = False) -> np.ndarray:
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(obs, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, obs, cell_state):
        with tf.device(self.device):
            q_values, cell_state = self.q_net(obs, cell_state=cell_state)
            for i in range(1, self.target_k):
                target_q_values, _ = self.target_nets[i](obs, cell_state=cell_state)
                q_values += target_q_values  # summing without averaging also works (same argmax)
        return tf.argmax(q_values, axis=1), cell_state

    def _target_params_update(self):
        if self.global_step % self.assign_interval == 0:
            update_target_net_weights(self.target_nets[self.current_target_idx].weights,
                                      self.q_net.weights)
            self.current_target_idx = (self.current_target_idx + 1) % self.target_k

    def learn(self, **kwargs) -> NoReturn:
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function
    def _train(self, BATCH, isw, cell_state):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                q, _ = self.q_net(BATCH.obs, cell_state=cell_state)
                q_next, _ = self.target_nets[0](BATCH.obs_, cell_state=cell_state)
                for i in range(1, self.target_k):
                    # NOTE: the original evaluated these target heads on BATCH.obs;
                    # the averaged bootstrap should be computed on the next observation.
                    target_q_values, _ = self.target_nets[i](BATCH.obs_, cell_state=cell_state)
                    q_next += target_q_values
                q_next /= self.target_k
                q_eval = tf.reduce_sum(tf.multiply(q, BATCH.action), axis=1, keepdims=True)
                q_target = tf.stop_gradient(
                    BATCH.reward + self.gamma * (1 - BATCH.done) * tf.reduce_max(q_next, axis=1, keepdims=True))
                td_error = q_target - q_eval
                q_loss = tf.reduce_mean(tf.square(td_error) * isw)
            grads = tape.gradient(q_loss, self.q_net.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.q_net.trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', q_loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])
class QS:
    '''
    Q-learning/Sarsa/Expected Sarsa.
    '''

    def __init__(self, envspec, mode='q', lr=0.2, eps_init=1, eps_mid=0.2,
                 eps_final=0.01, init2mid_annealing_step=1000, **kwargs):
        assert not envspec.is_continuous
        # NOTE: the original flattened snippet referenced undefined `s_dim`/`a_dim`;
        # assuming here that the tabular state/action sizes come from `envspec`.
        s_dim, a_dim = envspec.s_dim, envspec.a_dim
        assert not hasattr(s_dim, '__len__')
        self.mode = mode
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.gamma = float(kwargs.get('gamma', 0.999))
        self.max_train_step = int(kwargs.get('max_train_step', 1000))
        self.step = 0
        self.train_step = 0
        self.n_agents = int(kwargs.get('n_agents', 0))
        if self.n_agents <= 0:
            raise ValueError('agents num must larger than zero.')
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.table = np.zeros(shape=(self.s_dim, self.a_dim))
        self.lr = lr
        self.next_a = np.zeros(self.n_agents, dtype=np.int32)
        self.mask = []
        ion()  # matplotlib interactive mode for the periodic Q-table heatmap

    def one_hot2int(self, x):
        idx = [np.where(np.asarray(i))[0][0] for i in x]
        return idx

    def partial_reset(self, done):
        self.mask = np.where(done)[0]

    def choose_action(self, s, visual_s=None, evaluation=False):
        s = self.one_hot2int(s)
        if self.mode == 'q':
            return self._get_action(s, evaluation)
        elif self.mode == 'sarsa' or self.mode == 'expected_sarsa':
            a = self._get_action(s, evaluation)
            self.next_a[self.mask] = a[self.mask]
            return self.next_a

    def _get_action(self, s, evaluation=False, _max=False):
        a = np.array([np.argmax(self.table[i, :]) for i in s])
        if not _max:
            if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
                a = np.random.randint(0, self.a_dim, self.n_agents)
        return a

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        self.step += 1
        s = self.one_hot2int(s)
        s_ = self.one_hot2int(s_)
        if self.mode == 'q':
            a_ = self._get_action(s_, _max=True)
            value = self.table[s_, a_]
        else:
            self.next_a = self._get_action(s_)
            if self.mode == 'expected_sarsa':
                value = np.mean(self.table[s_, :], axis=-1)
            else:
                value = self.table[s_, self.next_a]
        self.table[s, a] = (1 - self.lr) * self.table[s, a] + self.lr * (
            r + self.gamma * (1 - done) * value)
        if self.step % 1000 == 0:
            plot_heatmap(self.s_dim, self.a_dim, self.table)

    def close(self):
        ioff()

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        pass

    def __getattr__(self, x):
        # print(x)
        return lambda *args, **kwargs: 0
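# A tiny worked example of the tabular update rule in QS.store_data above:
# table[s, a] <- (1 - lr) * table[s, a] + lr * (r + gamma * (1 - done) * value),
# where `value` is max_a' Q(s', a') for Q-learning mode. Numbers are illustrative.
import numpy as np

table = np.zeros((3, 2))           # 3 states, 2 actions
lr, gamma = 0.2, 0.999
s, a, r, s_, done = 0, 1, 1.0, 2, 0
table[s_, :] = [0.5, 0.3]          # pretend the next state already has estimates
value = table[s_].max()            # Q-learning bootstrap value (0.5)
table[s, a] = (1 - lr) * table[s, a] + lr * (r + gamma * (1 - done) * value)
print(round(table[s, a], 4))       # 0.2 * (1 + 0.999 * 0.5) = 0.2999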
class C51(make_off_policy_class(mode='share')):
    '''
    Category 51, https://arxiv.org/abs/1707.06887
    No double, no dueling, no noisy net.
    '''

    def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous,
                 v_min=-10, v_max=10, atoms=51, lr=5.0e-4,
                 eps_init=1, eps_mid=0.2, eps_final=0.01,
                 init2mid_annealing_step=1000, assign_interval=1000,
                 hidden_units=[128, 128], **kwargs):
        assert not is_continuous, 'c51 only support discrete action space'
        super().__init__(s_dim=s_dim, visual_sources=visual_sources,
                         visual_resolution=visual_resolution, a_dim=a_dim,
                         is_continuous=is_continuous, **kwargs)
        self.v_min = v_min
        self.v_max = v_max
        self.atoms = atoms
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = tf.reshape(
            tf.constant([self.v_min + i * self.delta_z for i in range(self.atoms)],
                        dtype=tf.float32),
            [-1, self.atoms])  # [1, N]
        self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net():
            return NetWork(self.feat_dim, self.a_dim, self.atoms, hidden_units)

        self.q_dist_net = _net()
        self.q_target_dist_net = _net()
        self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(dict(model=self.q_dist_net, optimizer=self.optimizer))

    def show_logo(self):
        # The original printed a multi-line ASCII-art banner for the algorithm name;
        # its layout was lost in flattening and cannot be recovered here.
        self.logger.info('C51')

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q = self.get_q(feat)  # [B, A]
        return tf.argmax(q, axis=-1), cell_state  # [B, 1]

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                indexs = tf.reshape(tf.range(batch_size), [-1, 1])  # [B, 1]
                q_dist = self.q_dist_net(feat)  # [B, A, N]
                q_dist = tf.transpose(tf.reduce_sum(tf.transpose(q_dist, [2, 0, 1]) * a, axis=-1), [1, 0])  # [B, N]
                q_eval = tf.reduce_sum(q_dist * self.z, axis=-1)
                target_q_dist = self.q_target_dist_net(feat_)  # [B, A, N]
                target_q = tf.reduce_sum(self.zb * target_q_dist, axis=-1)  # [B, A, N] => [B, A]
                a_ = tf.reshape(tf.cast(tf.argmax(target_q, axis=-1), dtype=tf.int32), [-1, 1])  # [B, 1]
                target_q_dist = tf.gather_nd(target_q_dist, tf.concat([indexs, a_], axis=-1))  # [B, N]
                target = tf.tile(r, tf.constant([1, self.atoms])) \
                    + self.gamma * tf.multiply(self.z,  # [1, N]
                                               (1.0 - tf.tile(done, tf.constant([1, self.atoms]))))  # [B, N], [1, N] * [B, N] = [B, N]
                target = tf.clip_by_value(target, self.v_min, self.v_max)  # [B, N]
                b = (target - self.v_min) / self.delta_z  # [B, N]
                u, l = tf.math.ceil(b), tf.math.floor(b)  # [B, N]
                u_id, l_id = tf.cast(u, tf.int32), tf.cast(l, tf.int32)  # [B, N]
                u_minus_b, b_minus_l = u - b, b - l  # [B, N]
                index_help = tf.tile(indexs, tf.constant([1, self.atoms]))  # [B, N]
                index_help = tf.expand_dims(index_help, -1)  # [B, N, 1]
                u_id = tf.concat([index_help, tf.expand_dims(u_id, -1)], axis=-1)  # [B, N, 2]
                l_id = tf.concat([index_help, tf.expand_dims(l_id, -1)], axis=-1)  # [B, N, 2]
                _cross_entropy = tf.stop_gradient(target_q_dist * u_minus_b) * tf.math.log(tf.gather_nd(q_dist, l_id)) \
                    + tf.stop_gradient(target_q_dist * b_minus_l) * tf.math.log(tf.gather_nd(q_dist, u_id))  # [B, N]
                # tf.debugging.check_numerics(_cross_entropy, '_cross_entropy')
                cross_entropy = -tf.reduce_sum(_cross_entropy, axis=-1)  # [B,]
                # tf.debugging.check_numerics(cross_entropy, 'cross_entropy')
                loss = tf.reduce_mean(cross_entropy * isw) + crsty_loss
                td_error = cross_entropy
            grads = tape.gradient(loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])

    @tf.function(experimental_relax_shapes=True)
    def get_q(self, feat):
        with tf.device(self.device):
            return tf.reduce_sum(self.zb * self.q_dist_net(feat), axis=-1)  # [B, A, N] => [B, A]
class QRDQN(Off_Policy):
    '''
    Quantile Regression DQN
    Distributional Reinforcement Learning with Quantile Regression, https://arxiv.org/abs/1710.10044
    No double, no dueling, no noisy net.
    '''

    def __init__(self, envspec, nums=20, huber_delta=1., lr=5.0e-4,
                 eps_init=1, eps_mid=0.2, eps_final=0.01,
                 init2mid_annealing_step=1000, assign_interval=1000,
                 network_settings=[128, 128], **kwargs):
        assert not envspec.is_continuous, 'qrdqn only support discrete action space'
        assert nums > 0
        super().__init__(envspec=envspec, **kwargs)
        self.nums = nums
        self.huber_delta = huber_delta
        self.quantiles = tf.reshape(
            tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums), dtype=tf.float32),
            [-1, self.nums])  # [1, N]
        self.batch_quantiles = tf.tile(self.quantiles, [self.a_dim, 1])  # [1, N] => [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _create_net(name, representation_net=None):
            return ValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.QRDQN_DISTRIBUTIONAL,
                value_net_kwargs=dict(action_dim=self.a_dim, nums=self.nums,
                                      network_settings=network_settings))

        self.q_dist_net = _create_net('q_dist_net', self._representation_net)
        self._representation_target_net = self._create_representation_net('_representation_target_net')
        self.q_target_dist_net = _create_net('q_target_dist_net', self._representation_target_net)
        update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self._worker_params_dict.update(self.q_dist_net._policy_models)
        self._all_params_dict.update(self.q_dist_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            q_values, cell_state = self.q_dist_net(s, visual_s, cell_state=cell_state)
            q = tf.reduce_sum(self.batch_quantiles * q_values, axis=-1)  # [B, A, N] => [B, A]
        return tf.argmax(q, axis=-1), cell_state  # [B, 1]

    def _target_params_update(self):
        if self.global_step % self.assign_interval == 0:
            update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]]),
                'train_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done']
            })

    @tf.function(experimental_relax_shapes=True)
    def _train(self, memories, isw, cell_state):
        s, visual_s, a, r, s_, visual_s_, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                indexs = tf.reshape(tf.range(batch_size), [-1, 1])  # [B, 1]
                q_dist, _ = self.q_dist_net(s, visual_s, cell_state=cell_state)  # [B, A, N]
                q_dist = tf.transpose(tf.reduce_sum(tf.transpose(q_dist, [2, 0, 1]) * a, axis=-1), [1, 0])  # [B, N]
                target_q_dist, _ = self.q_target_dist_net(s_, visual_s_, cell_state=cell_state)  # [B, A, N]
                target_q = tf.reduce_sum(self.batch_quantiles * target_q_dist, axis=-1)  # [B, A, N] => [B, A]
                a_ = tf.reshape(tf.cast(tf.argmax(target_q, axis=-1), dtype=tf.int32), [-1, 1])  # [B, 1]
                target_q_dist = tf.gather_nd(target_q_dist, tf.concat([indexs, a_], axis=-1))  # [B, N]
                # NOTE: the flattened original multiplied by self.quantiles (the tau midpoints);
                # the QR-DQN Bellman target bootstraps from the target network's quantile values.
                target = tf.tile(r, tf.constant([1, self.nums])) \
                    + self.gamma * tf.multiply(target_q_dist,  # [B, N]
                                               (1.0 - tf.tile(done, tf.constant([1, self.nums]))))  # [B, N]
                q_eval = tf.reduce_sum(q_dist * self.quantiles, axis=-1)  # [B, 1]
                q_target = tf.reduce_sum(target * self.quantiles, axis=-1)  # [B, 1]
                td_error = q_eval - q_target  # [B, 1]
                quantile_error = tf.expand_dims(q_dist, axis=-1) - tf.expand_dims(target, axis=1)  # [B, N, 1] - [B, 1, N] => [B, N, N]
                huber = huber_loss(quantile_error, delta=self.huber_delta)  # [B, N, N]
                huber_abs = tf.abs(self.quantiles - tf.where(quantile_error < 0,
                                                             tf.ones_like(quantile_error),
                                                             tf.zeros_like(quantile_error)))  # [1, N] - [B, N, N] => [B, N, N]
                loss = tf.reduce_mean(huber_abs * huber, axis=-1)  # [B, N, N] => [B, N]
                loss = tf.reduce_sum(loss, axis=-1)  # [B, N] => [B, ]
                loss = tf.reduce_mean(loss * isw)  # [B, ] => 1
            grads = tape.gradient(loss, self.q_dist_net.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.q_dist_net.trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])