def __init__(self,
             envspec,
             v_min=-10,
             v_max=10,
             atoms=51,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             network_settings={
                 'share': [128],
                 'v': [128],
                 'adv': [128]
             },
             **kwargs):
    assert not envspec.is_continuous, 'rainbow only supports discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.v_min = v_min
    self.v_max = v_max
    self.atoms = atoms
    self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
    self.z = tf.reshape(
        tf.constant(
            [self.v_min + i * self.delta_z for i in range(self.atoms)],
            dtype=tf.float32),
        [-1, self.atoms])  # [1, N]
    self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.RAINBOW_DUELING,
            value_net_kwargs=dict(action_dim=self.a_dim,
                                  atoms=self.atoms,
                                  network_settings=network_settings))

    self.rainbow_net = _create_net('rainbow_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.rainbow_target_net = _create_net('rainbow_target_net',
                                          self._representation_target_net)
    update_target_net_weights(self.rainbow_target_net.weights,
                              self.rainbow_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)

    self._worker_params_dict.update(self.rainbow_net._policy_models)
    self._all_params_dict.update(self.rainbow_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
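
# --- Illustrative sketch (not part of the original file) ---
# The categorical support built in the constructor above, reproduced with plain
# numpy under the default hyperparameters (v_min=-10, v_max=10, atoms=51) to
# make the atom spacing explicit.
import numpy as np

v_min, v_max, atoms = -10, 10, 51
delta_z = (v_max - v_min) / (atoms - 1)   # 0.4: gap between adjacent atoms
z = v_min + delta_z * np.arange(atoms)    # support [-10.0, -9.6, ..., 9.6, 10.0]
assert len(z) == atoms and z[0] == v_min and np.isclose(z[-1], v_max)
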
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             lr=5.0e-4,
             alpha=2,
             ployak=0.995,
             hidden_units=[32, 32],
             **kwargs):
    assert not is_continuous, 'sql only supports discrete action space'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.alpha = alpha
    self.ployak = ployak

    def _q_net():
        return NetWork(self.feat_dim, self.a_dim, hidden_units)

    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
def __init__(self,
             envspec,
             online_quantiles=8,
             target_quantiles=8,
             select_quantiles=32,
             quantiles_idx=64,
             huber_delta=1.,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             network_settings={
                 'q_net': [128, 64],
                 'quantile': [128, 64],
                 'tile': [64]
             },
             **kwargs):
    assert not envspec.is_continuous, 'iqn only supports discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.pi = tf.constant(np.pi)
    self.online_quantiles = online_quantiles
    self.target_quantiles = target_quantiles
    self.select_quantiles = select_quantiles
    self.quantiles_idx = quantiles_idx
    self.huber_delta = huber_delta
    self.assign_interval = assign_interval
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)

    def _create_net(name, representation_net=None):
        return ValueNetwork(name=name,
                            representation_net=representation_net,
                            value_net_type=OutputNetworkType.IQN_NET,
                            value_net_kwargs=dict(
                                action_dim=self.a_dim,
                                quantiles_idx=self.quantiles_idx,
                                network_settings=network_settings))

    self.q_net = _create_net('q_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.q_target_net = _create_net('q_target_net',
                                    self._representation_target_net)
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)

    self._worker_params_dict.update(self.q_net._policy_models)
    self._all_params_dict.update(self.q_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
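
# --- Illustrative sketch (an assumption, not from this file) ---
# `quantiles_idx` above parameterizes the standard IQN cosine embedding of a
# sampled quantile fraction tau: phi(tau) = [cos(pi * i * tau)] for
# i = 0..quantiles_idx-1, which the quantile sub-network then mixes with the
# state features.
import numpy as np

quantiles_idx = 64
tau = np.random.rand(8, 1)              # 8 sampled quantile fractions in (0, 1)
i = np.arange(quantiles_idx)            # embedding indices 0..quantiles_idx-1
cos_features = np.cos(np.pi * i * tau)  # [8, 64] cosine basis for the quantile net
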
def __init__(self,
             envspec,
             alpha=0.2,
             beta=0.1,
             ployak=0.995,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             use_epsilon=False,
             q_lr=5.0e-4,
             alpha_lr=5.0e-4,
             auto_adaption=True,
             network_settings=[32, 32],
             **kwargs):
    assert not envspec.is_continuous, 'maxsqn only supports discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.use_epsilon = use_epsilon
    self.ployak = ployak
    self.log_alpha = alpha if not auto_adaption else tf.Variable(
        initial_value=0.0,
        name='log_alpha',
        dtype=tf.float32,
        trainable=True)
    self.auto_adaption = auto_adaption
    self.target_entropy = beta * np.log(self.a_dim)

    def _create_net(name, representation_net=None):
        return DoubleValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  network_settings=network_settings))

    self.critic_net = _create_net('critic_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.critic_target_net = _create_net('critic_target_net',
                                         self._representation_target_net)
    update_target_net_weights(self.critic_target_net.weights,
                              self.critic_net.weights)
    self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
    self.optimizer_critic, self.optimizer_alpha = map(
        self.init_optimizer, [self.q_lr, self.alpha_lr])

    self._worker_params_dict.update(self.critic_net._policy_models)
    self._all_params_dict.update(self.critic_net._all_models)
    self._all_params_dict.update(optimizer_critic=self.optimizer_critic,
                                 optimizer_alpha=self.optimizer_alpha)
    self._model_post_process()
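
# --- Illustrative sketch (not part of the original file) ---
# The temperature bookkeeping above in plain numbers: the entropy target is a
# beta-fraction of log(a_dim), the entropy of a uniform policy over a_dim
# actions, and with auto_adaption the effective temperature is exp(log_alpha).
import numpy as np

a_dim, beta = 4, 0.1
target_entropy = beta * np.log(a_dim)  # 0.1 * log(4) ~= 0.1386
log_alpha = 0.0                        # trainable variable, initialized to 0
alpha = np.exp(log_alpha)              # temperature actually used: 1.0 at init
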
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             online_quantiles=8,
             target_quantiles=8,
             select_quantiles=32,
             quantiles_idx=64,
             huber_delta=1.,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             hidden_units={
                 'q_net': [128, 64],
                 'quantile': [128, 64],
                 'tile': [64]
             },
             **kwargs):
    assert not is_continuous, 'iqn only supports discrete action space'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.pi = tf.constant(np.pi)
    self.online_quantiles = online_quantiles
    self.target_quantiles = target_quantiles
    self.select_quantiles = select_quantiles
    self.quantiles_idx = quantiles_idx
    self.huber_delta = huber_delta
    self.assign_interval = assign_interval
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)

    def _net():
        return NetWork(self.feat_dim, self.a_dim, self.quantiles_idx,
                       hidden_units)

    self.q_net = _net()
    self.q_target_net = _net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             alpha=0.2,
             beta=0.1,
             ployak=0.995,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             use_epsilon=False,
             q_lr=5.0e-4,
             alpha_lr=5.0e-4,
             auto_adaption=True,
             hidden_units=[32, 32],
             **kwargs):
    assert not is_continuous, 'maxsqn only supports discrete action space'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.use_epsilon = use_epsilon
    self.ployak = ployak
    self.log_alpha = alpha if not auto_adaption else tf.Variable(
        initial_value=0.0,
        name='log_alpha',
        dtype=tf.float32,
        trainable=True)
    self.auto_adaption = auto_adaption
    self.target_entropy = beta * np.log(self.a_dim)

    def _q_net():
        return Critic(self.feat_dim, self.a_dim, hidden_units)

    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    update_target_net_weights(self.critic_target_net.weights,
                              self.critic_net.weights)
    self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
    self.optimizer_critic, self.optimizer_alpha = map(
        self.init_optimizer, [self.q_lr, self.alpha_lr])
    self.model_recorder(
        dict(critic_net=self.critic_net,
             optimizer_critic=self.optimizer_critic,
             optimizer_alpha=self.optimizer_alpha))
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             v_min=-10,
             v_max=10,
             atoms=51,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             hidden_units={
                 'share': [128],
                 'v': [128],
                 'adv': [128]
             },
             **kwargs):
    assert not is_continuous, 'rainbow only supports discrete action space'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.v_min = v_min
    self.v_max = v_max
    self.atoms = atoms
    self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
    self.z = tf.reshape(
        tf.constant(
            [self.v_min + i * self.delta_z for i in range(self.atoms)],
            dtype=tf.float32),
        [-1, self.atoms])  # [1, N]
    self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _net():
        return NetWork(self.feat_dim, self.a_dim, self.atoms, hidden_units)

    self.rainbow_net = _net()
    self.rainbow_target_net = _net()
    self.critic_tv = self.rainbow_net.trainable_variables + self.other_tv
    update_target_net_weights(self.rainbow_target_net.weights,
                              self.rainbow_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.rainbow_net, optimizer=self.optimizer))
def learn(self, **kwargs):
    self.train_step = kwargs.get('train_step')
    for i in range(self.train_times_per_step):
        if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
            self.intermediate_variable_reset()
            low_data = self.get_transitions(
                self.data_low,
                data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'])
            high_data = self.get_transitions(
                self.data_high,
                data_name_list=['s', 'r', 'a', 'g', 'done', 's_'])

            # Collect the arguments that will be passed to the train functions.
            _low_training_data = self.get_value_from_dict(
                data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'],
                data_dict=low_data)
            _high_training_data = self.get_value_from_dict(
                data_name_list=['s', 'r', 'a', 'g', 'done', 's_'],
                data_dict=high_data)

            summaries = self.train_low(_low_training_data)
            self.summaries.update(summaries)
            update_target_net_weights(
                self.low_actor_target.weights + self.low_critic_target.weights,
                self.low_actor.weights + self.low_critic.weights,
                self.ployak)
            if self.counts % self.sub_goal_steps == 0:
                self.counts = 0
                high_summaries = self.train_high(_high_training_data)
                self.summaries.update(high_summaries)
                update_target_net_weights(
                    self.high_actor_target.weights + self.high_critic_target.weights,
                    self.high_actor.weights + self.high_critic.weights,
                    self.ployak)
            self.counts += 1
            self.summaries.update(
                dict([['LEARNING_RATE/low_actor_lr', self.low_actor_lr(self.train_step)],
                      ['LEARNING_RATE/low_critic_lr', self.low_critic_lr(self.train_step)],
                      ['LEARNING_RATE/high_actor_lr', self.high_actor_lr(self.train_step)],
                      ['LEARNING_RATE/high_critic_lr', self.high_critic_lr(self.train_step)]]))
            self.write_training_summaries(self.global_step, self.summaries)
def __init__(self,
             envspec,
             nums=20,
             huber_delta=1.,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=1000,
             network_settings=[128, 128],
             **kwargs):
    assert not envspec.is_continuous, 'qrdqn only supports discrete action space'
    assert nums > 0, 'nums must be greater than 0'
    super().__init__(envspec=envspec, **kwargs)
    self.nums = nums
    self.huber_delta = huber_delta
    self.quantiles = tf.reshape(
        tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums),
                    dtype=tf.float32),
        [-1, self.nums])  # [1, N]
    self.batch_quantiles = tf.tile(self.quantiles,
                                   [self.a_dim, 1])  # [1, N] => [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.QRDQN_DISTRIBUTIONAL,
            value_net_kwargs=dict(action_dim=self.a_dim,
                                  nums=self.nums,
                                  network_settings=network_settings))

    self.q_dist_net = _create_net('q_dist_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.q_target_dist_net = _create_net('q_target_dist_net',
                                         self._representation_target_net)
    update_target_net_weights(self.q_target_dist_net.weights,
                              self.q_dist_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)

    self._worker_params_dict.update(self.q_dist_net._policy_models)
    self._all_params_dict.update(self.q_dist_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
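
# --- Illustrative sketch (not part of the original file) ---
# The fixed quantile midpoints computed above: tau_hat_i = (2i + 1) / (2N),
# i.e. the centers of N equal-probability bins of the return distribution.
import numpy as np

nums = 4
quantiles = (2 * np.arange(nums) + 1) / (2.0 * nums)  # [0.125, 0.375, 0.625, 0.875]
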
def __init__(self,
             s_dim: Union[List[int], np.ndarray],
             a_dim: Union[List[int], np.ndarray],
             is_continuous: Union[List[bool], np.ndarray],
             ployak: float = 0.995,
             actor_lr: float = 5.0e-4,
             critic_lr: float = 1.0e-3,
             hidden_units: Dict = {
                 'actor': [32, 32],
                 'q': [32, 32]
             },
             **kwargs):
    '''
    TODO: Annotation
    '''
    assert all(is_continuous), 'maddpg only supports continuous action space'
    super().__init__(s_dim=s_dim,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.ployak = ployak
    # self.action_noises = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
    self.action_noises = {
        i: OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim[i]),
                                        sigma=0.2 * np.ones(self.a_dim[i]))
        for i in range(self.agent_sep_ctls)
    }

    def _actor_net(i):
        return ActorCts(self.s_dim[i], self.a_dim[i], hidden_units['actor'])

    self.actor_nets = {i: _actor_net(i) for i in range(self.agent_sep_ctls)}
    self.actor_target_nets = {i: _actor_net(i) for i in range(self.agent_sep_ctls)}

    def _q_net():
        return Critic(self.total_s_dim, self.total_a_dim, hidden_units['q'])

    self.q_nets = {i: _q_net() for i in range(self.agent_sep_ctls)}
    self.q_target_nets = {i: _q_net() for i in range(self.agent_sep_ctls)}

    for i in range(self.agent_sep_ctls):
        update_target_net_weights(
            self.actor_target_nets[i].weights + self.q_target_nets[i].weights,
            self.actor_nets[i].weights + self.q_nets[i].weights)

    self.actor_lrs = {i: self.init_lr(actor_lr) for i in range(self.agent_sep_ctls)}
    self.critic_lrs = {i: self.init_lr(critic_lr) for i in range(self.agent_sep_ctls)}
    self.optimizer_actors = {i: self.init_optimizer(self.actor_lrs[i]) for i in range(self.agent_sep_ctls)}
    self.optimizer_critics = {i: self.init_optimizer(self.critic_lrs[i]) for i in range(self.agent_sep_ctls)}

    models_and_optimizers = {}
    models_and_optimizers.update({f'actor-{i}': self.actor_nets[i] for i in range(self.agent_sep_ctls)})
    models_and_optimizers.update({f'critic-{i}': self.q_nets[i] for i in range(self.agent_sep_ctls)})
    models_and_optimizers.update({f'optimizer_actor-{i}': self.optimizer_actors[i] for i in range(self.agent_sep_ctls)})
    models_and_optimizers.update({f'optimizer_critic-{i}': self.optimizer_critics[i] for i in range(self.agent_sep_ctls)})
    self.model_recorder(models_and_optimizers)
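
# --- Illustrative sketch (an assumption about OrnsteinUhlenbeckActionNoise) ---
# A minimal OU process for temporally correlated exploration noise:
# x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1).
# The theta and dt defaults here are common choices, not values read from this file.
import numpy as np

class OUNoiseSketch:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.zeros_like(mu)

    def __call__(self):
        # Mean-reverting step plus a scaled Gaussian increment.
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

noise = OUNoiseSketch(mu=np.zeros(2), sigma=0.2 * np.ones(2))
sample = noise()  # one correlated noise sample per action dimension
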
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             nums=20,
             huber_delta=1.,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=1000,
             hidden_units=[128, 128],
             **kwargs):
    assert not is_continuous, 'qrdqn only supports discrete action space'
    assert nums > 0, 'nums must be greater than 0'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.nums = nums
    self.huber_delta = huber_delta
    self.quantiles = tf.reshape(
        tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums),
                    dtype=tf.float32),
        [-1, self.nums])  # [1, N]
    self.batch_quantiles = tf.tile(self.quantiles,
                                   [self.a_dim, 1])  # [1, N] => [A, N]
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _net():
        return NetWork(self.feat_dim, self.a_dim, self.nums, hidden_units)

    self.q_dist_net = _net()
    self.q_target_dist_net = _net()
    self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_dist_net.weights,
                              self.q_dist_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_dist_net, optimizer=self.optimizer))
def __init__(self,
             envspec,
             target_k: int = 4,
             lr: float = 5.0e-4,
             eps_init: float = 1,
             eps_mid: float = 0.2,
             eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000,
             assign_interval: int = 1000,
             network_settings: List[int] = [32, 32],
             **kwargs):
    assert not envspec.is_continuous, 'dqn only supports discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval
    self.target_k = target_k
    assert self.target_k > 0, 'target_k must be greater than 0'
    self.target_nets = []
    self.current_target_idx = 0

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  network_settings=network_settings))

    self.q_net = _create_net('dqn_q_net', self._representation_net)
    for i in range(self.target_k):
        target_q_net = _create_net(
            'dqn_q_target_net' + str(i),
            self._create_representation_net('_representation_target_net' + str(i)))
        update_target_net_weights(target_q_net.weights, self.q_net.weights)
        self.target_nets.append(target_q_net)

    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)

    self._worker_params_dict.update(self.q_net._policy_models)
    self._all_params_dict.update(self.q_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def __init__(self,
             envspec,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=1000,
             head_num=4,
             network_settings=[32, 32],
             **kwargs):
    assert not envspec.is_continuous, 'Bootstrapped DQN only supports discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval
    self.head_num = head_num
    self._probs = [1. / head_num for _ in range(head_num)]
    self.now_head = 0

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_BOOTSTRAP,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  head_num=self.head_num,
                                  network_settings=network_settings))

    self.q_net = _create_net('q_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.q_target_net = _create_net('q_target_net',
                                    self._representation_target_net)
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)

    self._worker_params_dict.update(self.q_net._policy_models)
    self._all_params_dict.update(self.q_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
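
# --- Illustrative sketch (an assumption, not from this file) ---
# `_probs` and `now_head` above suggest the usual Bootstrapped DQN scheme:
# one of `head_num` Q-heads is sampled (here uniformly) and followed for a
# whole episode, which is what drives deep exploration.
import numpy as np

head_num = 4
probs = [1. / head_num] * head_num
now_head = np.random.choice(head_num, p=probs)  # head to act with this episode
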
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             hidden_units={
                 'share': [128],
                 'v': [128],
                 'adv': [128]
             },
             **kwargs):
    assert not is_continuous, 'dueling double dqn only supports discrete action space'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _net():
        return NetWork(self.feat_dim, self.a_dim, hidden_units)

    self.dueling_net = _net()
    self.dueling_target_net = _net()
    self.critic_tv = self.dueling_net.trainable_variables + self.other_tv
    update_target_net_weights(self.dueling_target_net.weights,
                              self.dueling_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.dueling_net, optimizer=self.optimizer))
def __init__(self,
             envspec,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             network_settings={
                 'share': [128],
                 'v': [128],
                 'adv': [128]
             },
             **kwargs):
    assert not envspec.is_continuous, 'dueling double dqn only supports discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _create_net(name, representation_net):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_DUELING,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  network_settings=network_settings))

    self.dueling_net = _create_net('dueling_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.dueling_target_net = _create_net('dueling_target_net',
                                          self._representation_target_net)
    update_target_net_weights(self.dueling_target_net.weights,
                              self.dueling_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)

    self._worker_params_dict.update(self.dueling_net._policy_models)
    self._all_params_dict.update(self.dueling_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def learn(self, **kwargs):
    self.train_step = kwargs.get('train_step')

    def _train(memories, isw, crsty_loss, cell_state):
        td_error, summaries = self.train(memories, isw, crsty_loss, cell_state)
        if self.annealing and not self.auto_adaption:
            self.log_alpha.assign(
                tf.math.log(
                    tf.cast(self.alpha_annealing(self.global_step.numpy()),
                            tf.float32)))
        return td_error, summaries

    def _pre_process(data):
        # NHWC => NCHW so that the crops operate on the spatial dimensions,
        # then back to NHWC for the network input.
        data['visual_s'] = np.transpose(data['visual_s'][:, 0].numpy(),
                                        (0, 3, 1, 2))
        data['visual_s_'] = np.transpose(data['visual_s_'][:, 0].numpy(),
                                         (0, 3, 1, 2))
        data['pos'] = self.data_convert(
            np.transpose(random_crop(data['visual_s'], self.img_size),
                         (0, 2, 3, 1)))
        data['visual_s'] = self.data_convert(
            np.transpose(random_crop(data['visual_s'], self.img_size),
                         (0, 2, 3, 1)))
        data['visual_s_'] = self.data_convert(
            np.transpose(random_crop(data['visual_s_'], self.img_size),
                         (0, 2, 3, 1)))
        return (data, )

    for i in range(self.train_times_per_step):
        self._learn(
            function_dict={
                'train_function': _train,
                'update_function': lambda: update_target_net_weights(
                    self.critic_target_net.weights + self.encoder_target.trainable_variables,
                    self.critic_net.weights + self.encoder.trainable_variables,
                    self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)],
                    ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)]
                ]),
                'train_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'pos'],
                'pre_process_function': _pre_process
            })
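
# --- Illustrative sketch (an assumption about the random_crop helper) ---
# A minimal batched random crop over NCHW images, matching how `_pre_process`
# above calls it: one output size for every image, independent random offsets.
import numpy as np

def random_crop_sketch(imgs, output_size):
    n, c, h, w = imgs.shape
    tops = np.random.randint(0, h - output_size + 1, n)
    lefts = np.random.randint(0, w - output_size + 1, n)
    return np.stack([img[:, t:t + output_size, l:l + output_size]
                     for img, t, l in zip(imgs, tops, lefts)])
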
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=1000,
             head_num=4,
             hidden_units=[32, 32],
             **kwargs):
    assert not is_continuous, 'Bootstrapped DQN only supports discrete action space'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval
    self.head_num = head_num
    self._probs = [1. / head_num for _ in range(head_num)]
    self.now_head = 0

    def _q_net():
        return NetWork(self.feat_dim, self.a_dim, self.head_num, hidden_units)

    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
def learn(self, **kwargs) -> NoReturn:
    '''
    TODO: Annotation
    '''
    self.train_step = kwargs.get('train_step')
    for i in range(self.train_times_per_step):
        if self.data.is_lg_batch_size:
            self.intermediate_variable_reset()
            batch_data = self.data.sample()
            done = batch_data[-1]
            s, visual_a, a, r, s_, visual_s_ = [
                batch_data[i:i + self.agent_sep_ctls]
                for i in range(0, len(batch_data) - 1, self.agent_sep_ctls)
            ]
            target_a = [
                self._get_actions(i, s_[i], evaluation=True, use_target=True)
                for i in range(self.agent_sep_ctls)
            ]
            s_all = np.hstack(s)
            a_all = np.hstack(a)
            s_next_all = np.hstack(s_)
            target_a_all = np.hstack(target_a)
            for i in range(self.agent_sep_ctls):
                summary = {}
                if i == 0:
                    al = np.full(fill_value=[], shape=(done.shape[0], 0), dtype=np.float32)
                    ar = np.hstack(a[i + 1:])
                elif i == self.agent_sep_ctls - 1:
                    al = np.hstack(a[:i])
                    ar = np.full(fill_value=[], shape=(done.shape[0], 0), dtype=np.float32)
                else:
                    al = np.hstack(a[:i])
                    ar = np.hstack(a[i + 1:])
                # actor: al, ar, s(all), s
                # critic: r, done, s_(all), target_a(all), s(all), a(all)
                summary.update(self._train(i, s_all, a_all, s_next_all,
                                           target_a_all, r[i], done, s[i], al, ar))
                summary.update({
                    'LEARNING_RATE/actor_lr': self.actor_lrs[i](self.train_step),
                    'LEARNING_RATE/critic_lr': self.critic_lrs[i](self.train_step)
                })
                self.write_training_summaries(self.global_step, summary, self.writers[i])
            self.global_step.assign_add(1)
            for i in range(self.agent_sep_ctls):
                update_target_net_weights(
                    self.actor_target_nets[i].weights + self.q_target_nets[i].weights,
                    self.actor_nets[i].weights + self.q_nets[i].weights,
                    self.ployak)
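
# --- Illustrative sketch (not part of the original file) ---
# The al/ar split in the loop above: for agent i, `al` stacks the actions of
# the agents before it and `ar` the actions after it, so that
# hstack([al, a[i], ar]) reassembles the joint action a_all. The edge cases
# i == 0 and i == last use the empty (batch, 0) arrays built via np.full.
import numpy as np

a = [np.full((5, 2), k, dtype=np.float32) for k in range(3)]  # 3 agents, batch 5
i = 1
al, ar = np.hstack(a[:i]), np.hstack(a[i + 1:])  # shapes (5, 2) and (5, 2)
assert np.array_equal(np.hstack([al, a[i], ar]), np.hstack(a))
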
def __init__(self,
             s_dim: Union[int, np.ndarray],
             visual_sources: Union[int, np.ndarray],
             visual_resolution: Union[List, np.ndarray],
             a_dim: Union[int, np.ndarray],
             is_continuous: Union[bool, np.ndarray],
             lr: float = 5.0e-4,
             eps_init: float = 1,
             eps_mid: float = 0.2,
             eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000,
             assign_interval: int = 1000,
             hidden_units: List[int] = [32, 32],
             **kwargs):
    assert not is_continuous, 'dqn only supports discrete action space'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self.max_train_step)
    self.assign_interval = assign_interval

    def _q_net():
        return NetWork(self.feat_dim, self.a_dim, hidden_units)

    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
def learn(self, **kwargs):
    self.train_step = kwargs.get('train_step')
    for i in range(self.train_times_per_step):
        if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
            self.intermediate_variable_reset()
            low_data = self.get_transitions(self.data_low)
            high_data = self.get_transitions(self.data_high)

            summaries = self.train_low(low_data)
            self.summaries.update(summaries)
            update_target_net_weights(self.low_ac_target_net.weights,
                                      self.low_ac_net.weights,
                                      self.ployak)
            if self.counts % self.sub_goal_steps == 0:
                self.counts = 0
                high_summaries = self.train_high(high_data)
                self.summaries.update(high_summaries)
                update_target_net_weights(self.high_ac_target_net.weights,
                                          self.high_ac_net.weights,
                                          self.ployak)
            self.counts += 1
            self.summaries.update(
                dict([['LEARNING_RATE/low_actor_lr', self.low_actor_lr(self.train_step)],
                      ['LEARNING_RATE/low_critic_lr', self.low_critic_lr(self.train_step)],
                      ['LEARNING_RATE/high_actor_lr', self.high_actor_lr(self.train_step)],
                      ['LEARNING_RATE/high_critic_lr', self.high_critic_lr(self.train_step)]]))
            self.write_training_summaries(self.global_step, self.summaries)
def __init__(self,
             envspec,
             lr=5.0e-4,
             alpha=2,
             ployak=0.995,
             network_settings=[32, 32],
             **kwargs):
    assert not envspec.is_continuous, 'sql only supports discrete action space'
    super().__init__(envspec=envspec, **kwargs)
    self.alpha = alpha
    self.ployak = ployak

    def _create_net(name, representation_net=None):
        return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
            value_net_kwargs=dict(output_shape=self.a_dim,
                                  network_settings=network_settings))

    self.q_net = _create_net('q_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.q_target_net = _create_net('q_target_net',
                                    self._representation_target_net)
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    update_target_net_weights(self.q_target_net.weights, self.q_net.weights)

    self._worker_params_dict.update(self.q_net._policy_models)
    self._all_params_dict.update(self.q_net._all_models)
    self._all_params_dict.update(optimizer=self.optimizer)
    self._model_post_process()
def learn(self, **kwargs):
    self.train_step = kwargs.get('train_step')
    for i in range(self.train_times_per_step):
        self._learn(
            function_dict={
                'train_function': self.train,
                'update_function': lambda: update_target_net_weights(
                    self.q_target_net.weights, self.q_net.weights, self.ployak),
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })
def learn(self, **kwargs):
    self.train_step = kwargs.get('train_step')
    for i in range(self.train_times_per_step):
        self._learn(
            function_dict={
                'train_function': self.train,
                'update_function': lambda: update_target_net_weights(
                    self.actor_target_net.weights + self.reward_critic_target_net.weights + self.cost_critic_target_net.weights,
                    self.actor_net.weights + self.reward_critic_net.weights + self.cost_critic_net.weights,
                    self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/reward_critic_lr', self.reward_critic_lr(self.train_step)],
                    ['LEARNING_RATE/cost_critic_lr', self.cost_critic_lr(self.train_step)]
                ]),
                'sample_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'cost'],
                'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'cost'],
            })
def learn(self, **kwargs):
    self.train_step = kwargs.get('train_step')

    def _train(memories, isw, crsty_loss, cell_state):
        if self.is_continuous or self.use_gumbel:
            td_error, summaries = self.train_persistent(
                memories, isw, crsty_loss, cell_state)
        else:
            td_error, summaries = self.train_discrete(
                memories, isw, crsty_loss, cell_state)
        if self.annealing and not self.auto_adaption:
            self.log_alpha.assign(
                tf.math.log(
                    tf.cast(self.alpha_annealing(self.global_step.numpy()),
                            tf.float32)))
        return td_error, summaries

    for i in range(self.train_times_per_step):
        self._learn(
            function_dict={
                'train_function': _train,
                'update_function': lambda: update_target_net_weights(
                    self.critic_target_net.weights, self.critic_net.weights,
                    self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)],
                    ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)]
                ])
            })
def __init__(self,
             envspec,
             ployak=0.995,
             delay_num=2,
             gaussian_noise_sigma=0.2,
             gaussian_noise_bound=0.2,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             discrete_tau=1.0,
             network_settings={
                 'actor_continuous': [32, 32],
                 'actor_discrete': [32, 32],
                 'q': [32, 32]
             },
             **kwargs):
    super().__init__(envspec=envspec, **kwargs)
    self.ployak = ployak
    self.delay_num = delay_num
    self.discrete_tau = discrete_tau
    self.gaussian_noise_sigma = gaussian_noise_sigma
    self.gaussian_noise_bound = gaussian_noise_bound

    if self.is_continuous:
        def _create_net(name, representation_net=None):
            return ADoubleCNetwork(
                name=name,
                representation_net=representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))

        self.noised_action = self.target_noised_action = ClippedNormalNoisedAction(
            sigma=self.gaussian_noise_sigma,
            noise_bound=self.gaussian_noise_bound)
    else:
        def _create_net(name, representation_net=None):
            return ADoubleCNetwork(
                name=name,
                representation_net=representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))

        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

    self.ac_net = _create_net('ac_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.ac_target_net = _create_net('ac_target_net',
                                     self._representation_target_net)
    update_target_net_weights(self.ac_target_net.weights, self.ac_net.weights)
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr])

    self._worker_params_dict.update(self.ac_net._policy_models)
    self._all_params_dict.update(self.ac_net._all_models)
    self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                 optimizer_critic=self.optimizer_critic)
    self._model_post_process()
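
# --- Illustrative sketch (an assumption about ClippedNormalNoisedAction) ---
# TD3-style target policy smoothing: Gaussian noise with std `sigma`, clipped
# to +/- `noise_bound`, added to the (assumed [-1, 1]-normalized) action.
import numpy as np

def clipped_normal_noised_action(action, sigma=0.2, noise_bound=0.2):
    noise = np.clip(sigma * np.random.randn(*action.shape),
                    -noise_bound, noise_bound)
    return np.clip(action + noise, -1.0, 1.0)
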
def _target_params_update(self):
    update_target_net_weights(self.ac_target_net.weights,
                              self.ac_net.weights,
                              self.ployak)
def _update():
    if self.global_step % self.assign_interval == 0:
        update_target_net_weights(self.q_target_dist_net.weights,
                                  self.q_dist_net.weights)
def _target_params_update(self):
    if self.global_step % self.assign_interval == 0:
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)
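
# --- Illustrative sketch (an assumption inferred from the call sites) ---
# `update_target_net_weights(tge, src)` appears to hard-copy weights, while
# `update_target_net_weights(tge, src, ployak)` appears to polyak-average:
# target <- ployak * target + (1 - ployak) * source. `tge`/`src` are assumed
# to be matching flat lists of tf.Variable.
def update_target_net_weights_sketch(tge, src, ployak=None):
    for t, s in zip(tge, src):
        t.assign(s if ployak is None else ployak * t + (1 - ployak) * s)
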
def _target_params_update(self):
    if self.global_step % self.assign_interval == 0:
        update_target_net_weights(
            self.target_nets[self.current_target_idx].weights,
            self.q_net.weights)
        self.current_target_idx = (self.current_target_idx + 1) % self.target_k
def __init__(self,
             envspec,
             ployak=0.995,
             actor_lr=5.0e-4,
             reward_critic_lr=1.0e-3,
             cost_critic_lr=1.0e-3,
             lambda_lr=5.0e-4,
             discrete_tau=1.0,
             cost_constraint=1.0,
             network_settings={
                 'actor_continuous': [32, 32],
                 'actor_discrete': [32, 32],
                 'reward': [32, 32],
                 'cost': [32, 32]
             },
             **kwargs):
    super().__init__(envspec=envspec, **kwargs)
    self.ployak = ployak
    self.discrete_tau = discrete_tau
    self._lambda = tf.Variable(0.0, dtype=tf.float32)
    self.cost_constraint = cost_constraint  # long-term cost <= d

    if self.is_continuous:
        # NOTE: value_net is the reward net; value_net2 is the cost net.
        def _create_net(name, representation_net=None):
            return ACCNetwork(
                name=name,
                representation_net=representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(
                    action_dim=self.a_dim,
                    network_settings=network_settings['reward']),
                value_net2_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net2_kwargs=dict(
                    action_dim=self.a_dim,
                    network_settings=network_settings['cost']))
    else:
        def _create_net(name, representation_net=None):
            return ACCNetwork(
                name=name,
                representation_net=representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(
                    action_dim=self.a_dim,
                    network_settings=network_settings['reward']),
                value_net2_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net2_kwargs=dict(
                    action_dim=self.a_dim,
                    network_settings=network_settings['cost']))

    self.ac_net = _create_net('ac_net', self._representation_net)
    self._representation_target_net = self._create_representation_net(
        '_representation_target_net')
    self.ac_target_net = _create_net('ac_target_net',
                                     self._representation_target_net)
    if self.is_continuous:
        # self.action_noise = NormalActionNoise(sigma=0.2)
        self.action_noise = OrnsteinUhlenbeckActionNoise(sigma=0.2)
    else:
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    update_target_net_weights(self.ac_target_net.weights, self.ac_net.weights)
    self.lambda_lr = lambda_lr
    self.actor_lr, self.reward_critic_lr, self.cost_critic_lr = map(
        self.init_lr, [actor_lr, reward_critic_lr, cost_critic_lr])
    self.optimizer_actor, self.optimizer_reward_critic, self.optimizer_cost_critic = map(
        self.init_optimizer,
        [self.actor_lr, self.reward_critic_lr, self.cost_critic_lr])

    self._worker_params_dict.update(self.ac_net._policy_models)
    self._all_params_dict.update(self.ac_net._all_models)
    self._all_params_dict.update(
        optimizer_actor=self.optimizer_actor,
        optimizer_reward_critic=self.optimizer_reward_critic,
        optimizer_cost_critic=self.optimizer_cost_critic)
    self._model_post_process()