def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             lr=5.0e-4,
             alpha=2,
             ployak=0.995,
             hidden_units=[32, 32],
             **kwargs):
    assert not is_continuous, 'sql only supports discrete action spaces'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.alpha = alpha    # temperature of the entropy-regularized (soft) Q objective
    self.ployak = ployak  # Polyak averaging coefficient for target-network soft updates

    def _q_net():
        return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units)

    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    # Start the target network as an exact copy of the online network.
    self.update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
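# A minimal sketch (not part of this class) of how a discrete soft Q-learning
# value target is typically formed with the temperature above, assuming the
# standard V(s) = alpha * logsumexp(Q(s, .) / alpha) soft value. The helper
# name and argument names are illustrative, not this repository's API.
import tensorflow as tf


def soft_value_sketch(q_values, alpha):
    """q_values: [B, A] Q estimates; returns the soft state value, [B, 1]."""
    # alpha * logsumexp(Q / alpha) smoothly approximates max(Q) as alpha -> 0.
    return alpha * tf.reduce_logsumexp(q_values / alpha, axis=-1, keepdims=True)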
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             alpha=0.2,
             beta=0.1,
             ployak=0.995,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_episode=100,
             use_epsilon=False,
             q_lr=5.0e-4,
             alpha_lr=5.0e-4,
             auto_adaption=True,
             hidden_units=[32, 32],
             **kwargs):
    assert not is_continuous, 'maxsqn only supports discrete action spaces'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_episode=init2mid_annealing_episode,
        max_episode=self.max_episode)
    self.use_epsilon = use_epsilon
    self.ployak = ployak  # Polyak averaging coefficient for target-network soft updates
    # log_alpha is either the fixed temperature stored in log-space (so that
    # exponentiating it, as the name implies, recovers alpha) or a trainable
    # variable when the temperature is auto-adapted.
    self.log_alpha = np.log(alpha) if not auto_adaption else tf.Variable(
        initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
    self.auto_adaption = auto_adaption
    # Entropy target as a fraction beta of the maximum entropy log(|A|).
    self.target_entropy = beta * np.log(self.a_dim)

    def _q_net():
        return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units)

    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    self.update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights)
    self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
    self.optimizer_critic, self.optimizer_alpha = map(
        self.init_optimizer, [self.q_lr, self.alpha_lr])
    self.model_recorder(
        dict(critic_net=self.critic_net,
             optimizer_critic=self.optimizer_critic,
             optimizer_alpha=self.optimizer_alpha))
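# A hedged sketch of the SAC-style temperature adaptation implied by
# auto_adaption and optimizer_alpha above: log_alpha is trained so the policy
# entropy tracks target_entropy = beta * log(|A|). The function below and its
# argument names are illustrative assumptions, not this repository's train step.
import tensorflow as tf


def alpha_loss_sketch(log_alpha, entropy, target_entropy):
    """entropy: [B, 1] per-sample policy entropy; returns a scalar loss."""
    # Minimizing this pushes log_alpha up while entropy < target_entropy,
    # strengthening the entropy bonus, and down once entropy overshoots.
    return -tf.reduce_mean(log_alpha * tf.stop_gradient(target_entropy - entropy))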
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             q_lr=5.0e-3,
             intra_option_lr=5.0e-4,
             termination_lr=5.0e-4,
             interest_lr=5.0e-4,
             boltzmann_temperature=1.0,
             options_num=4,
             ent_coff=0.01,
             double_q=False,
             use_baseline=True,
             terminal_mask=True,
             termination_regularizer=0.01,
             assign_interval=1000,
             hidden_units={
                 'q': [32, 32],
                 'intra_option': [32, 32],
                 'termination': [32, 32],
                 'interest': [32, 32]
             },
             **kwargs):
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.assign_interval = assign_interval  # steps between hard target-network syncs
    self.options_num = options_num
    self.termination_regularizer = termination_regularizer
    self.ent_coff = ent_coff  # entropy bonus coefficient for the intra-option policy
    self.use_baseline = use_baseline
    self.terminal_mask = terminal_mask
    self.double_q = double_q
    self.boltzmann_temperature = boltzmann_temperature

    def _q_net():
        return rls.critic_q_all(self.feat_dim, self.options_num, hidden_units['q'])

    # Option-value critic and its target copy, plus the three option heads:
    # intra-option policy, termination probabilities, and interest function
    # (the latter two use sigmoid outputs, one per option).
    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.intra_option_net = rls.oc_intra_option(
        self.feat_dim, self.a_dim, self.options_num, hidden_units['intra_option'])
    self.termination_net = rls.critic_q_all(
        self.feat_dim, self.options_num, hidden_units['termination'], 'sigmoid')
    self.interest_net = rls.critic_q_all(
        self.feat_dim, self.options_num, hidden_units['interest'], 'sigmoid')
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    self.actor_tv = self.intra_option_net.trainable_variables
    if self.is_continuous:
        # State-independent log-std per option for Gaussian actions.
        self.log_std = tf.Variable(
            initial_value=-0.5 * np.ones((self.options_num, self.a_dim), dtype=np.float32),
            trainable=True)  # [P, A]
        self.actor_tv += [self.log_std]
    self.update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
    self.q_lr, self.intra_option_lr, self.termination_lr, self.interest_lr = map(
        self.init_lr, [q_lr, intra_option_lr, termination_lr, interest_lr])
    self.q_optimizer = self.init_optimizer(self.q_lr, clipvalue=5.)
    self.intra_option_optimizer = self.init_optimizer(self.intra_option_lr, clipvalue=5.)
    self.termination_optimizer = self.init_optimizer(self.termination_lr, clipvalue=5.)
    self.interest_optimizer = self.init_optimizer(self.interest_lr, clipvalue=5.)
    self.model_recorder(
        dict(q_net=self.q_net,
             intra_option_net=self.intra_option_net,
             termination_net=self.termination_net,
             interest_net=self.interest_net,
             q_optimizer=self.q_optimizer,
             intra_option_optimizer=self.intra_option_optimizer,
             termination_optimizer=self.termination_optimizer,
             interest_optimizer=self.interest_optimizer))
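# A hedged sketch of how an interest function typically gates option selection
# in interest-option-critic style agents: the interest head rescales a Boltzmann
# policy over option values built with boltzmann_temperature. The helper and its
# argument names are illustrative assumptions, not this repository's sampling code.
import tensorflow as tf


def option_probs_sketch(q_options, interest, temperature):
    """q_options, interest: [B, P]; returns option probabilities, [B, P]."""
    pi_o = tf.nn.softmax(q_options / temperature, axis=-1)  # Boltzmann over option values
    weighted = interest * pi_o                              # gate each option by its interest
    return weighted / tf.reduce_sum(weighted, axis=-1, keepdims=True)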