def __init__(
    self,
    scope,
    ob_space,
    ac_space,
    policy_size="normal",
    extrahid=True,
    hidsize=128,
    memsize=128,
    rec_gate_init=0.0,
    update_ob_stats_independently_per_gpu=True,
    proportion_of_exp_used_for_predictor_update=1.0,
    dynamics_bonus=False,
    meta_rl=False,
):
    """Build the TF1 policy/value graph plus an intrinsic-reward head.

    Args:
        scope: variable-scope name shared by the optimization and rollout
            sub-graphs (the rollout graph reuses the optimization variables).
        ob_space: observation space; only ``shape`` and its first two dims
            are used here.  # NOTE(review): assumes (H, W, C) images — confirm.
        ac_space: action space, consumed via ``self.pdtype``.
        policy_size: 'small' | 'normal' | 'large'; selects the width
            multiplier applied to hidsize/memsize/convfeat below.
        extrahid: forwarded to apply_policy.
        hidsize: base hidden size (scaled by the enlargement factor).
        memsize: base memory/state size (scaled by the enlargement factor).
        rec_gate_init: accepted for signature compatibility; not used in
            this variant (no recurrent-gate argument is forwarded).
        update_ob_stats_independently_per_gpu: when True, the observation
            RunningMeanStd does NOT synchronize over MPI.
        proportion_of_exp_used_for_predictor_update: fraction of experience
            used to train the predictor; stored for later use.
        dynamics_bonus: if True, use the dynamics-prediction intrinsic
            reward instead of the self-prediction (RND-style) one.
        meta_rl: forwarded to StochasticPolicy.__init__.
    """
    StochasticPolicy.__init__(self, scope, ob_space, ac_space, meta_rl=meta_rl)
    self.proportion_of_exp_used_for_predictor_update = (
        proportion_of_exp_used_for_predictor_update)
    enlargement = {"small": 1, "normal": 2, "large": 4}[policy_size]
    rep_size = 512  # size of the predictor's representation/embedding
    # Placeholders for per-pixel observation mean/std used for normalization.
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    # Scale widths by the policy-size factor.
    memsize *= enlargement
    hidsize *= enlargement
    convfeat = 16 * enlargement
    # Running observation statistics (optionally MPI-synchronized).
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu,
    )
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name="state")
    pdparamsize = self.pdtype.param_shape()[0]
    self.memsize = memsize
    # Inputs to policy and value function will have different shapes depending on whether it is rollout
    # or optimization time, so we treat separately.
    (
        self.pdparam_opt,
        self.vpred_int_opt,
        self.vpred_ext_opt,
        self.snext_opt,
    ) = self.apply_policy(
        self.ph_ob['obs'][:, :-1],  # optimization graph drops the final step
        reuse=False,
        scope=scope,
        hidsize=hidsize,
        memsize=memsize,
        extrahid=extrahid,
        sy_nenvs=self.sy_nenvs,
        sy_nsteps=self.sy_nsteps - 1,
        pdparamsize=pdparamsize,
        additional_inputs=self.ph_ob,
    )
    # Rollout-time graph: full sequence, variables shared via reuse=True.
    (
        self.pdparam_rollout,
        self.vpred_int_rollout,
        self.vpred_ext_rollout,
        self.snext_rollout,
    ) = self.apply_policy(
        self.ph_ob['obs'],
        reuse=True,
        scope=scope,
        hidsize=hidsize,
        memsize=memsize,
        extrahid=extrahid,
        sy_nenvs=self.sy_nenvs,
        sy_nsteps=self.sy_nsteps,
        pdparamsize=pdparamsize,
        additional_inputs=self.ph_ob,
    )
    # Intrinsic-reward head: dynamics prediction or self-prediction (RND).
    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    else:
        self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    # Action distribution used for sampling during rollouts.
    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    # Matching distribution over the optimization graph's parameters.
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate
def __init__(
    self,
    scope,
    ob_space,
    ac_space,
    policy_size='normal',
    maxpool=False,
    extrahid=True,
    hidsize=128,
    memsize=128,
    rec_gate_init=0.0,
    update_ob_stats_independently_per_gpu=True,
    proportion_of_exp_used_for_predictor_update=1.,
    dynamics_bonus=False,
):
    """Build the recurrent TF1 policy/value graph, an intrinsic-reward head,
    and a step-prediction head.

    Args:
        scope: variable-scope name; the rollout graph reuses the
            optimization graph's variables under this scope.
        ob_space / ac_space: gym-style observation and action spaces.
        policy_size: 'small' | 'normal' | 'large' width multiplier.
        maxpool: accepted for signature compatibility; not used here.
        extrahid: forwarded to apply_policy.
        hidsize / memsize: base hidden / recurrent-state sizes, scaled by
            the enlargement factor below.
        rec_gate_init: initial value for the recurrent gate bias, forwarded
            to apply_policy.
        update_ob_stats_independently_per_gpu: when True, RunningMeanStd
            does NOT synchronize over MPI.
        proportion_of_exp_used_for_predictor_update: fraction of experience
            used to train the predictor; stored for later use.
        dynamics_bonus: if True, use the dynamics-prediction intrinsic
            reward instead of the self-prediction one.
    """
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
    rep_size = 512  # size of the predictor's representation/embedding
    # Placeholders for per-pixel observation mean/std used for normalization.
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    memsize *= enlargement  # 256 with the default 'normal' size
    hidsize *= enlargement  # 256 with the default 'normal' size
    convfeat = 16 * enlargement
    # Running observation statistics (optionally MPI-synchronized).
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')
    pdparamsize = self.pdtype.param_shape()[0]
    self.memsize = memsize
    # Optimization-time graph: drops the final step; creates the variables.
    self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
        self.apply_policy(self.ph_ob[None][:, :-1],
                          ph_new=self.ph_new,
                          ph_istate=ph_istate,
                          reuse=False,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps - 1,
                          pdparamsize=pdparamsize,
                          rec_gate_init=rec_gate_init
                          )
    # Rollout-time graph: full sequence, variables shared via reuse=True.
    self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
        self.apply_policy(self.ph_ob[None],
                          ph_new=self.ph_new,
                          ph_istate=ph_istate,
                          reuse=True,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps,
                          pdparamsize=pdparamsize,
                          rec_gate_init=rec_gate_init
                          )
    # Intrinsic-reward head: dynamics prediction or self-prediction (RND).
    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    else:
        self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    # Additional step-prediction head (specific to this variant).
    self.step_prediction(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    # Action distribution used for sampling during rollouts.
    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate
def __init__(self, scope, ob_space, ac_space,
             policy_size='normal', maxpool=False, extrahid=True, hidsize=128,
             memsize=128, rec_gate_init=0.0,
             update_ob_stats_independently_per_gpu=True,
             proportion_of_exp_used_for_predictor_update=1.,
             dynamics_bonus=False,
             num_agents=1, rnd_type='rnd', div_type='oracle',
             indep_rnd=False, indep_policy=False, sd_type='oracle',
             rnd_mask_prob=1.):
    """Build the multi-agent TF1 policy/value graph with per-agent RND heads
    and an optional reward discriminator for diversity.

    Fix over the original: ``self.num_agents`` was assigned twice; the
    redundant second assignment has been removed (no behavior change).

    Args:
        scope: variable-scope name; rollout graph reuses optimization vars.
        ob_space / ac_space: gym-style observation and action spaces.
        policy_size: 'small' | 'normal' | 'large' width multiplier.
        maxpool: accepted for signature compatibility; not used here.
        extrahid, hidsize, memsize, rec_gate_init: forwarded to the policy
            builders (hidsize/memsize scaled by the enlargement factor).
        update_ob_stats_independently_per_gpu: when True, RunningMeanStd
            does NOT synchronize over MPI.
        proportion_of_exp_used_for_predictor_update: stored for later use.
        dynamics_bonus: if True, use dynamics prediction instead of the
            multi-head self-prediction intrinsic reward.
        num_agents: number of policy/RND heads; <= 0 selects the
            single-head builders.
        rnd_type, div_type, sd_type, rnd_mask_prob: configuration flags;
            only ``div_type`` is consumed in this constructor (the others
            are presumably read elsewhere — verify against subclass usage).
        indep_rnd / indep_policy: stored as attributes.
    """
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
    rep_size = 512  # size of the RND representation/embedding
    # Per-(env, step) masks and labels fed in during training.
    self.rnd_mask = tf.placeholder(dtype=tf.float32, shape=(None, None, num_agents), name="rnd_mask")
    self.new_rnd_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="new_rnd_mask")
    self.div_train_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="div_train_mask")
    self.sample_agent_prob = tf.placeholder(dtype=tf.float32, shape=(None, None,), name="sample_agent_prob")
    self.stage_label = tf.placeholder(dtype=tf.int32, shape=(None, None), name="stage_label")
    # Global observation-normalization statistics.
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    self.ph_count = tf.placeholder(dtype=tf.float32, shape=(), name="obcount")
    # Per-(env, step) normalization statistics (one map per sample).
    self.sep_ph_mean = tf.placeholder(dtype=tf.float32, shape=(None, None,) + ob_space.shape[:2] + (1,), name="sep_obmean")
    self.sep_ph_std = tf.placeholder(dtype=tf.float32, shape=(None, None,) + ob_space.shape[:2] + (1,), name="sep_obstd")
    self.sep_ph_count = tf.placeholder(dtype=tf.float32, shape=(), name="sep_obcount")
    self.game_score = tf.placeholder(dtype=tf.float32, shape=(None, None), name="game_score")
    # Observation at the last rewarding step, full observation shape.
    self.last_rew_ob = tf.placeholder(dtype=ob_space.dtype, shape=(None, None) + tuple(ob_space.shape), name="last_rew_ob")
    # Normalization statistics for the diversity head.
    self.div_ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="div_obmean")
    self.div_ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="div_obstd")
    self.idle_agent_label = tf.placeholder(dtype=tf.int32, shape=(None, None,), name="idle_agent_label")
    self.rew_agent_label = tf.placeholder(dtype=tf.int32, shape=(None, None,), name="rew_agent_label")
    #self.var_ph_mean = tf.get_variable("var_ph_mean", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
    #self.var_ph_std = tf.get_variable("var_ph_std", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
    #self.var_ph_count = tf.get_variable("var_ph_count", (), initializer=tf.constant_initializer(0.0))
    # Normalization statistics for the sd ("self-discriminator"?) head.
    # NOTE(review): sd semantics not visible here — confirm against users.
    self.sd_ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="sd_obmean")
    self.sd_ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="sd_obstd")
    memsize *= enlargement
    hidsize *= enlargement
    convfeat = 16 * enlargement
    # One RunningMeanStd per agent, plus a shared one and a diversity one.
    self.ob_rms_list = [RunningMeanStd(shape=list(ob_space.shape[:2]) + [1],
                                       use_mpi=not update_ob_stats_independently_per_gpu)
                        for _ in range(num_agents)]
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)
    self.diversity_ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')
    pdparamsize = self.pdtype.param_shape()[0]
    self.memsize = memsize
    self.num_agents = num_agents  # fix: was redundantly assigned twice
    self.indep_rnd = indep_rnd
    self.indep_policy = indep_policy
    # NOTE(review): with the default num_agents=1 the single-head branch is
    # unreachable; confirm num_agents <= 0 is a supported configuration.
    if num_agents <= 0:
        # Single-head policy: optimization graph (creates variables) ...
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        # ... and rollout graph (shares them via reuse=True).
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
    else:
        # Multi-head policy: one head per agent; same opt/rollout split.
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_multi_head_policy(self.ph_ob[None][:, :-1],
                                         ph_new=self.ph_new,
                                         ph_istate=ph_istate,
                                         reuse=False,
                                         scope=scope,
                                         hidsize=hidsize,
                                         memsize=memsize,
                                         extrahid=extrahid,
                                         sy_nenvs=self.sy_nenvs,
                                         sy_nsteps=self.sy_nsteps - 1,
                                         pdparamsize=pdparamsize,
                                         rec_gate_init=rec_gate_init
                                         )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_multi_head_policy(self.ph_ob[None],
                                         ph_new=self.ph_new,
                                         ph_istate=ph_istate,
                                         reuse=True,
                                         scope=scope,
                                         hidsize=hidsize,
                                         memsize=memsize,
                                         extrahid=extrahid,
                                         sy_nenvs=self.sy_nenvs,
                                         sy_nsteps=self.sy_nsteps,
                                         pdparamsize=pdparamsize,
                                         rec_gate_init=rec_gate_init
                                         )
    # Intrinsic-reward head.
    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    else:
        #self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
        self.aux_loss, self.int_rew, self.feat_var, self.max_feat = self.define_multi_head_self_prediction_rew(
            convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    self.stage_rnd = tf.constant(1.)
    self.stage_prob = tf.constant(1.)
    # Optional classifier-based reward discriminator for diversity rewards.
    if div_type == 'cls':
        with tf.variable_scope("div", reuse=False):
            #self.define_rew_discriminator(convfeat=convfeat, rep_size=256)
            with tf.variable_scope("int", reuse=False):
                self.disc_logits, self.all_div_prob, self.sp_prob, self.div_rew, self.disc_pd, self.disc_nlp = self.define_rew_discriminator_v2(
                    convfeat=convfeat, rep_size=512, use_rew=True)
    else:
        self.div_rew = tf.constant(0.)
    # Action distribution used for sampling during rollouts.
    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate
def __init__(self, scope, ob_space, ac_space,
             policy_size='normal', maxpool=False, extrahid=True, hidsize=128,
             memsize=128, rec_gate_init=0.0,
             update_ob_stats_independently_per_gpu=True,
             proportion_of_exp_used_for_predictor_update=1.,
             exploration_type='bottleneck', beta=0.001, rew_counter=None
             ):
    """Build the TF1 policy/value graph with a bottleneck exploration bonus,
    plus (unused) Grad-CAM tensors for visualization.

    Fix over the original: ``rep_size / 8`` used true division, which in
    Python 3 produces a float (64.0); sizes used to build tensor shapes
    must be integers, so it is now ``rep_size // 8``.

    Args:
        scope: variable-scope name; rollout graph reuses optimization vars.
        ob_space / ac_space: gym-style observation and action spaces.
        policy_size: 'small' | 'normal' | 'large' width multiplier.
        maxpool / rec_gate_init: accepted for signature compatibility; not
            used in this variant.
        extrahid, hidsize, memsize: forwarded to apply_policy (hidsize and
            memsize scaled by the enlargement factor).
        update_ob_stats_independently_per_gpu: when True, RunningMeanStd
            does NOT synchronize over MPI.
        proportion_of_exp_used_for_predictor_update: stored for later use.
        exploration_type: stored; selects the exploration scheme elsewhere.
        beta: bottleneck-bonus coefficient, forwarded to
            define_bottleneck_rew.
        rew_counter: optional counter forwarded to define_bottleneck_rew.
    """
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    enlargement = {
        'small': 1,
        'normal': 2,
        'large': 4
    }[policy_size]
    rep_size = 512  # base representation size (bottleneck uses rep_size // 8)
    # Per-pixel observation mean/std placeholders, e.g. (84, 84, 1).
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    memsize *= enlargement  # 256 with the default 'normal' size
    hidsize *= enlargement  # 256 with the default 'normal' size
    convfeat = 16 * enlargement  # 32 with the default 'normal' size
    self.ob_rms = RunningMeanStd(shape=list(ob_space.shape[:2]) + [1],
                                 use_mpi=not update_ob_stats_independently_per_gpu)
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')  # (None, 256)
    pdparamsize = self.pdtype.param_shape()[0]  # equals the action dimension (e.g. 18)
    self.memsize = memsize
    # Inputs to policy and value function will have different shapes depending
    # on whether it is rollout or optimization time, so we treat separately.
    # pdparam_opt: (None, None, A); vpred_int_opt / vpred_ext_opt:
    # (None, None); snext_opt: (None, memsize)
    self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
        self.apply_policy(self.ph_ob[None][:, :-1],  # drop the final step
                          reuse=False,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps - 1,
                          pdparamsize=pdparamsize)
    # Rollout graph: full sequence, variables shared via reuse=True.
    self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
        self.apply_policy(self.ph_ob[None],
                          reuse=True,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps,
                          pdparamsize=pdparamsize)
    self.exploration_type = exploration_type
    self.max_table = 0
    # Bottleneck intrinsic reward; rep_size // 8 keeps the size an int
    # (rep_size / 8 would be the float 64.0 under Python 3 true division).
    self.define_bottleneck_rew(convfeat=convfeat, rep_size=rep_size // 8,
                               enlargement=enlargement, beta=beta, rew_counter=rew_counter)
    pd = self.pdtype.pdfromflat(self.pdparam_rollout)  # action distribution (softmax over logits)
    self.a_samp = pd.sample()  # sampled action
    self.nlp_samp = pd.neglogp(self.a_samp)  # negative log-prob of the sampled action
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.a_samp_opt = self.pd_opt.sample()
    self.ph_istate = ph_istate
    self.scope = scope
    #####################################################
    ##### The tensors below are never actually used #####
    #####################################################
    # Grad-CAM for the policy head.
    a_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)  # (None, None) -> (None, None, A)
    # Effectively selects pdparam_opt at the executed action's one-hot position.
    loss_cam_pol = tf.reduce_mean(tf.multiply(self.pdparam_opt, a_one_hot))
    self.conv_out = tf.get_default_graph().get_tensor_by_name('ppo/pol/Relu_2:0')
    self.grads = tf.gradients(loss_cam_pol, self.conv_out)[0]
    # Grad-CAM for the auxiliary (KL) loss.
    loss_cam_aux = self.kl
    # NOTE(review): this minor-version check targets TF 1.x only (the
    # LeakyRelu tensor name changed in 1.10) — confirm before upgrading TF.
    if int(str(tf.__version__).split('.')[1]) < 10:
        self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2/Maximum:0')
    else:
        self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2:0')
    self.grads_aux = tf.abs(tf.gradients(loss_cam_aux, self.conv_aux_out)[0])
    # self.cams is never actually used.
    weights = tf.reduce_mean(tf.reduce_mean(self.grads, 2), 1)
    weights = tf.expand_dims(tf.expand_dims(weights, axis=1), axis=1)
    weights = tf.tile(weights, [1, 6, 6, 1])
    cams = tf.reduce_sum((weights * self.conv_out), axis=3)
    self.cams = tf.maximum(cams, tf.zeros_like(cams))
    # self.cams_aux is never actually used.
    weights_aux = tf.reduce_mean(tf.reduce_mean(self.grads_aux, 2), 1)
    weights_aux = tf.expand_dims(tf.expand_dims(weights_aux, axis=1), axis=1)
    weights_aux = tf.tile(weights_aux, [1, 7, 7, 1])
    cams_aux = tf.nn.relu(tf.reduce_sum((weights_aux * self.conv_aux_out), axis=3))
    self.cams_aux = tf.maximum(cams_aux, tf.zeros_like(cams_aux))
def __init__(self, scope, ob_space, ac_space,
             policy_size='normal', maxpool=False, extrahid=True, hidsize=128,
             memsize=128, rec_gate_init=0.0,
             update_ob_stats_independently_per_gpu=True,
             proportion_of_exp_used_for_predictor_update=1.,
             dynamics_bonus=False,
             action_balance_coef=1., array_action=True):
    """Build the TF1 policy/value graph with an additional action-balance
    intrinsic-reward head.

    Args:
        scope: variable-scope name; rollout graph reuses optimization vars.
        ob_space / ac_space: gym-style observation and action spaces.
        policy_size: 'small' | 'normal' | 'large' width multiplier.
        maxpool / rec_gate_init: accepted for signature compatibility; not
            used in this variant.
        extrahid, hidsize, memsize: forwarded to apply_policy (hidsize and
            memsize scaled by the enlargement factor).
        update_ob_stats_independently_per_gpu: when True, RunningMeanStd
            does NOT synchronize over MPI.
        proportion_of_exp_used_for_predictor_update: stored for later use.
        dynamics_bonus: if True, use dynamics prediction instead of
            self-prediction for the intrinsic reward.
        action_balance_coef: coefficient for the action-balance bonus;
            pass None to disable building the action-balance graph.
        array_action: if True, also build the per-pixel action-encoding
            array used by the action-balance head.
    """
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    self.action_balance_coef = action_balance_coef
    self.array_action = array_action
    self.enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
    self.rep_size = 512  # size of the predictor's representation/embedding
    # Placeholders for per-pixel observation mean/std used for normalization.
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    memsize *= self.enlargement
    hidsize *= self.enlargement
    self.convfeat = 16 * self.enlargement
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')
    pdparamsize = self.pdtype.param_shape()[0]
    self.memsize = memsize

    # self.int_rew_ab = None
    # self.int_rew_ab_opt = None
    # Action-balance graph is built only when a coefficient is given.
    if self.action_balance_coef is not None:
        # self.action_one_hot_list_rollout = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
        # self.action_one_hot_list_opt = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
        # with tf.device('/cpu:0'):
        # One-hot encoding of every action for each (env, step).
        self.action_one_hot_rollout = get_action_one_hot(
            self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
        # self.action_one_hot_list_opt = get_action_one_hot(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
        if self.array_action:
            # with tf.device('/cpu:0'):
            # Spatial (per-pixel) encoding of actions over the obs frame.
            self.action_encode_array_rollout = get_action_encode_array(
                self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])
            # self.action_encode_array_rollout, self.split_lengths = get_action_encode_array(
            #     self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])
        # Action-balance intrinsic-reward head over the full rollout.
        self.feat_var_ab, self.max_feat_ab, self.int_rew_ab, self.int_rew_ab_rollout, self.aux_loss_ab = \
            self.define_action_balance_rew(ph_ob=self.ph_ob[None],
                                           action_one_hot=self.action_one_hot_rollout,
                                           convfeat=self.convfeat,
                                           rep_size=self.rep_size, enlargement=self.enlargement,
                                           sy_nenvs=self.sy_nenvs,
                                           sy_nsteps=self.sy_nsteps,
                                           )
        # self.feat_var_ab_opt, self.max_feat_ab_opt, self.int_rew_ab_opt, self.aux_loss_ab = \
        #     self.define_action_balance_rew(ph_ob=self.ph_ob[None][:, :-1],
        #                                    action_one_hot=self.action_one_hot_list_opt,
        #                                    convfeat=self.convfeat,
        #                                    rep_size=self.rep_size, enlargement=self.enlargement,
        #                                    sy_nenvs=self.sy_nenvs,
        #                                    sy_nsteps=self.sy_nsteps - 1,
        #                                    )
        # NOTE(review): a distribution built from a tensor named int_rew_ab
        # (a reward?) looks suspicious — confirm define_action_balance_rew
        # really returns pd-compatible logits in that slot.
        self.pd_ab = self.pdtype.pdfromflat(self.int_rew_ab)

    # Inputs to policy and value function will have different shapes depending on whether it is rollout
    # or optimization time, so we treat separately.
    self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt, self.logits_raw_opt = \
        self.apply_policy(self.ph_ob[None][:, :-1],  # drop the final step
                          reuse=False,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps - 1,
                          pdparamsize=pdparamsize
                          )
    # Rollout graph: full sequence, variables shared via reuse=True;
    # raw logits are not needed at rollout time.
    self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout, _ = \
        self.apply_policy(self.ph_ob[None],
                          reuse=True,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps,
                          pdparamsize=pdparamsize
                          )
    # Intrinsic-reward head: dynamics prediction or self-prediction (RND).
    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=self.convfeat, rep_size=self.rep_size, enlargement=self.enlargement)
    else:
        self.define_self_prediction_rew(convfeat=self.convfeat, rep_size=self.rep_size, enlargement=self.enlargement)
    # Action distribution used for sampling during rollouts.
    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate