def _init(self, obs_space, ac_space, embedding_shape, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the action-decoder policy graph.

    Concatenates an observation with a latent embedding, normalizes the
    result with a running mean/std filter, and maps it through a ReLU MLP
    to a diagonal-Gaussian action distribution.

    Args:
        obs_space: observation space; only obs_space.shape[0] is used.
        ac_space: action space; ac_space.shape[0] sizes the distribution.
        embedding_shape: width of the latent-embedding placeholder.
        hid_size: per-layer hidden widths (indexed with hid_size[i]).
        num_hid_layers: number of hidden layers.
        gaussian_fixed_var: if True, use a state-independent learned logstd.
    """
    self.pdtype = pdtype = make_pdtype(ac_space.shape[0])
    batch_size = None  # variable batch dimension
    ob = U.get_placeholder(name="ac_de_ob", dtype=tf.float32,
                           shape=[batch_size, obs_space.shape[0]])
    # Original note (translated): "I think this is one embedding value
    # broadcast to sequence_len; ignore for now until we actually get to it."
    embedding = U.get_placeholder(name="ac_de_embedding", dtype=tf.float32,
                                  shape=[batch_size, embedding_shape])
    # Normalize the concatenated (observation, embedding) input.
    last_out = U.concatenate([ob, embedding], axis=1)
    with tf.variable_scope("ac_de_filter"):
        self.ac_rms = RunningMeanStd(shape=obs_space.shape[0] + embedding_shape)
    last_out = tf.clip_by_value(
        (last_out - self.ac_rms.mean) / self.ac_rms.std, -5.0, 5.0)
    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size[i], "ac_de%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space.shape[0], int):
        # Mean head plus a single learned, state-independent log-std.
        self.mean = U.dense(last_out, pdtype.param_shape()[0] // 2,
                            "ac_de_final", U.normc_initializer(1.0))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + logstd], axis=1)
    else:
        # NOTE(review): this branch never sets self.mean, yet
        # self._get_pol_mean below references it — confirm this branch
        # is effectively dead for all callers.
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "ac_de_final",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob, embedding], ac)
    self._get_pol_mean = U.function([ob, embedding], self.mean)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0, w_intfc=True, k=0.):
    """Build the interest-option-critic policy graph.

    From a shared normalized observation input, builds: a per-option value
    head, a per-option termination head (sigmoid, trunk gradients stopped),
    a per-option Gaussian intra-option policy, an interest function, and a
    softmax policy over options.

    Args:
        ob_space: gym.spaces.Box observation space.
        ac_space: action space; ac_space.shape[0] sizes the Gaussian.
        hid_size: hidden-layer width (scalar, shared by every head).
        num_hid_layers: number of hidden layers per head.
        gaussian_fixed_var: if True, use a per-option learned logstd.
        num_options: number of options.
        dc: deliberation cost; stored on self, not used in this method.
        w_intfc: whether interest functions are used; stored on self.
        k: stored on self (semantics defined by callers).
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.k = k
    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    # Fix: original read `self.pdtype = pdtype = pdtype = ...`; the
    # duplicated `pdtype =` was redundant.
    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Per-option value-function head.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="vffc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]

    # Per-option termination head; stop_gradient keeps the termination
    # loss from updating this trunk.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="termfc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.tpred = tf.nn.sigmoid(dense3D2(
        tf.stop_gradient(last_out), 1, "termhead", option,
        num_options=num_options,
        weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.greater(
        self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    # Intra-option Gaussian policy.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="polfc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                        option, num_options=num_options,
                        weight_init=U.normc_initializer(0.5))
        # Per-option log-std, indexed by the first option id in the batch.
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=U.normc_initializer(0.1), trainable=True)
        pdparam = tf.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    # NOTE(review): there is no else-branch, so pdparam is undefined when
    # gaussian_fixed_var is False — confirm callers never pass False.
    self.pd = pdtype.pdfromflat(pdparam)
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Interest function (independent per-option sigmoid).
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="intfc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.intfc = tf.sigmoid(tf.layers.dense(
        last_out, num_options, name="intfcfinal",
        kernel_initializer=U.normc_initializer(1.0)))

    # Policy over options (softmax).
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="OP%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.op_pi = tf.nn.softmax(tf.layers.dense(
        last_out, num_options, name="OPfinal",
        kernel_initializer=U.normc_initializer(1.0)))

    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, obs_space, embedding_shape, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the state-decoder graph.

    Concatenates an observation with a latent embedding, normalizes it with
    a running mean/std filter, and maps it through a tanh MLP to a diagonal
    Gaussian sized by obs_space.shape[0].

    Args:
        obs_space: observation space; obs_space.shape[0] sizes both the
            input and the decoded distribution.
        embedding_shape: width of the latent-embedding placeholder.
        hid_size: per-layer hidden widths (indexed with hid_size[i]).
        num_hid_layers: number of hidden layers.
        gaussian_fixed_var: if True, use a state-independent learned logstd.
    """
    self.pdtype = pdtype = make_pdtype(obs_space.shape[0])
    batch_size = None  # variable batch dimension
    ob_input = U.get_placeholder(name="ob", dtype=tf.float32,
                                 shape=[batch_size, obs_space.shape[0]])
    # Original note (translated): "I think this is one embedding value
    # broadcast to sequence_len; handle it when we actually get there."
    embedding = U.get_placeholder(name="embedding", dtype=tf.float32,
                                  shape=[batch_size, embedding_shape])
    # Policy only — no value function here. Original note also asks to
    # double-check that this concatenate is correct.
    last_out = U.concatenate([ob_input, embedding], axis=1)
    # Normalize the concatenated input.
    with tf.variable_scope("state_de_filter"):
        self.state_rms = RunningMeanStd(shape=obs_space.shape[0] + embedding_shape)
    input_z = tf.clip_by_value(
        (last_out - self.state_rms.mean) / self.state_rms.std, -5.0, 5.0)
    for i in range(num_hid_layers):
        input_z = tf.nn.tanh(
            U.dense(input_z, hid_size[i], "state_de%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(obs_space.shape[0], int):
        self.mean = U.dense(input_z, pdtype.param_shape()[0] // 2,
                            "state_de_final", U.normc_initializer(0.01))
        self.logstd = tf.get_variable(
            name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    else:
        # NOTE(review): this branch feeds the *unnormalized* concat
        # (last_out), skipping the MLP, and leaves self.mean unset even
        # though get_mean below references it — confirm it is dead code.
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "state_de_final",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    self._act = U.function([ob_input, embedding], self.pd.sample())
    self.get_mean = U.function([ob_input, embedding], self.mean)
def _init(self, obs_space, batch_size, time_steps, LSTM_size, laten_size, gaussian_fixed_var=True):
    """Bidirectional-LSTM trajectory encoder.

    Encodes a (batch, time, obs_dim) trajectory into a diagonal Gaussian
    over a latent of size laten_size; both the mean and the logstd are
    dense heads on the time-averaged LSTM outputs.

    Args:
        obs_space: observation space; obs_space.shape[0] is the feature width.
        batch_size: batch dimension of the input placeholder.
        time_steps: number of timesteps per trajectory.
        LSTM_size: hidden size of each LSTM direction.
        laten_size: latent dimensionality passed to make_pdtype.
        gaussian_fixed_var: if True (and laten_size is an int), emit separate
            mean / logstd heads; otherwise a single flat parameter head.
    """
    # Original note (translated): "check later whether the variance is
    # actually being updated."
    self.pdtype = pdtype = make_pdtype(laten_size)
    obs = U.get_placeholder("en_ob", dtype=tf.float32,
                            shape=[batch_size, time_steps, obs_space.shape[0]])
    # Normalize observations with a running mean/std filter.
    # Original note (translated): "check whether this actually has an effect."
    with tf.variable_scope("obfilter"):
        self.obs_rms = RunningMeanStd(shape=obs_space.shape)
    obz = tf.clip_by_value((obs - self.obs_rms.mean) / self.obs_rms.std, -5.0, 5.0)
    lstm_fw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    lstm_bw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    outputs, output_state = tf.nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, obz, dtype=tf.float32)
    # NOTE(review): outputs[0] is the forward-direction output only; the
    # backward-direction outputs[1] is unused — confirm that is intended.
    outputs_average = tf.reduce_mean(outputs[0], axis=1)
    if gaussian_fixed_var and isinstance(laten_size, int):
        self.mean = U.dense(outputs_average, pdtype.param_shape()[0] // 2,
                            "dblstmfin", U.normc_initializer(1.0))
        # State-dependent logstd head (unlike the fixed logstd variable
        # used by the other policy builders in this file).
        self.logstd = U.dense(outputs_average, pdtype.param_shape()[0] // 2,
                              "dblstm_logstd", U.normc_initializer(1.0))
        # self.logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
        #                               initializer=tf.constant_initializer(0.1))
        # (original note, translated: "maybe this variant is problematic too")
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    else:
        pdparam = U.dense(outputs_average, pdtype.param_shape()[0],
                          "dblstmfin", U.normc_initializer(0.1))
    self.pd = pdtype.pdfromflat(pdparam)
    self._encode = U.function([obs], self.pd.sample())
    self._get_mean = U.function([obs], self.mean)
def learn(env, model_path, data_path, policy_fn, *,
          rolloutSize,
          num_options=4,
          horizon=80,
          clip_param=0.025, ent_coeff=0.01,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, mainlr=3.25e-4, intlr=1e-4, piolr=1e-4, termlr=5e-7,
          optim_batchsize=100,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=20,  # time constraint
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False,
          ):
    """
    Core learning function.

    PPO-style optimization of an interest-option-critic policy: builds the
    clipped-surrogate / value / termination / interest / policy-over-options
    losses once, then alternates rollout collection with per-option minibatch
    Adam updates until `max_iters` iterations have run. Rollouts are pickled
    to `data_path` every iteration.

    Returns:
        pi: the trained policy network.
    """
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space, num_options=num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, num_options=num_options)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed cliping parameter epislon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])
    op_adv = tf.placeholder(dtype=tf.float32, shape=[None])  # advantage for option-level updates
    betas = tf.placeholder(dtype=tf.float32, shape=[None])  # termination probabilities of the last option
    ac = pi.pdtype.sample_placeholder([None])

    # Setup losses and stuff
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    activated_options = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)
    # Interest-weighted option distribution, renormalized over options.
    pi_I = (pi.intfc * activated_options) * pi_w / tf.expand_dims(
        tf.reduce_sum((pi.intfc * activated_options) * pi_w, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    int_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_I = (intfc * activated_options) * pi.op_pi / tf.expand_dims(
        tf.reduce_sum((intfc * activated_options) * pi.op_pi, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    op_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    # Entropy bonus on the policy over options.
    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option],
                             losses + [U.flatgrad(total_loss, var_list)])
    termgrad = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)])  # Since we will use a different step size.
    opgrad = U.function([ob, option, betas, op_adv, intfc, activated_options],
                        [U.flatgrad(op_loss, var_list)])  # Since we will use a different step size.
    intgrad = U.function([ob, option, betas, op_adv, pi_w, activated_options],
                         [U.flatgrad(int_loss, var_list)])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    datas = [0 for _ in range(num_options)]

    if retrain:
        print("Retraining to New Task !! ")
        time.sleep(2)
        U.load_state(model_path + '/')

    p = []
    max_timesteps = int(horizon * rolloutSize * max_iters)
    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        render = False

        rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize, render=render)
        # Save rollouts
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        pickle.dump(p, open(data_file_name, "wb"))

        add_vtarg_and_adv(rollouts, gamma, lam, num_options)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(rollouts['opt_dur'][i]) if len(rollouts['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        ob, ac, opts, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts["opts"], rollouts["adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Optimizing the policy
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                continue

            datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices],
                                          atarg=atarg[indices], vtarg=tdlamret[indices]),
                                     shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin int is the documented replacement.
            optim_epochs_corrected = np.clip(int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs_corrected):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                                    batch["vtarg"], cur_lrmult, [opt])
                    adam.update(grads, mainlr * cur_lrmult)
                    losses.append(newlosses)

        # Optimize termination functions
        termg = termgrad(rollouts["ob"], rollouts['opts'], rollouts["op_adv"])[0]
        adam.update(termg, termlr)
        # Optimize interest functions
        intgrads = intgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"],
                           rollouts["op_adv"], rollouts["op_probs"], rollouts["activated_options"])[0]
        adam.update(intgrads, intlr)
        # Optimize policy over options
        opgrads = opgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"],
                         rollouts["op_adv"], rollouts["intfc"], rollouts["activated_options"])[0]
        adam.update(opgrads, piolr)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, vae_pol_mean, gaussian_fixed_var=True):
    """MLP actor-critic whose Gaussian policy mean is offset by a VAE mean.

    Builds a value head and a policy head over filtered observations; in
    the Gaussian branch the policy mean is the dense output plus
    `vae_pol_mean` (a residual around the VAE-provided mean).

    Args:
        ob_space: gym.spaces.Box observation space.
        ac_space: action space passed to make_pdtype.
        hid_size: per-layer hidden widths (indexed with hid_size[i]).
        num_hid_layers: number of hidden layers.
        vae_pol_mean: tensor added to the policy mean head's output.
        gaussian_fixed_var: if True, learn a state-independent logstd
            (initialized to constant 0.1).
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    # Value-function head.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]
    # Policy head.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        # Residual mean: small-init dense output plus the VAE policy mean.
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01)) + vae_pol_mean
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer(0.1))
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    # change for BC: a cached named placeholder replaces the raw
    # tf.placeholder below (presumably so other code can look it up by
    # name — confirm against the behavior-cloning callers).
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Standard MLP actor-critic.

    A tanh value network ('vf' scope) and a tanh Gaussian policy network
    ('pol' scope) share a running-mean/std observation filter; actions are
    sampled or taken at the mode depending on the `stochastic` flag.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = DiagGaussianPdType(ac_space.shape[0])
    pdtype = self.pdtype
    seq_len = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[seq_len] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Critic Network
    with tf.variable_scope('vf'):
        # Clip the filtered observation to +/- 5 std.
        ob_clipped = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        net = ob_clipped
        for layer in range(1, num_hid_layers + 1):
            net = tf.nn.tanh(tf.layers.dense(
                net, hid_size, name="fc%i" % layer,
                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(
            net, 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # Actor Network
    with tf.variable_scope('pol'):
        net = ob_clipped
        for layer in range(1, num_hid_layers + 1):
            net = tf.nn.tanh(tf.layers.dense(
                net, hid_size, name='fc%i' % layer,
                kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # Mean head plus a state-independent learned log-std.
            pol_mean = tf.layers.dense(
                net, pdtype.param_shape()[0] // 2, name='final',
                kernel_initializer=U.normc_initializer(0.01))
            log_std = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([pol_mean, pol_mean * 0.0 + log_std], axis=1)
        else:
            pdparam = tf.layers.dense(
                net, pdtype.param_shape()[0], name='final',
                kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, model, hid_size, num_hid_layers, num_options=2, term_prob=0.5, eps=0.0005):
    """Option-conditioned Gaussian policy with a per-option value head.

    `hid_size` and `num_hid_layers` are two-element sequences: index 0
    sizes the value network, index 1 the intra-option policy network.

    Args:
        ob_space: gym.spaces.Box observation space.
        ac_space: action space; ac_space.shape[0] is the action dimension.
        model: stored on self (semantics not visible in this method).
        hid_size: [value_hid_size, policy_hid_size].
        num_hid_layers: [value_layers, policy_layers].
        num_options: number of options.
        term_prob: fixed termination probability stored on self.
        eps: epsilon stored on self (presumably exploration — confirm
            against callers).
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.state_in = []
    self.state_out = []
    self.term_prob = term_prob
    self.num_options = num_options
    # Creating the policy network
    sequence_length = None
    self.ac_dim = ac_space.shape[0]
    self.model = model
    self.eps = eps
    self.trained_options = []

    ob = U.get_placeholder(name="ob", dtype=tf1.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf1.int32, shape=[None])
    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])

    with tf1.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf1.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    # Value function
    for i in range(num_hid_layers[0]):
        last_out = tf1.nn.tanh(
            tf1.layers.dense(last_out, hid_size[0], name="vffc%i" % (i + 1),
                             kernel_initializer=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]
    # Intra option policy
    # NOTE(review): this head consumes the raw `ob`, not the filtered `obz`
    # used by the value head — confirm the asymmetry is intended.
    last_out = ob
    for i in range(num_hid_layers[1]):
        last_out = tf1.nn.tanh(
            tf1.layers.dense(last_out, hid_size[1], name="polfc%i" % (i + 1),
                             kernel_initializer=U.normc_initializer(1.0)))
    mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                    num_options=num_options,
                    weight_init=U.normc_initializer(-0.2))
    # Per-option, state-independent log-std; the row is selected by the
    # first option id in the batch.
    logstd = tf1.get_variable(
        name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2],
        initializer=U.normc_initializer(0.1), trainable=True)
    pdparam = tf1.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    # pdparam = dense3D2(last_out, pdtype.param_shape()[0], "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(-0.6))
    self.pd = pdtype.pdfromflat(pdparam)
    stochastic = tf1.placeholder(dtype=tf1.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob, option], [ac])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self.action_pd = U.function([ob, option], [self.pd.mode(), self.pd.variance()])
def _create_network(self, obs_shape, embedding_shape):
    '''Construct the WaveNet network.

    Builds a causal convolution followed by the dilated-convolution stack
    (globally conditioned on an embedding), sums the skip connections,
    post-processes them with two ReLU + 1x1-conv stages, and finally maps
    the result to a diagonal-Gaussian distribution over states.

    Args:
        obs_shape: observation space; obs_shape.shape[0] is the per-step
            feature width of the input sequence.
        embedding_shape: width of the global-conditioning embedding.
    '''
    import common.tf_util as U
    outputs = []
    # Fix: `batch_size` was referenced but never defined here (NameError at
    # graph-build time). Use a variable batch dimension, matching the other
    # placeholder builders in this file.
    batch_size = None
    # (Removed unused local `sequence_length = 1`.)
    input_batch = U.get_placeholder(
        name="state_de_ob", dtype=tf.float32,
        shape=[batch_size, self.time_steps - 1, obs_shape.shape[0]])  # 3-D input
    global_condition_batch = U.get_placeholder(
        name="state_de_embedding", dtype=tf.float32,
        shape=[batch_size, 1, embedding_shape])
    current_layer = input_batch
    # Pre-process the input with a regular (causal) convolution.
    # Original note (translated): "this does not work" — confirm.
    current_layer = self._create_causal_layer(current_layer)
    # output_width = tf.shape(input_batch)[1] - self.receptive_field + 1
    output_width = input_batch.shape[1] - self.receptive_field + 1
    # Add all defined dilation layers.
    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(self.dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                output, current_layer = self._create_dilation_layer(
                    current_layer, layer_index, dilation,
                    global_condition_batch, output_width)
                outputs.append(output)
    with tf.name_scope('postprocessing'):
        # Perform (+) -> ReLU -> 1x1 conv -> ReLU -> 1x1 conv to
        # postprocess the output.
        w1 = self.variables['postprocessing']['postprocess1']
        w2 = self.variables['postprocessing']['postprocess2']
        if self.use_biases:
            b1 = self.variables['postprocessing']['postprocess1_bias']
            b2 = self.variables['postprocessing']['postprocess2_bias']
        if self.histograms:
            # Fix: tf.histogram_summary was removed in TF 1.0; this file
            # targets TF 1.x (tf.layers, tf.variable_scope elsewhere), so
            # the old calls would raise AttributeError when enabled.
            tf.summary.histogram('postprocess1_weights', w1)
            tf.summary.histogram('postprocess2_weights', w2)
            if self.use_biases:
                tf.summary.histogram('postprocess1_biases', b1)
                tf.summary.histogram('postprocess2_biases', b2)
        # We skip connections from the outputs of each layer, adding them
        # all up here.
        total = sum(outputs)
        transformed1 = tf.nn.relu(total)
        conv1 = tf.nn.conv1d(transformed1, w1, stride=1, padding="SAME")
        if self.use_biases:
            conv1 = tf.add(conv1, b1)
        transformed2 = tf.nn.relu(conv1)
        conv2 = tf.nn.conv1d(transformed2, w2, stride=1, padding="SAME")
        if self.use_biases:
            conv2 = tf.add(conv2, b2)
    # Map the post-processed features to a diagonal Gaussian.
    conv2 = tf.reshape(conv2, [-1, self.quantization_channels])
    # NOTE(review): the 63-wide mean/logstd heads are hard-coded (original
    # note: "48 * 63") — presumably tied to the environment's state
    # dimension; confirm before reusing with a different environment.
    self.mean = U.dense(conv2, 63, "wave_mean", U.normc_initializer(1.0))
    self.logstd = U.dense(conv2, 63, "wave_logstd",
                          weight_init=U.normc_initializer(1.0))
    pdparm = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(pdparm)
    self._act = U.function([input_batch, global_condition_batch], [self.pd.sample()])