def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0  # scale pixel observations to [0, 1]
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX: the stochastic flag is fed but never used here
    self._act = U.function([stochastic, ob], [ac, self.vpred])
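
# --- Usage sketch (added; an assumption, not part of the original source) ---
# The snippets in this file assume the OpenAI-baselines-style helpers:
# `gym`, `tensorflow as tf`, `baselines.common.tf_util as U`, `make_pdtype`
# from baselines.common.distributions, and RunningMeanStd. `step_policy`,
# `pi`, and `ob` below are hypothetical names: a constructed policy object
# and a single unbatched NumPy observation.

def step_policy(pi, ob, stochastic=True):
    # _act was built by U.function([stochastic, ob], [ac, self.vpred]),
    # so it takes the boolean flag and a *batched* observation and
    # returns batched actions and value predictions.
    ac, vpred = pi._act(stochastic, ob[None])
    return ac[0], vpred[0]
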
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # value function tower
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # policy tower
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
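
# Hedged sketch (added): U.switch above picks a sampled action when the
# boolean "stochastic" placeholder is True and the distribution mode
# (deterministic action) otherwise. A rough raw-TF1 equivalent, not the
# library's exact implementation:
import tensorflow as tf

def switch_sketch(condition, then_tensor, else_tensor):
    # tf.cond executes only the selected branch at session run time
    return tf.cond(tf.cast(condition, tf.bool),
                   lambda: then_tensor,
                   lambda: else_tensor)
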
def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_proba_dist_type(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    obscaled = ob / 255.0

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits",
                         U.normc_initializer(0.01))
        self.pd = pdtype.proba_distribution_from_flat(logits)

    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
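
# Hedged sketch (added): for a Discrete action space, pd.sample() above draws
# from the softmax over `logits`. Baselines-style categorical sampling uses
# the Gumbel-max trick; roughly (an assumption, not the exact library code):
import tensorflow as tf

def sample_categorical(logits):
    # argmax over Gumbel-perturbed logits is equivalent to sampling
    # from the softmax distribution over the logits
    u = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(u)), axis=-1)
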
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)  # raises if the condition is false
    # print("mlp_policy/20lines")  # executed twice
    # print("ac_space.shape[0]", ac_space.shape[0])  # prints 3

    # returns DiagGaussianPdType(ac_space.shape[0]); pdclass() is called at the end
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    # returns tf.placeholder(dtype=dtype, shape=shape, name=name)
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    # print("obspace.shape:::", list(ob_space.shape))  # prints [11]

    with tf.variable_scope("obfilter"):
        # print("gail-tf/gailtf/baselines/ppo1/mlp_policy.py/28lines:")
        # this function is hard to follow; see the sketch after this snippet
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # ob is still a placeholder at this point
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # builds the fully connected layers of the value function
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    # the final layer here does not output the action space (it is a scalar value head)
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        print("gaussian_fixed_var is used")
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        # print("gaussian_fixed_var is not used")  # this branch is not taken
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    # since the pdtype above is DiagGaussianPdType, this returns a DiagGaussianPd;
    # pd provides kl, entropy, sample, and other methods
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
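
# Hedged sketch (added): what RunningMeanStd does, since the original comment
# found it opaque. It keeps running estimates of observation mean and std,
# updated batch-by-batch with a parallel-variance merge, so obz above is an
# approximately whitened observation clipped to [-5, 5]. A NumPy analogue,
# not the actual TF implementation:
import numpy as np

class RunningMeanStdSketch:
    def __init__(self, shape, eps=1e-2):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = eps  # small prior count avoids division by zero

    def update(self, batch):
        # merge batch moments into the running moments (Chan et al. formula)
        b_mean, b_var, b_count = batch.mean(0), batch.var(0), batch.shape[0]
        delta = b_mean - self.mean
        tot = self.count + b_count
        new_mean = self.mean + delta * b_count / tot
        m2 = self.var * self.count + b_var * b_count \
             + delta ** 2 * self.count * b_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

    @property
    def std(self):
        return np.sqrt(self.var)
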
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    last_action = U.get_placeholder(shape=(None, 524), dtype=tf.float32,
                                    name="last_action_one_hot")

    self.msize = 64  # change to 64 later
    self.ssize = 64
    self.isize = 11
    self.available_action_size = 524

    available_action = ob[:, (5*self.msize*self.msize + 10*self.ssize*self.ssize + self.isize):
                             (5*self.msize*self.msize + 10*self.ssize*self.ssize + self.isize
                              + self.available_action_size)]
    # ob = ob[:, :-(self.available_action_size)]

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -20.0, 20.0)
    obz = (ob - self.ob_rms.mean) / self.ob_rms.std

    minimap = obz[:, 0:5*self.msize*self.msize]
    # minimap /= 2
    screen = obz[:, 5*self.msize*self.msize:
                    5*self.msize*self.msize + 10*self.ssize*self.ssize]
    # screen /= 2
    info = obz[:, (5*self.msize*self.msize + 10*self.ssize*self.ssize):
                  (5*self.msize*self.msize + 10*self.ssize*self.ssize + self.isize)]
    # info /= 2

    # get value prediction, critic
    mconv1 = tf.layers.conv2d(
        inputs=tf.reshape(minimap, [-1, self.msize, self.msize, 5]),
        filters=32, kernel_size=[5, 5], padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu)
    mpool1 = tf.layers.max_pooling2d(inputs=mconv1, pool_size=[2, 2], strides=2)
    mconv2 = tf.layers.conv2d(
        inputs=mpool1, filters=64, kernel_size=[5, 5], padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu, name="vffcmconv2")
    mpool2 = tf.layers.max_pooling2d(inputs=mconv2, pool_size=[2, 2], strides=2)
    mpool2_flat = tf.reshape(mpool2, [-1, 16 * 16 * 64])

    sconv1 = tf.layers.conv2d(
        inputs=tf.reshape(screen, [-1, self.ssize, self.ssize, 10]),
        filters=48, kernel_size=[5, 5], padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu)
    spool1 = tf.layers.max_pooling2d(inputs=sconv1, pool_size=[2, 2], strides=2)
    sconv2 = tf.layers.conv2d(
        inputs=spool1, filters=80, kernel_size=[5, 5], padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu)
    spool2 = tf.layers.max_pooling2d(inputs=sconv2, pool_size=[2, 2], strides=2)
    spool2_flat = tf.reshape(spool2, [-1, 16 * 16 * 80])

    info_fc = tf.layers.dense(inputs=layers.flatten(info), units=8,
                              activation=tf.tanh)
    aa_fc = tf.layers.dense(inputs=layers.flatten(available_action), units=32,
                            activation=tf.tanh)

    # encode the last action with an LSTM
    HIDDEN_SIZE = 128
    l1_action = tf.layers.dense(layers.flatten(last_action), 256, tf.nn.relu)
    input_to_rnn = tf.reshape(l1_action, [-1, 16, 16])
    action_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=HIDDEN_SIZE,
                                                    forget_bias=1.0,
                                                    state_is_tuple=True)
    inputs_rnn = tf.unstack(input_to_rnn, num=16, axis=1)
    rnn_outputs, rnn_state = tf.contrib.rnn.static_rnn(action_lstm_cell, inputs_rnn,
                                                       dtype=tf.float32)
    l2_action = tf.layers.dense(rnn_state[-1], 128, tf.nn.tanh)  # hidden layer
    last_acs_ph_lstm = tf.layers.dense(l2_action, 32, tf.nn.tanh)

    last_out = tf.concat([mpool2_flat, spool2_flat, info_fc, aa_fc,
                          last_acs_ph_lstm], axis=1)
    vf_last_out = tf.nn.tanh(U.dense(last_out, 1024, 'vf_last_out',
                                     weight_init=U.normc_initializer(1.0)))
    # vf_last_out_2 = tf.nn.tanh(U.dense(vf_last_out, 64, 'vf_last_out_2',
    #                                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(vf_last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pol_last_out = U.dense(last_out, (pdtype.param_shape()[0]) * 5,
                               "polfinaldense", U.normc_initializer(0.01))
        pdparam = U.dense(pol_last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(available_action),
                  self.pd.mode(available_action))
    self.ac = ac
    self._act = U.function([stochastic, ob, last_action], [ac, self.vpred])
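
# Hedged sketch (added): the flat observation layout assumed by the slicing
# above. The ob vector concatenates 5 minimap channels (64x64), 10 screen
# channels (64x64), an 11-dim info vector, and a 524-dim available-action
# mask. `split_ob` is a hypothetical helper for illustration only.
MSIZE = SSIZE = 64
MINIMAP = 5 * MSIZE * MSIZE   # 20480
SCREEN = 10 * SSIZE * SSIZE   # 40960
INFO = 11
AVAIL = 524

def split_ob(ob):
    # ob: 1-D array of length MINIMAP + SCREEN + INFO + AVAIL = 61975
    i0, i1 = MINIMAP, MINIMAP + SCREEN
    i2, i3 = i1 + INFO, i1 + INFO + AVAIL
    return ob[:i0], ob[i0:i1], ob[i1:i2], ob[i2:i3]
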
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -20.0, 20.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # last_out = obz
    # for i in range(num_hid_layers):
    #     last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
    #                                   weight_init=U.normc_initializer(1.0)))

    ### add conv net instead of using dense
    self.msize = 64  # change to 64 later
    self.ssize = 64
    self.isize = 11
    self.available_action_size = 524

    minimap = obz[:, 0:5 * self.msize * self.msize]
    screen = obz[:, 5 * self.msize * self.msize:
                    5 * self.msize * self.msize + 10 * self.ssize * self.ssize]
    info = obz[:, (5 * self.msize * self.msize + 10 * self.ssize * self.ssize):
                  (5 * self.msize * self.msize + 10 * self.ssize * self.ssize
                   + self.isize)]
    available_action = obz[:, (5 * self.msize * self.msize + 10 * self.ssize * self.ssize
                               + self.isize):
                              (5 * self.msize * self.msize + 10 * self.ssize * self.ssize
                               + self.isize + self.available_action_size)]

    conv1_minimap = tf.layers.conv2d(
        inputs=tf.reshape(minimap, [-1, self.msize, self.msize, 5]),
        filters=10, kernel_size=5, strides=1, padding='same',
        activation=tf.nn.leaky_relu, name="polmconv1")           # -> (64, 64, 10)
    pool1_minimap = tf.layers.max_pooling2d(
        conv1_minimap, pool_size=4, strides=4, name="polmpool1")  # -> (16, 16, 10)
    conv2_minimap = tf.layers.conv2d(
        pool1_minimap, 10, 5, 1, 'same',
        activation=tf.nn.relu, name="polmconv2")                  # -> (16, 16, 10)
    pool2_minimap = tf.layers.max_pooling2d(
        conv2_minimap, 2, 2, name="polmpool2")                    # -> (8, 8, 10)
    flat_minimap = tf.reshape(pool2_minimap, [-1, 8 * 8 * 10])    # -> (8*8*10, )
    # dense_minimap = tf.layers.dense(inputs=flat_minimap, units=1024, activation=tf.nn.relu)
    # # dropout_minimap = tf.layers.dropout(
    # #     inputs=dense_minimap, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
    # minimap_output = tf.layers.dense(dense_minimap, 64)

    conv1_screen = tf.layers.conv2d(
        inputs=tf.reshape(screen, [-1, self.ssize, self.ssize, 10]),  # (64, 64, 10)
        filters=20, kernel_size=5, strides=1, padding='same',
        activation=tf.nn.leaky_relu, name="polsconv1")            # -> (64, 64, 20)
    pool1_screen = tf.layers.max_pooling2d(
        conv1_screen, pool_size=4, strides=4, name="polspool1")   # -> (16, 16, 20)
    conv2_screen = tf.layers.conv2d(
        pool1_screen, 20, 5, 1, 'same',
        activation=tf.nn.relu, name="polsconv2")                  # -> (16, 16, 20)
    pool2_screen = tf.layers.max_pooling2d(
        conv2_screen, 2, 2, name="polspool2")                     # -> (8, 8, 20)
    flat_screen = tf.reshape(pool2_screen, [-1, 8 * 8 * 20])      # -> (8*8*20, )
    # dense_screen = tf.layers.dense(inputs=flat_screen, units=1024, activation=tf.nn.relu)
    # # dropout_screen = tf.layers.dropout(
    # #     inputs=dense_screen, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
    # screen_output = tf.layers.dense(dense_screen, 64, tf.nn.relu)

    info_fc = tf.layers.dense(inputs=layers.flatten(info), units=4,
                              activation=tf.tanh, name="poldense1")
    aa_fc = tf.layers.dense(inputs=layers.flatten(available_action), units=16,
                            activation=tf.tanh, name="poldense2")

    last_out = tf.concat([flat_minimap, flat_screen, info_fc, aa_fc],
                         axis=1, name="polconcat")
    # last_out = tf.layers.dense(inputs=last_out, units=600, name="poldense3")
    # last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc1",
    #                               weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
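
# Hedged shape check (added) for the conv towers above, assuming 64x64 inputs
# and 'same' conv padding: the convs preserve 64x64, pool(4,4) -> 16x16,
# pool(2,2) -> 8x8, so last_out has 8*8*10 + 8*8*20 + 4 + 16 = 1940 features.
def pooled_size(size, pool, stride):
    # output spatial size of max_pooling2d with 'valid' padding (the default)
    return (size - pool) // stride + 1

assert pooled_size(64, 4, 4) == 16
assert pooled_size(16, 2, 2) == 8
LAST_OUT_FEATURES = 8 * 8 * 10 + 8 * 8 * 20 + 4 + 16  # 1940
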