def _init(self, ob_space, ac_space):
    """Small CNN actor-critic over pixel observations.

    Builds separate "pol" and "vf" conv towers on the scaled input and
    exposes self._act mapping (stochastic_flag, obs) -> (action, value).
    NOTE(review): the stochastic placeholder is part of the act signature
    but is never used — the action is always sampled (see the XXX line).
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    obscaled = ob / 255.0  # scale raw pixels to [0, 1]
    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        # NOTE(review): value output is not squeezed with [:,0] as in the
        # sibling MLP policies — confirm downstream code expects shape [N, 1].
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX always samples; stochastic flag ignored
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, kind):
    """CNN actor-critic over pixels; `kind` selects the conv architecture.

    'small' is the A3C-paper network, 'large' the Nature-DQN network.
    Policy logits and value prediction share the same feature trunk.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    x = ob / 255.0  # scale raw pixels to [0, 1]
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    # Squeeze the trailing unit dim so vpred is a [batch] vector.
    self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX always samples; stochastic flag ignored
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Tanh-MLP actor-critic with running observation normalization.

    Value head uses scopes "vffc*"/"vffinal", policy head "polfc*"/"polfinal".
    For Box action spaces with `gaussian_fixed_var`, the policy outputs a
    state-independent learned log-std alongside the action mean.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    # Normalize observations and clip to +/- 5 standard deviations.
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        # mean * 0.0 + logstd broadcasts the shared logstd across the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # Sample when stochastic, otherwise take the distribution mode.
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, kind):
    """CNN actor-critic over pixels; `kind` selects the conv architecture.

    'small' is the A3C-paper network, 'large' the Nature-DQN network.
    NOTE(review): this is functionally identical to the other kind-based
    CNN `_init` in this file — consider deduplicating.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    x = ob / 255.0  # scale raw pixels to [0, 1]
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    # Squeeze the trailing unit dim so vpred is a [batch] vector.
    self.vpred = tf.layers.dense(
        x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX always samples; stochastic flag ignored
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space):
    """Actor-critic wrapper around an externally defined `keras_net`.

    The network produces policy logits and a value prediction directly from
    the observation placeholder.  The boolean flag in the act-function
    signature is accepted but unused — actions are always sampled.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    obs_shape = [None] + list(ob_space.shape)  # leading batch dim left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=obs_shape)
    logits, self.vpred = keras_net(ob)
    self.pd = pdtype.pdfromflat(logits)
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    is_stochastic = tf.placeholder(dtype=tf.bool, shape=())
    sampled_action = self.pd.sample()  # XXX flag ignored, always samples
    self._act = U.function([is_stochastic, ob], [sampled_action, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, noisy_nets=False, gaussian_fixed_var=True):
    """SELU-MLP actor-critic, optionally using NoisyNet policy layers.

    Also exports self._vpred_pdparam, a function returning the value
    prediction and the raw distribution parameters for a batch of obs.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    # Normalize observations and clip to +/- 5 standard deviations.
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.selu(U.dense(last_out, hid_size, "vffc%i"%(i + 1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
    last_out = obz
    for i in range(num_hid_layers):
        if noisy_nets:
            last_out = tf.nn.selu(U.noisy_dense(last_out, hid_size, "noisy_polfc%i"%(i + 1), weight_init=U.normc_initializer(1.0)))
        else:
            last_out = tf.nn.selu(U.dense(last_out, hid_size, "polfc%i"%(i + 1), weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        # NoisyNets is not supported together with the fixed-var Gaussian head.
        assert(noisy_nets is False)
        mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        # mean * 0.0 + logstd broadcasts the shared logstd across the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        if noisy_nets:
            pdparam = U.noisy_dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    self.pdparam = pdparam
    self.pd = pdtype.pdfromflat(pdparam)
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
    self._vpred_pdparam = U.function([ob], [self.vpred, self.pdparam])
    self.ob = ob
def _init(self, ob_space, ac_space):
    """Actor-critic built from external generator/discriminator networks.

    The value head is a `discriminator_model` and the policy parameters come
    from a `generator_model`; the commented-out code below is the plain MLP
    implementation these replaced.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope('vf'):
        # Normalize observations and clip to +/- 5 standard deviations.
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
        # self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
        self.vpred = discriminator_model([last_out], drop_rate=0.5)
    with tf.variable_scope('pol'):
        last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
        pdparam = generator_model([last_out], pdtype.param_shape()[0], drop_rate=0.5)
        # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
        #     logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        #     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        # else:
        #     pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # Sample when stochastic, otherwise take the distribution mode.
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _register_placeholder(self, *, name=None, dtype=None, shape=None, placeholder=None): if placeholder is None: placeholder = U.get_placeholder(name=name, dtype=tf.float32, shape=shape) elif name is None: name = placeholder.name if name in self._tf_placeholders: raise ValueError( "Placeholder with name {} already exists".format(name)) self._tf_placeholders[name] = placeholder return placeholder
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, summaries = False, should_act = True):
    """ELU-MLP policy with an optional tanh-MLP value head.

    Reuses an existing "observations:0" tensor from the default graph when
    present so several networks can share one input placeholder; otherwise a
    fresh placeholder is created.  `summaries` is currently unused.

    Bug fix: ``Graph.get_tensor_by_name`` raises KeyError for unknown names
    and never returns None, so the previous ``if ob is None`` fallback was
    unreachable and construction crashed on a fresh graph.  The lookup is
    now wrapped in try/except so the fallback actually runs.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    try:
        ob = tf.get_default_graph().get_tensor_by_name("observations:0")
    except KeyError:
        # No shared observation tensor yet — create the placeholder.
        ob = U.get_placeholder(name="observations", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    with tf.variable_scope('pol'):
        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))
            last_out = tf.nn.elu(last_out)
            #last_out = tf.nn.tanh(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            # Broadcast the shared logstd to the batch via a ones tensor.
            pdparam = tf.concat([mean, tf.ones(shape=mean.shape) * logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
    with tf.variable_scope("distribution"):
        self.pd = pdtype.pdfromflat(pdparam)
    if should_act:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope('vf'):
            #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = ob
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))
                last_out = tf.nn.tanh(last_out)
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
        # Non-recurrent policy: no hidden state tensors.
        self.state_in = []
        self.state_out = []
        with tf.variable_scope("distribution"):
            stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
            ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, embedding_space_size): assert isinstance(ob_space, gym.spaces.Box) # self.input = tf.placeholder(dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) self.input = U.get_placeholder(name="ob_f", dtype=tf.float32, shape=[None] + list(ob_space.shape)) self.embedding_space = embedding_space_size # x = self.input / 255.0 x = tf.nn.relu( conv2d(self.input, 32, "cnn1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(conv2d(x, 64, "cnn2", [4, 4], [2, 2], pad="VALID")) x = tf.nn.relu(conv2d(x, 64, "cnn3", [3, 3], [1, 1], pad="VALID")) x = flatten(x) self.output = tf.nn.relu( linear(x, self.embedding_space, 'linlast', normalized_columns_initializer(1.0)))
def _init(self, ob_space, ac_space, hid_size, num_hid_layers):
    """Reward network: a tanh MLP over normalized, clipped observations.

    Exposes self._rew mapping a batch of observations to scalar rewards.
    `ac_space` is accepted for signature compatibility but unused.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    input_shape = [None] + list(ob_space.shape)  # batch dim left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=input_shape)
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope('rew'):
        # Standardize observations, then clip to +/- 5 std.
        standardized = (ob - self.ob_rms.mean) / self.ob_rms.std
        hidden = tf.clip_by_value(standardized, -5.0, 5.0)
        for layer in range(1, num_hid_layers + 1):
            pre_activation = tf.layers.dense(
                hidden, hid_size, name="fc%i" % layer,
                kernel_initializer=U.normc_initializer(1.0))
            hidden = tf.nn.tanh(pre_activation)
        final = tf.layers.dense(
            hidden, 1, name='final', kernel_initializer=U.normc_initializer(1.0))
        self.reward = final[:, 0]  # squeeze to a [batch] vector
    self._rew = U.function([ob], [self.reward])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
    """Actor-critic that reduces an observation history with an RNN and then
    applies resnet blocks for the value ("vf") and policy ("pf") heads.
    """
    #assert isinstance(ob_space, gym.spaces.Box)
    print("Constructing policy for observation space",ob_space)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    #with tf.variable_scope("obfilter"):
    #    self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz = ob  # observation normalization disabled
    # Apply rnn_to reduce history
    with tf.variable_scope("vf"):
        state = self.rnn(obz, ob_space.shape[0], rnn_hid_units)
        for i in range(num_hid_layers):
            # NOTE(review): each iteration reads `state`, not the previous
            # `last_out`, so only the final resnet block feeds the head —
            # confirm whether the layers were meant to be chained.
            last_out = resnet(state, hid_size, "vf%i"%(i+1))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
    # Apply rnn_to reduce history
    with tf.variable_scope("pf"):
        state = self.rnn(obz, ob_space.shape[0], rnn_hid_units)
        for i in range(num_hid_layers):
            last_out = resnet(state, hid_size, "pf%i"%(i+1))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            # Only the fixed-variance Gaussian head is supported.
            raise
            # NOTE(review): unreachable after the bare raise above.
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    # Non-recurrent interface despite the internal RNN reduction.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_sizes, gaussian_fixed_var=True, use_obfilter=False):
    """Tanh-MLP actor-critic with per-layer widths and optional obs filter.

    `hid_sizes` lists the hidden widths used by both the value and policy
    MLPs.  Named graph nodes "pdparam", "stoch" and "pi" are exported for
    later lookup (e.g. when freezing/serving the graph).
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    if use_obfilter:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    else:
        obz = ob
    last_out = obz
    for i, hid_size in enumerate(hid_sizes):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(0.01)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(0.01))[:,0]
    last_out = obz
    for i, hid_size in enumerate(hid_sizes):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(0.01)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
        # mean = tf.clip_by_value(mean, ac_space.low, ac_space.high)
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        # mean * 0.0 + logstd broadcasts the shared logstd across the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    pdparam = tf.identity(pdparam, name="pdparam")  # named node for export
    self.pd = pdtype.pdfromflat(pdparam)
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stoch")
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    ac = tf.identity(ac, name="pi")  # named node for export
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, layers_val, layers_pol, gaussian_fixed_var=True, dist='gaussian', ):
    """MLP actor-critic supporting 'gaussian' or 'beta' action distributions.

    `layers_val` / `layers_pol` give the hidden widths for the value (relu)
    and policy (tanh) MLPs.  The act-function additionally returns the
    distribution parameters appropriate to `dist`.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.dist = dist
    self.pdtype = pdtype = make_pdtype(ac_space, dist=dist)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope('vf'):
        # Normalize observations and clip to +/- 5 standard deviations.
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i, size in enumerate(layers_val):
            last_out = tf.nn.relu(tf.layers.dense(last_out, size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]
    with tf.variable_scope('pol'):
        last_out = obz
        for i, size in enumerate(layers_pol):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd broadcasts the shared logstd over the batch.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    if dist == 'gaussian':
        self._act = U.function([stochastic, ob], [ac, self.vpred, self.pd.std, self.pd.mean, self.pd.logstd])
    elif dist == 'beta':
        self._act = U.function([stochastic, ob], [ac, self.vpred, self.pd.alpha, self.pd.beta, self.pd.alpha_beta])
def _init(self, ac_space, joint_training, emb_size=None, emb_network=None):
    """Policy/value head on top of an embedding network.

    With `joint_training` the head is wired directly to `emb_network`'s last
    layer; otherwise a placeholder of width `emb_size` receives precomputed
    embeddings.
    """
    self.pdtype = pdtype = make_pdtype(ac_space)
    self.emb_network = emb_network
    self.joint_training = joint_training
    size = 256  # hidden width of the shared trunk
    if self.joint_training:
        self.input, output = emb_network.get_input_and_last_layer()
        x = tf.nn.relu(linear(output, size, 'lin1', normalized_columns_initializer(1.0)))
    else:
        self.input = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, emb_size])
        x = tf.nn.relu(linear(self.input, size, 'lin1', normalized_columns_initializer(1.0)))
    # x = tf.nn.relu(linear(x, 32, 'lin2', normalized_columns_initializer(1.0)))
    logits = linear(x, pdtype.param_shape()[0], "logits", normalized_columns_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.ac = self.pd.sample()  # actions are always sampled
    # self.probs = tf.nn.softmax(logits, dim=-1)[0, :]
    # NOTE(review): value output is not squeezed with [:,0] as in sibling
    # policies — confirm downstream code expects shape [N, 1].
    self.vpred = linear(x, 1, "value", normalized_columns_initializer(1.0))
    self._act = U.function([self.input], [self.ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, exploration_rate, gaussian_fixed_var=True):
    """Tanh-MLP actor-critic whose initial policy log-std is `exploration_rate`.

    Also exports the first action-mean component as a named "output_node"
    tensor for graph freezing/serving.

    Bug fix: the "output_node" export previously referenced `mean`
    unconditionally, raising NameError whenever the non-Gaussian branch was
    taken; it is now built only inside the Gaussian branch that defines
    `mean`.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope('vf'):
        # Normalize observations and clip to +/- 5 standard deviations.
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
    with tf.variable_scope('pol'):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(0.01)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
            # Initial log-std set from exploration_rate (state-independent).
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.constant_initializer(exploration_rate))
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            # Export the first mean row as a named node — only defined here,
            # where `mean` exists (previously crashed in the else branch).
            my_var = tf.strided_slice(mean, [0], [1], [1], shrink_axis_mask=1)
            my_var_out = tf.identity(my_var, name='output_node')
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space):
    """Small-kernel CNN actor-critic for low-resolution inputs.

    Same layout as the pixel CNN policy but with 2x2/3x3 kernels at stride 1
    and no /255 input scaling.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # leading batch dimension left open
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    #obscaled = ob / 255.0
    obscaled = ob  # input used unscaled
    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [2, 2], [1, 1], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [2, 2], [1, 1], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        # NOTE(review): value output is not squeezed with [:,0] as in the
        # sibling MLP policies — confirm downstream code expects [N, 1].
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred
    # Non-recurrent policy: no hidden state tensors.
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # always samples; stochastic flag unused
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _build(self):
    """Build the meta-policy graph: a categorical selector over primitives.

    Observations (one placeholder per ob type) are normalized, clipped,
    concatenated with a one-hot of the previously selected primitive, and fed
    to separate value ("vf") and selector ("pol") MLPs.
    """
    num_primitives = self.num_primitives
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size
    self._obs = {}
    # One placeholder per observation type.
    for ob_name, ob_shape in self._ob_shape.items():
        self._obs[ob_name] = U.get_placeholder(
            name="ob_{}".format(ob_name),
            dtype=tf.float32,
            shape=[None] + self._ob_shape[ob_name])
    self._prev_primitive = prev_primitive = U.get_placeholder(
        name="prev_primitive", dtype=tf.int32, shape=[None])
    with tf.variable_scope(self.name):
        self._scope = tf.get_variable_scope().name
        self.ob_rms = {}
        # Running mean/std filter per observation type.
        for ob_name in self.ob_type:
            with tf.variable_scope("ob_rms_{}".format(ob_name)):
                self.ob_rms[ob_name] = RunningMeanStd(
                    shape=self._ob_shape[ob_name])
        obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) / self.ob_rms[ob_name].std
               for ob_name in self.ob_type]
        obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
        obz = tf.concat(obz, -1)
        # Condition on the previously executed primitive.
        prev_primitive_one_hot = tf.one_hot(prev_primitive, num_primitives, name="prev_primitive_one_hot")
        obz = tf.concat([obz, prev_primitive_one_hot], -1)
        # value function
        with tf.variable_scope("vf"):
            _ = obz
            for i in range(num_hid_layers):
                _ = self._activation(
                    tf.layers.dense(
                        _, hid_size, name="fc%d" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                _, 1, name="vpred",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]
        # meta policy
        with tf.variable_scope("pol"):
            _ = obz
            for i in range(num_hid_layers):
                _ = self._activation(
                    tf.layers.dense(
                        _, hid_size, name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.selector = tf.layers.dense(
                _, num_primitives, name="action",
                kernel_initializer=U.normc_initializer(0.01))
        self.pdtype = pdtype = CategoricalPdType(num_primitives)
        self.pd = pdtype.pdfromflat(self.selector)
        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        # Sample when stochastic, otherwise pick the mode (argmax).
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        self._act = U.function([stochastic, self._prev_primitive] + self.obs,
                               [ac, self.vpred])
def learn(env, policy_func, med_func, expert_dataset, pretrained, pretrained_weight, g_step, m_step, e_step, inner_iters, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, max_kl=0.01, max_timesteps=0, max_episodes=0, max_iters=0, batch_size=64, med_stepsize=1e-3, pi_stepsize=1e-3, callback=None, writer=None):
    """Adversarial imitation-learning loop with a generator policy and a
    "mediator" network, optimized alternately with MpiAdam over MPI workers.

    Exactly one of max_iters / max_timesteps / max_episodes must be positive
    (it decides the stopping criterion).  NOTE(review): several parameters
    (pretrained, g_step, m_step, e_step, inner_iters, log_dir, max_kl) are
    currently unused in the body.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    ob_space = env.observation_space
    ac_space = env.action_space
    # Generator ("pi"), frozen snapshot ("oldpi") and mediator networks.
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    med = med_func("mediator", ob_space, ac_space)
    pi_var_list = pi.get_trainable_variables()
    med_var_list = med.get_trainable_variables()
    # Placeholders: generator (g_*) and expert (e_*) observation/action batches.
    g_ob = U.get_placeholder(name="g_ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
    g_ac = U.get_placeholder(name='g_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape))
    e_ob = U.get_placeholder(name='e_ob', dtype=tf.float32, shape=[None] + list(ob_space.shape))
    e_ac = U.get_placeholder(name='e_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape))
    # Mediator maximizes likelihood of both generator and expert actions.
    med_loss = -tf.reduce_mean(med.g_pd.logp(g_ac) + med.e_pd.logp(e_ac)) * 0.5
    #pi_loss = -0.5 * (tf.reduce_mean(pi.pd.logp(ac) - med.pd.logp(ac)))
    # Generator loss: KL(pi || mediator) expressed as cross-entropy - entropy.
    g_pdf = tfd.MultivariateNormalDiag(loc=pi.pd.mean, scale_diag=pi.pd.std)
    m_pdf = tfd.MultivariateNormalDiag(loc=med.g_pd.mean, scale_diag=med.g_pd.std)
    pi_loss = tf.reduce_mean(g_pdf.cross_entropy(m_pdf) - g_pdf.entropy())  # tf.reduce_mean(pi.pd.kl(med.pd))
    kloldnew = oldpi.pd.kl(pi.pd)
    meankl = tf.reduce_mean(kloldnew)
    dist = meankl
    expert_loss = -tf.reduce_mean(pi.pd.logp(e_ac))
    # Copies current pi parameters into oldpi.
    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_med_loss = U.function([g_ob, g_ac, e_ob, e_ac], med_loss)
    compute_pi_loss = U.function([g_ob], pi_loss)
    compute_exp_loss = U.function([e_ob, e_ac], expert_loss)
    # compute_kl_loss = U.function([ob], dist)
    # compute_fvp = U.function([flat_tangent, ob, ac], fvp)
    compute_med_lossandgrad = U.function([g_ob, g_ac, e_ob, e_ac], [med_loss, U.flatgrad(med_loss, med_var_list)])
    compute_pi_lossandgrad = U.function([g_ob], [pi_loss, U.flatgrad(pi_loss, pi_var_list)])
    compute_exp_lossandgrad = U.function([e_ob, e_ac], [expert_loss, U.flatgrad(expert_loss, pi_var_list)])
    get_flat = U.GetFlat(pi_var_list)
    set_from_flat = U.SetFromFlat(pi_var_list)
    med_adam = MpiAdam(med_var_list)
    pi_adam = MpiAdam(pi_var_list)

    def allmean(x):
        # Average a gradient/array across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # th_init = get_flat()
    # MPI.COMM_WORLD.Bcast(th_init, root=0)
    # set_from_flat(th_init)
    med_adam.sync()
    pi_adam.sync()
    # if rank == 0:
    #     print("Init pi param sum %d, init med param sum %d." % (th_pi_init.sum(), th_med_init.sum()), flush=True)
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    true_rewbuffer = deque(maxlen=40)  # rolling buffer for true returns
    # Exactly one stopping criterion must be active.
    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1
    loss_stats = stats(["med_loss", "pi_loss"])
    ep_stats = stats(["True_rewards", "Episode_length"])
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi_var_list)
    med_losses = []
    pi_losses = []
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)
        logger.log("********** Iteration %i ************" % iters_so_far)
        # ======= Optimize Mediator=========
        seg = seg_gen.__next__()
        g_ob, g_ac = seg['ob'], seg['ac']
        #assign_old_eq_new()
        #stepsize = 3e-4
        # thbefore = get_flat()
        d = dataset.Dataset(dict(ob=g_ob, ac=g_ac))
        optim_batchsize = min(batch_size, len(g_ob))
        g_loss = []
        logger.log("Optimizing Generator...")
        # Single generator gradient step per iteration.
        for _ in range(1):
            g_batch = d.next_batch(optim_batchsize)
            g_batch_ob, g_batch_ac = g_batch['ob'], g_batch['ac']
            if hasattr(pi, "obs_rms"): pi.obs_rms.update(g_batch_ob)
            pi_loss, g = compute_pi_lossandgrad(g_batch_ob)
            # kl = compute_kl_loss(g_ob)
            # if kl > max_kl * 1.5:
            #     logger.log("violated KL constraint. Shrinking step.")
            #     # stepsize *= 0.1
            #     break
            # else:
            #     logger.log("Stepsize OK!")
            pi_adam.update(allmean(g), pi_stepsize)
            g_loss.append(pi_loss)
        pi_losses.append(np.mean(np.array(g_loss)))
        med_loss = []
        logger.log("Optimizing Mediator...")
        # Mediator sees minibatches of generator and expert data each step.
        for g_ob_batch, g_ac_batch in dataset.iterbatches((seg['ob'], seg['ac']), include_final_partial_batch=False, batch_size=batch_size):
            # g_batch = d.next_batch(optim_batchsize)
            # g_ob_batch, g_ac_batch = g_batch['ob'], g_batch['ac']
            e_ob_batch, e_ac_batch = expert_dataset.get_next_batch(optim_batchsize)
            if hasattr(med, "obs_rms"): med.obs_rms.update(np.concatenate((g_ob_batch, e_ob_batch), 0))
            newlosses, g = compute_med_lossandgrad(g_ob_batch, g_ac_batch, e_ob_batch, e_ac_batch)
            med_adam.update(allmean(g), med_stepsize)
            med_loss.append(newlosses)
        med_losses.append(np.mean(np.array(med_loss)))
        #logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses)))
        #logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses)))
        #logger.record_tabular("expert_loss_each_iter", np.mean(np.array(exp_losses)))
        logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses)))
        logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses)))
        # Gather episode stats from all MPI workers.
        lrlocal = (seg["ep_lens"], seg["ep_true_rets"])
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
        lens, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if writer is not None:
            loss_stats.add_all_summary(writer, [np.mean(np.array(med_losses)), np.mean(np.array(pi_losses))], episodes_so_far)
            ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer), np.mean(lenbuffer)], episodes_so_far)
        if rank == 0:
            logger.dump_tabular()
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=1,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0):
    """Run a PPO training loop with options (PPOC-style).

    Builds the clipped-surrogate PPO loss plus an option-over-policy loss on a
    TF1 graph, then alternates rollout collection (via
    ``traj_segment_generator``) with Adam/MPI optimization, bucketing the
    collected transitions per option.

    Parameters (beyond standard PPO hypers):
        num_options: number of options; per-option datasets are maintained.
        app: suffix appended to the game name used in file names.
        saves/wsaves: enable CSV result logging / weight checkpointing.
        epoch: if >= 0, restore weights from that checkpoint before training.
        seed: seed for numpy, TF and the env.
        dc: deliberation cost, forwarded to the rollout generator.

    Side effects: seeds global RNGs, creates directories and copies source
    files when ``wsaves`` is set, writes checkpoints and CSV logs, and mutates
    the module-global ``iters_so_far``.
    """
    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    ### Book-keeping: derive run/dir names from env id, seed and suffix.
    gamename = env.spec.id[:-3].lower()  # strip the "-v0"-style version suffix
    gamename += 'seed' + str(seed)
    gamename += app
    version_name = 'FINAL_NORM-ACT-LOWER-LR-len-400-wNoise-update1-ppo-ESCH-1-2-5-nI'
    dirname = '{}_{}_{}opts_saves/'.format(version_name, gamename, num_options)
    print(dirname)
    #input ("wait here after dirname")

    if wsaves:
        # Snapshot the source files used for this run into the save directory
        # for reproducibility.
        # NOTE(review): the source paths below are absolute, machine-specific
        # paths — this only works on the original author's machine.
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py']
        first = True
        for i in range(len(files)):
            src = os.path.join(
                '/home/nfunk/Code_MA/ppoc_off_tryout/baselines/baselines/ppo1/'
            ) + files[i]
            print(src)
            dest = dirname + "src_code/"
            if (first):
                os.makedirs(dest)
                first = False
            print(dest)
            shutil.copy2(src, dest)
        # brute force copy normal env file at end of copying process:
        src = os.path.join(
            '/home/nfunk/Code_MA/ppoc_off_tryout/nfunk/envs_nf/pendulum_nf.py')
        shutil.copy2(src, dest)
    ###

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    max_action = env.action_space.high
    # Widen the observation space so the observation can carry the previous
    # action as well.
    # NOTE(review): mutating ob_space.shape in place also affects any other
    # holder of this space object — confirm this is intended.
    ob_space.shape = ((ob_space.shape[0] + ac_space.shape[0]), )
    print(ob_space.shape)
    print(ac_space.shape)
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    # Scalar weight for the entropy of the policy-over-options term.
    pol_ov_op_ent = tf.placeholder(dtype=tf.float32, shape=None)

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    # KL / entropy diagnostics and the entropy bonus.
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    # Standard PPO clipped surrogate on the intra-option policy.
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    atarg_clip = atarg  # advantage clipping disabled (was tf.clip_by_value(atarg,-10,10))
    surr1 = ratio * atarg_clip  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg_clip
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    # Termination loss (unused in the main update path; see `termloss` below).
    term_loss = pi.tpred * term_adv
    # Regularizer pushing the option probabilities toward [0.05, 0.95]
    # (currently not added to total_loss — kept for experimentation).
    force_pi_loss = U.mean(
        tf.square(
            tf.clip_by_value(pi.op_pi, 1e-5, 1.0) -
            tf.constant([[0.05, 0.95]])))

    # Clipped surrogate for the policy-over-options, analogous to PPO above.
    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0))
    old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)
    # NOTE(review): option[0] indexes the transposed log-prob matrix, i.e. the
    # whole batch is assumed to belong to a single option per update call.
    ratio_pol_ov_op = tf.exp(
        tf.transpose(log_pi)[option[0]] -
        tf.transpose(old_log_pi)[option[0]])  # pnew / pold
    term_adv_clip = term_adv  # clipping disabled here as well
    surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip
    surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param,
                             1.0 + clip_param) * term_adv_clip
    pol_surr_pol_ov_op = -U.mean(
        tf.minimum(surr1_pol_ov_op, surr2_pol_ov_op))
    op_loss = pol_surr_pol_ov_op - pol_ov_op_ent * tf.reduce_sum(entropy)
    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    # NOTE(review): hard-coded slice — assumes the termination-network
    # variables sit at positions 6:8 of the trainable-variable list.
    term_list = var_list[6:8]

    lossandgrad = U.function(
        [ob, ac, atarg, ret, lrmult, option, term_adv, pol_ov_op_ent],
        losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    saver = tf.train.Saver(max_to_keep=10000)
    saver_best = tf.train.Saver(max_to_keep=1)

    ### More book-keeping: CSV result files.
    # NOTE(review): these file handles are opened here and never closed.
    results = []
    if saves:
        results = open(
            version_name + '_' + gamename + '_' + str(num_options) + 'opts_' +
            '_results.csv', 'w')
        results_best_model = open(
            dirname + version_name + '_' + gamename + '_' + str(num_options) +
            'opts_' + '_bestmodel.csv', 'w')
        out = 'epoch,avg_reward'
        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        results.flush()

    if epoch >= 0:
        # Resume from a previously saved checkpoint.
        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))
        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)
    ###

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    des_pol_op_ent = 0.1  # entropy weight for the policy over options (decayed below)
    max_val = -100000  # best mean reward seen so far (for best-model saving)
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]
    ) == 1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    # Per-option accumulated datasets; 0 marks "not yet created".
    datas = [0 for _ in range(num_options)]

    while True:
        if callback:
            callback(locals(), globals())
        # Exactly one of the four limits is active (asserted above).
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # Diagnostics: mean option durations and per-option action stddevs.
        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)
        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))
        print("mean opt dur:", opt_d)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        if hasattr(pi, "ob_rms_only"):
            # The trailing ac_space.shape[0] entries are the appended action
            # part of the widened observation — excluded from normalization.
            pi.ob_rms_only.update(ob[:, :-ac_space.shape[0]])

        assign_old_eq_new()  # set old parameter values to new parameter values

        # Decay the option-entropy weight every 1000 iterations.
        if (iters_so_far + 1) % 1000 == 0:
            des_pol_op_ent = des_pol_op_ent / 10

        if iters_so_far % 50 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        # adaptively save best run:
        if (np.mean(rewbuffer) > max_val) and wsaves:
            max_val = np.mean(rewbuffer)
            results_best_model.write('epoch: ' + str(iters_so_far) + 'rew: ' +
                                     str(np.mean(rewbuffer)) + '\n')
            results_best_model.flush()
            # NOTE(review): 'best.ckpt'.format(...) ignores its arguments —
            # the filename is always "best.ckpt".
            filename = dirname + 'best.ckpt'.format(gamename, iters_so_far)
            save_path = saver_best.save(U.get_session(), filename)

        min_batch = 160  # Arbitrary
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            ### This part is only necessary when we use options. We proceed to
            ### these verifications in order not to discard any collected
            ### trajectories: small per-option batches are accumulated across
            ### iterations until at least min_batch samples are available.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    # Stored data is already big enough; replace it with the
                    # fresh (small) batch and skip optimizing this option.
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue
                elif indices.size + datas[opt].n < min_batch:
                    # Still not enough data even combined — keep accumulating.
                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue
                elif (indices.size + datas[opt].n > min_batch
                      and datas[opt].n < min_batch) or (
                          indices.size > min_batch
                          and datas[opt].n < min_batch):
                    # Combined data crosses the threshold — merge and train.
                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)
                if (indices.size > min_batch and datas[opt].n > min_batch):
                    # Fresh batch alone is big enough — discard old data.
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)
                # NOTE(review): if none of the branches above ran, `d` keeps
                # its value from a previous option/iteration — verify this
                # case cannot occur with the thresholds used.
            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
            ###

            optim_batchsize = optim_batchsize or ob.shape[0]
            # NOTE(review): np.clip(x, 10, 10) always yields 10, so with
            # num_options > 1 this is effectively `optim_epochs = 10`. Also,
            # np.int is removed in NumPy >= 1.24 — use int() when upgrading.
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    tadv, nodc_adv = pi.get_opt_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"],
                                                    cur_lrmult, [opt], tadv,
                                                    des_pol_op_ent)
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        # Gather episode stats across MPI workers.
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Book keeping: one CSV row per iteration with per-option stats.
        if saves:
            out = "{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"
            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))
            results.write(out.format(*info))
            results.flush()
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        identifier,
        save_result=True,
        save_interval=100,
        reward_list=None,  # FIX: was a mutable default ([]) shared across calls
        cont=False,
        play=False,
        iter,  # NOTE(review): shadows the builtin; kept — it is a keyword-only parameter callers pass by name
        action_repeat=1):
    """Run PPO with an optional mirror-symmetry loss.

    If the env exposes ``mirror_id``, an extra symmetry loss
    ``4 * mean((ac - mirror_ac)^2)`` is added to the PPO objective and the
    rollouts additionally carry mirrored observations/actions.

    Parameters (beyond standard PPO hypers):
        identifier: run name used for checkpoint/reward files.
        save_result/save_interval: periodic checkpointing controls.
        reward_list: optional list to accumulate mean original rewards into;
            a fresh list is created when None (fixes the shared-mutable-default
            bug where successive calls appended to the same list).
        cont: restore weights from iteration ``iter`` instead of initializing.
        play: build the policy and return it immediately without training.
        iter: starting iteration (used when ``cont`` is set).
        action_repeat: forwarded to the rollout generator.

    Returns:
        The trained (or freshly built, when ``play``) policy ``pi``.
    """
    if reward_list is None:
        reward_list = []

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    # Mirror-symmetry support is opt-in via an env attribute.
    mirror = hasattr(env, 'mirror_id')
    mirror_id = env.mirror_id if mirror else None
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    if mirror:
        mirror_ob = U.get_placeholder(name="mirror_ob",
                                      dtype=tf.float32,
                                      shape=[None] + list(ob_space.shape))
        mirror_ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    # Symmetry loss: penalize disagreement between the taken action and the
    # mirrored action. The factor 4 is the original tuning constant.
    sym_loss = 4 * tf.reduce_mean(tf.square(ac - mirror_ac)) if mirror else 0
    total_loss = pol_surr + pol_entpen + vf_loss + sym_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
    if mirror:
        losses.append(sym_loss)
        loss_names.append("sym_loss")

    var_list = pi.get_trainable_variables()
    inputs = [ob, ac, atarg, ret, lrmult]
    if mirror:
        inputs += [mirror_ob, mirror_ac]
    lossandgrad = U.function(inputs,
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function(inputs, losses)

    if play:
        # Caller only wants the (untrained) policy graph.
        return pi
    if cont:
        # Resume: restore weights saved at iteration `iter`.
        load_state(identifier, iter)
    else:
        U.initialize()
        iter = 0
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     mirror_id=mirror_id,
                                     action_repeat=action_repeat)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = int(iter)
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_ori = deque(maxlen=100)  # rolling buffer for original (unshaped) rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]
    ) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        # Exactly one of the four limits is active (asserted above).
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        if mirror:
            mirror_ob, mirror_ac = seg["mirror_ob"], seg["mirror_ac"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d_dict = dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret)
        if mirror:
            d_dict["mirror_ob"] = mirror_ob
            d_dict["mirror_ac"] = mirror_ac
        d = Dataset(d_dict, shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                batches = [
                    batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                    cur_lrmult
                ]
                if mirror:
                    batches += [batch["mirror_ob"], batch["mirror_ac"]]
                *newlosses, g = lossandgrad(*batches)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

        # Re-evaluate the losses (without gradients) for logging.
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            batches = [
                batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                cur_lrmult
            ]
            if mirror:
                batches += [batch["mirror_ob"], batch["mirror_ac"]]
            newlosses = compute_losses(*batches)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)

        # Gather episode stats across MPI workers.
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_rets_ori"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, rews_ori = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        rewbuffer_ori.extend(rews_ori)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpRewOriMean", np.mean(rewbuffer_ori))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        reward_list.append(np.mean(rewbuffer_ori))
        if save_result and iters_so_far % save_interval == 0:
            save_state(identifier, iters_so_far)
            save_rewards(reward_list, identifier, iters_so_far)
            logger.log('Model and reward saved')

    return pi
def learn( args, env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) writer=None): print("\nBeginning learning...\n") # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.compat.v1.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.compat.v1.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = {} ob['adj'] = U.get_placeholder_cached(name="adj") ob['node'] = U.get_placeholder_cached(name="node") ob_gen = {} ob_gen['adj'] = U.get_placeholder( shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_gen') ob_gen['node'] = U.get_placeholder( shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_gen') ob_real = {} ob_real['adj'] = U.get_placeholder( shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_real') ob_real['node'] = U.get_placeholder( shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_real') ac = tf.compat.v1.placeholder(dtype=tf.int64, shape=[None, 4], name='ac_real') ## PPO loss kloldnew = oldpi.pd.kl(pi.pd) ent = 
pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent pi_logp = pi.pd.logp(ac) oldpi_logp = oldpi.pd.logp(ac) ratio_log = pi.pd.logp(ac) - oldpi.pd.logp(ac) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] ## Expert loss loss_expert = -tf.reduce_mean(pi_logp) ## Discriminator loss step_pred_real, step_logit_real = discriminator_net(ob_real, args, name='d_step') step_pred_gen, step_logit_gen = discriminator_net(ob_gen, args, name='d_step') loss_d_step_real = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_real, labels=tf.ones_like(step_logit_real) * 0.9)) loss_d_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) loss_d_step = loss_d_step_real + loss_d_step_gen if args.gan_type == 'normal': loss_g_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) elif args.gan_type == 'recommend': loss_g_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.ones_like(step_logit_gen) * 0.9)) elif args.gan_type == 'wgan': loss_d_step, _, _ = discriminator(ob_real, ob_gen, args, name='d_step') loss_d_step = loss_d_step * -1 loss_g_step_gen, _ = discriminator_net(ob_gen, args, name='d_step') final_pred_real, final_logit_real = discriminator_net(ob_real, args, name='d_final') final_pred_gen, final_logit_gen = discriminator_net(ob_gen, 
args, name='d_final') loss_d_final_real = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_real, labels=tf.ones_like(final_logit_real) * 0.9)) loss_d_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) loss_d_final = loss_d_final_real + loss_d_final_gen if args.gan_type == 'normal': loss_g_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) elif args.gan_type == 'recommend': loss_g_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.ones_like(final_logit_gen) * 0.9)) elif args.gan_type == 'wgan': loss_d_final, _, _ = discriminator(ob_real, ob_gen, args, name='d_final') loss_d_final = loss_d_final * -1 loss_g_final_gen, _ = discriminator_net(ob_gen, args, name='d_final') var_list_pi = pi.get_trainable_variables() var_list_pi_stop = [ var for var in var_list_pi if ('emb' in var.name) or ('gcn' in var.name) or ('stop' in var.name) ] var_list_d_step = [ var for var in tf.compat.v1.global_variables() if 'd_step' in var.name ] var_list_d_final = [ var for var in tf.compat.v1.global_variables() if 'd_final' in var.name ] ## debug debug = {} ## loss update function lossandgrad_ppo = U.function([ ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult ], losses + [U.flatgrad(total_loss, var_list_pi)]) lossandgrad_expert = U.function( [ob['adj'], ob['node'], ac, pi.ac_real], [loss_expert, U.flatgrad(loss_expert, var_list_pi)]) lossandgrad_expert_stop = U.function( [ob['adj'], ob['node'], ac, pi.ac_real], [loss_expert, U.flatgrad(loss_expert, var_list_pi_stop)]) lossandgrad_d_step = U.function( [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], [loss_d_step, U.flatgrad(loss_d_step, var_list_d_step)]) lossandgrad_d_final = U.function( [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], 
[loss_d_final, U.flatgrad(loss_d_final, var_list_d_final)]) loss_g_gen_step_func = U.function([ob_gen['adj'], ob_gen['node']], loss_g_step_gen) loss_g_gen_final_func = U.function([ob_gen['adj'], ob_gen['node']], loss_g_final_gen) adam_pi = MpiAdam(var_list_pi, epsilon=adam_epsilon) adam_pi_stop = MpiAdam(var_list_pi_stop, epsilon=adam_epsilon) adam_d_step = MpiAdam(var_list_d_step, epsilon=adam_epsilon) adam_d_final = MpiAdam(var_list_d_final, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult ], losses) # Prepare for rollouts # ---------------------------------------- episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths lenbuffer_valid = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_env = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_step = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final_stat = deque( maxlen=100) # rolling buffer for episode rewardsn seg_gen = traj_segment_generator(args, pi, env, timesteps_per_actorbatch, True, loss_g_gen_step_func, loss_g_gen_final_func) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" if args.load == 1: try: fname = './ckpt/' + args.name_full_load sess = tf.get_default_session() # sess.run(tf.compat.v1.global_variables_initializer()) saver = tf.train.Saver(var_list_pi) saver.restore(sess, fname) iters_so_far = int(fname.split('_')[-1]) + 1 print('model 
restored!', fname, 'iters_so_far:', iters_so_far) except: print(fname, 'ckpt not found, start with iters 0') U.initialize() adam_pi.sync() adam_pi_stop.sync() adam_d_step.sync() adam_d_final.sync() counter = 0 level = 0 ## start training while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError # logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob_adj, ob_node, ac, atarg, tdlamret = seg["ob_adj"], seg[ "ob_node"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob_adj=ob_adj, ob_node=ob_node, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob_adj.shape[0] # inner training loop, train policy for i_optim in range(optim_epochs): loss_expert = 0 loss_expert_stop = 0 g_expert = 0 g_expert_stop = 0 loss_d_step = 0 loss_d_final = 0 g_ppo = 0 g_d_step = 0 g_d_final = 0 pretrain_shift = 5 ## Expert if iters_so_far >= args.expert_start and iters_so_far <= args.expert_end + pretrain_shift: ## Expert train # # # learn how to stop ob_expert, ac_expert = env.get_expert(optim_batchsize) loss_expert, g_expert = lossandgrad_expert( ob_expert['adj'], ob_expert['node'], ac_expert, ac_expert) loss_expert = np.mean(loss_expert) ## PPO if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end: assign_old_eq_new( ) # set old parameter values to new parameter values batch = 
d.next_batch(optim_batchsize) # ppo if iters_so_far >= args.rl_start + pretrain_shift: # start generator after discriminator trained a well.. *newlosses, g_ppo = lossandgrad_ppo( batch["ob_adj"], batch["ob_node"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses_ppo = newlosses if args.has_d_step == 1 and i_optim >= optim_epochs // 2: # update step discriminator ob_expert, _ = env.get_expert( optim_batchsize, curriculum=args.curriculum, evel_total=args.curriculum_num, evel=level) loss_d_step, g_d_step = lossandgrad_d_step( ob_expert["adj"], ob_expert["node"], batch["ob_adj"], batch["ob_node"]) adam_d_step.update(g_d_step, optim_stepsize * cur_lrmult) loss_d_step = np.mean(loss_d_step) if args.has_d_final == 1 and i_optim >= optim_epochs // 4 * 3: # update final discriminator ob_expert, _ = env.get_expert( optim_batchsize, is_final=True, curriculum=args.curriculum, level_total=args.curriculum_num, level=level) seg_final_adj, seg_final_node = traj_final_generator( pi, copy.deepcopy(env), optim_batchsize, True) # update final discriminator loss_d_final, g_d_final = lossandgrad_d_final( ob_expert["adj"], ob_expert["node"], seg_final_adj, seg_final_node) adam_d_final.update(g_d_final, optim_stepsize * cur_lrmult) # update generator adam_pi.update(0.2 * g_ppo + 0.05 * g_expert, optim_stepsize * cur_lrmult) # WGAN # if args.has_d_step == 1: # clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_step] # if args.has_d_final == 1: # clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_final] # ## PPO val # if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end: # logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob_adj"], batch["ob_node"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) # 
logger.log(fmt_row(13, meanlosses)) if writer is not None: writer.add_scalar("loss_expert", loss_expert, iters_so_far) writer.add_scalar("loss_expert_stop", loss_expert_stop, iters_so_far) writer.add_scalar("loss_d_step", loss_d_step, iters_so_far) writer.add_scalar("loss_d_final", loss_d_final, iters_so_far) writer.add_scalar('grad_expert_min', np.amin(g_expert), iters_so_far) writer.add_scalar('grad_expert_max', np.amax(g_expert), iters_so_far) writer.add_scalar('grad_expert_norm', np.linalg.norm(g_expert), iters_so_far) writer.add_scalar('grad_expert_stop_min', np.amin(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_max', np.amax(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_norm', np.linalg.norm(g_expert_stop), iters_so_far) writer.add_scalar('grad_rl_min', np.amin(g_ppo), iters_so_far) writer.add_scalar('grad_rl_max', np.amax(g_ppo), iters_so_far) writer.add_scalar('grad_rl_norm', np.linalg.norm(g_ppo), iters_so_far) writer.add_scalar('g_d_step_min', np.amin(g_d_step), iters_so_far) writer.add_scalar('g_d_step_max', np.amax(g_d_step), iters_so_far) writer.add_scalar('g_d_step_norm', np.linalg.norm(g_d_step), iters_so_far) writer.add_scalar('g_d_final_min', np.amin(g_d_final), iters_so_far) writer.add_scalar('g_d_final_max', np.amax(g_d_final), iters_so_far) writer.add_scalar('g_d_final_norm', np.linalg.norm(g_d_final), iters_so_far) writer.add_scalar('learning_rate', optim_stepsize * cur_lrmult, iters_so_far) for (lossval, name) in zipsame(meanlosses, loss_names): # logger.record_tabular("loss_"+name, lossval) if writer is not None: writer.add_scalar("loss_" + name, lossval, iters_so_far) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) if writer is not None: writer.add_scalar("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), iters_so_far) lrlocal = (seg["ep_lens"], seg["ep_lens_valid"], seg["ep_rets"], seg["ep_rets_env"], seg["ep_rets_d_step"], seg["ep_rets_d_final"], 
seg["ep_final_rew"], seg["ep_final_rew_stat"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, lens_valid, rews, rews_env, rews_d_step, rews_d_final, rews_final, rews_final_stat = map( flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) lenbuffer_valid.extend(lens_valid) rewbuffer.extend(rews) rewbuffer_d_step.extend(rews_d_step) rewbuffer_d_final.extend(rews_d_final) rewbuffer_env.extend(rews_env) rewbuffer_final.extend(rews_final) rewbuffer_final_stat.extend(rews_final_stat) # logger.record_tabular("EpLenMean", np.mean(lenbuffer)) # logger.record_tabular("EpRewMean", np.mean(rewbuffer)) # logger.record_tabular("EpThisIter", len(lens)) if writer is not None: writer.add_scalar("EpLenMean", np.mean(lenbuffer), iters_so_far) writer.add_scalar("EpLenValidMean", np.mean(lenbuffer_valid), iters_so_far) writer.add_scalar("EpRewMean", np.mean(rewbuffer), iters_so_far) writer.add_scalar("EpRewDStepMean", np.mean(rewbuffer_d_step), iters_so_far) writer.add_scalar("EpRewDFinalMean", np.mean(rewbuffer_d_final), iters_so_far) writer.add_scalar("EpRewEnvMean", np.mean(rewbuffer_env), iters_so_far) writer.add_scalar("EpRewFinalMean", np.mean(rewbuffer_final), iters_so_far) writer.add_scalar("EpRewFinalStatMean", np.mean(rewbuffer_final_stat), iters_so_far) writer.add_scalar("EpThisIter", len(lens), iters_so_far) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # logger.record_tabular("EpisodesSoFar", episodes_so_far) # logger.record_tabular("TimestepsSoFar", timesteps_so_far) # logger.record_tabular("TimeElapsed", time.time() - tstart) if writer is not None: writer.add_scalar("EpisodesSoFar", episodes_so_far, iters_so_far) writer.add_scalar("TimestepsSoFar", timesteps_so_far, iters_so_far) writer.add_scalar("TimeElapsed", time.time() - tstart, iters_so_far) if MPI.COMM_WORLD.Get_rank() == 0: with open('molecule_gen/' + args.name_full + '.csv', 'a') as f: f.write('***** Iteration {} *****\n'.format(iters_so_far)) # save if 
iters_so_far % args.save_every == 0: fname = './ckpt/' + args.name_full + '_' + str(iters_so_far) saver = tf.compat.v1.train.Saver(var_list_pi) saver.save(tf.compat.v1.get_default_session(), fname) print('model saved!', fname) # fname = os.path.join(ckpt_dir, task_name) # os.makedirs(os.path.dirname(fname), exist_ok=True) # saver = tf.train.Saver() # saver.save(tf.get_default_session(), fname) # if iters_so_far==args.load_step: iters_so_far += 1 counter += 1 if counter % args.curriculum_step and counter // args.curriculum_step < args.curriculum_num: level += 1
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build an MLP actor-critic graph with a clipped, running-mean-normalized
    observation input.

    The value head ('vf') is an MLP of `num_hid_layers` tanh layers; the policy
    head ('pol') is linear in the normalized observation (its hidden layers are
    commented out below). For Box action spaces with `gaussian_fixed_var=True`
    the policy is a Gaussian whose log-std is a CONSTANT 0.05 (not trainable).

    Args:
        ob_space: observation space; must be a gym.spaces.Box.
        ac_space: action space (any type supported by make_pdtype).
        hid_size: width of each hidden layer of the value network.
        num_hid_layers: number of hidden layers of the value network.
        gaussian_fixed_var: if True and ac_space is a Box, use a fixed-std
            Gaussian policy; otherwise emit (clipped) raw pd parameters.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # variable batch dimension
    feature_funcs = []  # NOTE(review): only used by the commented-out feature experiments below
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    self.std = tf.constant(1.0)
    # Running mean/std of observations, updated elsewhere by the trainer.
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope('vf'):
        # Normalize and clip observations to stabilize training.
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        import numpy as np
        # Commented-out basis-function feature experiments (kept for reference):
        # for i in range(0, ob_space.shape[0]):
        #     # Polinomial
        #     # feature_funcs.append(lambda s, i=i: tf.pow(s, i))
        #     # Fourier
        #     # feature_funcs.append(lambda s, i=i: tf.cos(i*np.pi*s))
        #     # RBF
        #     feature_funcs.append(lambda s, i=i: tf.exp(-tf.pow(s - self.ob_rms.mean, 2)/(2*self.ob_rms.std**2)))
        # obz = tf.concat([func(ob) for func in feature_funcs], axis = 1)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(
                    last_out,
                    hid_size,
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
        # Scalar state value per batch element.
        self.vpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=U.normc_initializer(0.1))[:, 0]
    with tf.variable_scope('pol'):
        last_out = obz
        # Policy hidden layers deliberately disabled -> linear policy:
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name = 'fc%i' % (i + 1), kernel_initializer = U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0] // 2,
                name='final',
                kernel_initializer=U.normc_initializer(0.01))
            # Fixed (non-trainable) log-std of 0.05 for every action dim.
            logstd = tf.multiply(
                tf.ones(shape=[1, pdtype.param_shape()[0] // 2]),
                tf.constant(0.05))
            # `mean * 0.0 + logstd` broadcasts logstd to the batch shape.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                name='final',
                kernel_initializer=U.normc_initializer(0.01))
            # Clip raw pd parameters to keep the distribution numerically sane.
            pdparam = tf.clip_by_value(pdparam, -10.0, 10.0)
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []   # no recurrent state
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # stochastic=True -> sample an action; False -> use the distribution mode.
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_layers=[],
          deterministic=True, diagonal=True,
          trainable_std=True, use_bias=True, use_critic=False,
          seed=None, verbose=True,
          hidden_W_init=U.normc_initializer(1.0),
          higher_mean_init=None,
          higher_logstd_init=tf.constant_initializer(np.log(0.11)),
          const_std_init=False):
    """Build a PGPE-style hierarchical policy: a deterministic actor network
    plus a Gaussian "higher-order" policy over the actor's flattened weights.

    Params:
        ob_space: task observation space (must be a gym.spaces.Box)
        ac_space: task action space (1-D shape required)
        hid_layers: list with width of each hidden layer of the actor
            (NOTE(review): mutable default argument `[]` — harmless here since
            it is only read, but a `None` default would be safer)
        deterministic: whether the actor is deterministic (only True supported)
        diagonal: whether the higher-order policy has a diagonal covariance
            matrix (otherwise a Cholesky-parameterized full covariance)
        trainable_std: whether the higher-policy std is a trainable variable
        use_bias: whether to include bias in actor neurons
        use_critic: whether to include a critic network (unused in this body)
        seed: optional random seed
        verbose: stored on self for callers
        hidden_W_init: initializer for actor weights
        higher_mean_init: optional explicit initializer for the higher mean;
            defaults to N(0, 0.01) on nonzero actor weights, zero elsewhere
        higher_logstd_init: initializer for the higher log-std (log 0.11)
        const_std_init: if True, higher_logstd is created without an explicit
            shape (taken from the initializer)
    """
    # Check environment's shapes
    assert isinstance(ob_space, gym.spaces.Box)
    assert len(ac_space.shape) == 1
    # Set seed
    if seed is not None:
        set_global_seeds(seed)
    # Set some attributes
    self.diagonal = diagonal
    self.use_bias = use_bias
    batch_length = None  # Accepts a sequence of eps of arbitrary length
    self.ac_dim = ac_space.shape[0]
    self.ob_dim = ob_space.shape[0]
    self.linear = not hid_layers  # no hidden layers -> linear actor
    self.verbose = verbose
    self._ob = ob = U.get_placeholder(
        name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
    # Actor (N.B.: weight initialization is irrelevant — weights are
    # overwritten by samples from the higher-order policy)
    with tf.variable_scope('actor'):
        last_out = ob
        for i, hid_size in enumerate(hid_layers):
            # Mlp feature extraction
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size,
                                name='fc%i' % (i+1),
                                kernel_initializer=hidden_W_init,
                                use_bias=use_bias))
        if deterministic and isinstance(ac_space, gym.spaces.Box):
            # Deterministic action selection: the actor output IS the action.
            self.actor_mean = actor_mean = \
                tf.layers.dense(last_out, ac_space.shape[0],
                                name='action',
                                kernel_initializer=hidden_W_init,
                                use_bias=use_bias)
        else:
            raise NotImplementedError
    # Get actor flattened weights
    with tf.variable_scope('actor') as scope:
        self.actor_weights = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        # flatten weights into a single 1-D tensor
        self.flat_actor_weights = tf.concat(
            [tf.reshape(w, [-1]) for w in self.actor_weights], axis=0)
        self._n_actor_weights = n_actor_weights = \
            self.flat_actor_weights.shape[0]
    # Higher order policy (Gaussian over the actor's weight vector)
    with tf.variable_scope('higher'):
        if higher_mean_init is None:
            # Initial means sampled from a normal distribution N(0, 0.01)
            # on positions where the actor weight is nonzero; bias init
            # always zero.
            higher_mean_init = tf.where(
                tf.not_equal(self.flat_actor_weights,
                             tf.constant(0, dtype=tf.float32)),
                tf.random_normal(shape=[n_actor_weights.value], stddev=0.01),
                tf.zeros(shape=[n_actor_weights]))
        self.higher_mean = tf.get_variable(
            name='higher_mean',
            initializer=higher_mean_init,
            shape=self.flat_actor_weights.get_shape())
        # Keep the weights' domain compact (disabled):
        # self.higher_mean = higher_mean = tf.clip_by_value(
        #     self.higher_mean, -1, 1, 'higher_mean_clipped')
        higher_mean = self.higher_mean
        if diagonal:
            if const_std_init:
                self.higher_logstd = higher_logstd = \
                    tf.get_variable(
                        name='higher_logstd',
                        initializer=higher_logstd_init,
                        trainable=trainable_std)
            else:
                self.higher_logstd = higher_logstd = \
                    tf.get_variable(
                        name='higher_logstd',
                        shape=[n_actor_weights],
                        initializer=higher_logstd_init,
                        trainable=trainable_std)
            # Flat pd parameters: [mean, broadcast logstd].
            pdparam = tf.concat([higher_mean,
                                 higher_mean * 0. + higher_logstd], axis=0)
            self.pdtype = pdtype = \
                DiagGaussianPdType(n_actor_weights.value)
        else:
            # Cholesky covariance matrix (lower-triangular, packed)
            self.higher_logstd = higher_logstd = tf.get_variable(
                name='higher_logstd',
                shape=[n_actor_weights*(n_actor_weights + 1)//2],
                initializer=tf.initializers.constant(0.))
            pdparam = tf.concat([higher_mean, higher_logstd], axis=0)
            self.pdtype = pdtype = CholeskyGaussianPdType(
                n_actor_weights.value)
    # Sample actor weights
    self.pd = pdtype.pdfromflat(pdparam)
    sampled_actor_params = self.pd.sample()
    symm_sampled_actor_params = self.pd.sample_symmetric()
    self._sample_actor_params = U.function([], [sampled_actor_params])
    self._sample_symm_actor_params = U.function(
        [], list(symm_sampled_actor_params))
    # Assign sampled weights into the actor network
    with tf.variable_scope('actor') as scope:
        actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=scope.name)
        self._use_sampled_actor_params = \
            U.assignFromFlat(actor_params, sampled_actor_params)
        self._get_actor_params = U.GetFlat(actor_params)
        self._set_actor_params = U.SetFromFlat(actor_params)
    # Act: the deterministic actor output is the action
    self._action = action = actor_mean
    self._act = U.function([ob], [action])
    # Manage higher policy weights
    with tf.variable_scope('higher') as scope:
        self._higher_params = higher_params = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        self.flat_higher_params = tf.concat(
            [tf.reshape(w, [-1]) for w in self._higher_params], axis=0)
        self._n_higher_params = self.flat_higher_params.shape[0]
        self._get_flat_higher_params = U.GetFlat(higher_params)
        self._set_higher_params = U.SetFromFlat(self._higher_params)
    # Evaluating: placeholders for batches of sampled actor params + returns
    self._actor_params_in = actor_params_in = \
        U.get_placeholder(name='actor_params_in',
                          dtype=tf.float32,
                          shape=[batch_length] + [n_actor_weights])
    self._rets_in = rets_in = \
        U.get_placeholder(name='returns_in',
                          dtype=tf.float32,
                          shape=[batch_length])
    ret_mean, ret_std = tf.nn.moments(rets_in, axes=[0])
    self._get_ret_mean = U.function([self._rets_in], [ret_mean])
    self._get_ret_std = U.function([self._rets_in], [ret_std])
    # Log-probabilities of given actor-weight samples under the higher policy
    self._logprobs = logprobs = self.pd.logp(actor_params_in)
    # PGPE gradient estimate (times batch size)
    pgpe_times_n = U.flatgrad(logprobs*rets_in, higher_params)
    self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                        [pgpe_times_n])
    self._get_actor_mean = U.function([ob], [self.actor_mean])
    self._get_higher_mean = U.function([ob], [self.higher_mean])
    self._get_higher_std = U.function([], tf.exp([self.higher_logstd]))
    # Batch off-policy PGPE bookkeeping
    self._probs = tf.exp(logprobs)
    self._behavioral = None
    self._renyi_other = None
    # Renyi computation: determinant of the (diagonal) covariance
    self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))
    # Fisher computation (diagonal case): 1/sigma^2 for means, constant 2
    # for the (log-)std block when it is trainable.
    mean_fisher_diag = tf.exp(-2*self.higher_logstd)
    if trainable_std:
        cov_fisher_diag = mean_fisher_diag*0 + 2
        self._fisher_diag = tf.concat(
            [mean_fisher_diag, cov_fisher_diag], axis=0)
    else:
        self._fisher_diag = mean_fisher_diag
    self._get_fisher_diag = U.function([], [self._fisher_diag])
def _build(self):
    """Construct value/policy graphs for every context, sharing weights.

    One (vf, pol) pair of sub-graphs is created per context (reuse=AUTO_REUSE,
    so all contexts share the same variables); each context gets its own
    observation placeholders. Only the graph for `self._id` is exposed as
    self.ob/self.pd/self.vpred and wired into the act/value functions.

    Reads (set elsewhere on self): _ac_space, _num_hid_layers, _hid_size,
    _gaussian_fixed_var, _config, _ob_shape, ob_type, ob_rms, _activation, _id.
    """
    ac_space = self._ac_space
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size
    gaussian_fixed_var = self._gaussian_fixed_var
    # Normalize hid_size to a per-layer list, padding with the last width.
    if not isinstance(hid_size, list):
        hid_size = [hid_size]
    if len(hid_size) != num_hid_layers:
        hid_size += [hid_size[-1]] * (num_hid_layers - len(hid_size))
    self.obs = []
    self.pds = []
    for j in range(self._config.num_contexts):
        # obs: one placeholder per observation component, per context
        _ob = {}
        for ob_name, ob_shape in self._ob_shape.items():
            _ob[ob_name] = U.get_placeholder(
                name="ob_{}/from_{}".format(ob_name, j),
                dtype=tf.float32,
                shape=[None] + self._ob_shape[ob_name])
        # obs normalization (learned running stats) or raw pass-through
        if self._config.obs_norm == 'learn':
            obz = [(_ob[ob_name] - self.ob_rms[ob_name].mean) /
                   self.ob_rms[ob_name].std for ob_name in self.ob_type]
        else:
            obz = [_ob[ob_name] for ob_name in self.ob_type]
        obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
        obz = tf.concat(obz, -1)
        # value function (variables shared across contexts via AUTO_REUSE)
        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(
                        last_out,
                        hid_size[i],
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            vpred = tf.layers.dense(
                last_out, 1, name="final",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]
            if j == self._id:
                self.vpred = vpred
        # policy
        pdtype = make_pdtype(ac_space)
        if j == self._id:
            self.pdtype = pdtype
        with tf.variable_scope('pol', reuse=tf.AUTO_REUSE):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(
                        last_out,
                        hid_size[i],
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                # Gaussian with state-independent, trainable log-std.
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name="final",
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` broadcasts logstd to batch shape.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name="final",
                    kernel_initializer=U.normc_initializer(0.01))
        self.obs.append([_ob[ob_name] for ob_name in self.ob_type])
        self.pds.append(pdtype.pdfromflat(pdparam))
    # Expose only this agent's own context graph.
    self.ob = self.obs[self._id]
    self.pd = self.pds[self._id]
    # sample action: stochastic -> sample, otherwise distribution mode
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic] + self.ob, [ac, self.vpred])
    self._value = U.function([stochastic] + self.ob, self.vpred)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
          gaussian_fixed_var=True, num_options=2, dc=0, w_intfc=True):
    """Build an option-critic policy with interest functions.

    Creates, over a clipped running-mean-normalized observation:
      * a per-option value head (dense3D2 selects the option's weights),
      * a per-option termination head (sigmoid probability, sampled),
      * a per-option Gaussian (or flat) action policy,
      * an interest function head (`intfc`) and an option-over-policy
        softmax head (`op_pi`).

    Args:
        ob_space: observation space; must be a gym.spaces.Box.
        ac_space: action space.
        hid_size / num_hid_layers: MLP architecture for every head.
        gaussian_fixed_var: Box actions use a Gaussian with per-option
            trainable log-std when True.
        num_options: number of options.
        dc: stored on self (deliberation cost — presumably; used elsewhere).
        w_intfc: stored on self; whether interest functions are used (read
            by callers, not in this body).
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.w_intfc = w_intfc
    self.state_in = []   # no recurrent state
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    # Currently active option index; only option[0] selects the logstd row.
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    # Value head: per-option output via dense3D2.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]
    # Termination head: gradient stopped on features, sigmoid probability.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "termfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Bernoulli sample: terminate when tpred exceeds a uniform draw.
    termination_sample = tf.greater(
        self.tpred,
        tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))
    # Intra-option policy head.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                        option, num_options=num_options,
                        weight_init=U.normc_initializer(0.01))
        # One trainable logstd row per option; option[0] picks the row.
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Earlier variants of the option-over-policy head (kept for reference):
    # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OP", weight_init=U.normc_initializer(1.0)))
    # pdb.set_trace()
    # self.op_pi = tf.constant(1./num_options)
    # Interest function head: per-option sigmoid interest in [0, 1].
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "intfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.intfc = tf.sigmoid(
        U.dense(last_out, num_options, "intfcfinal",
                weight_init=U.normc_initializer(1.0)))
    # Policy-over-options head: softmax over options.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "OP%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.op_pi = tf.nn.softmax(
        U.dense(last_out, num_options, "OPfinal",
                weight_init=U.normc_initializer(1.0)))
    # Callable graph entry points.
    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build an MLP actor-critic over the RAW observation.

    A value head ('vf') and a policy head ('pol') share the same
    architecture: `num_hid_layers` tanh layers of width `hid_size`.
    Observation normalization stats are tracked but deliberately NOT
    applied (the clip/normalize expression is commented out).

    Args:
        ob_space: observation space; must be a gym.spaces.Box.
        ac_space: action space (any type supported by make_pdtype).
        hid_size: hidden layer width.
        num_hid_layers: number of hidden layers per head.
        gaussian_fixed_var: for Box action spaces, use a Gaussian with a
            state-independent trainable log-std; otherwise emit raw pd
            parameters of the full size.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None  # variable batch dimension

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    # Running stats exist for compatibility but are unused below.
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    def hidden_stack(net):
        """Apply the shared tanh MLP trunk to `net` within the current scope."""
        for layer_idx in range(num_hid_layers):
            net = tf.nn.tanh(
                tf.layers.dense(
                    net, hid_size, name="fc%i" % (layer_idx + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
        return net

    with tf.variable_scope('vf'):
        obz = ob  # tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        self.vpred = tf.layers.dense(
            hidden_stack(obz), 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        trunk = hidden_stack(obz)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # Gaussian policy: learned mean, state-independent learned logstd.
            mean = tf.layers.dense(
                trunk, pdtype.param_shape()[0] // 2, name='final',
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # Broadcast logstd across the batch via `mean * 0.0 + logstd`.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                trunk, pdtype.param_shape()[0], name='final',
                kernel_initializer=U.normc_initializer(0.01))

    # For Box action spaces this yields a DiagGaussianPd.
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []   # no recurrent state
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # stochastic=True -> sample from the distribution; False -> its mode.
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build an actor-critic with an extra Q head.

    Three MLP heads over clipped, running-mean-normalized observations:
      * 'qf': Q(next_ob, act) — the action is concatenated in BEFORE the
        last hidden layer;
      * 'vf': V(ob);
      * 'pol': Gaussian policy whose logstd is scaled by a FIXED random
        vector drawn once at graph-construction time (np.random.randn) —
        NOTE(review): this makes the effective per-dimension std depend on
        a construction-time draw; presumably an intentional experiment,
        verify before reuse.

    Args:
        ob_space: observation space; must be a gym.spaces.Box.
        ac_space: action space.
        hid_size / num_hid_layers: MLP architecture for every head.
        gaussian_fixed_var: use the Gaussian parameterization for Box
            action spaces when True.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    next_ob = U.get_placeholder(name="next_ob", dtype=tf.float32,
                                shape=[sequence_length] + list(ob_space.shape))
    act = U.get_placeholder(name="act", dtype=tf.float32,
                            shape=[sequence_length] + list(ac_space.shape))
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope('qf'):
        # Q head consumes the NEXT observation (normalized with the same
        # running stats as ob).
        obz = tf.clip_by_value(
            (next_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            if i == num_hid_layers - 1:
                # Late fusion: inject the action before the final hidden layer.
                last_out = tf.concat([last_out, act], axis=-1)
            last_out = tf.nn.tanh(
                tf.layers.dense(
                    last_out,
                    hid_size,
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
        self.qpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]
    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(
                    last_out,
                    hid_size,
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]
    with tf.variable_scope('pol'):
        # out_std = tf.exp(0.5*logstd + 0.0)
        # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        # NOTE: `obz` here is the vf-normalized `ob` from the block above.
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(
                    last_out,
                    hid_size,
                    name='fc%i' % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0] // 2,
                name='final',
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            import numpy as np
            # Fixed random per-dimension scaling of logstd, drawn ONCE when
            # the graph is built (constant thereafter).
            pdparam = tf.concat([
                mean,
                mean * 0.0 +
                np.random.randn(pdtype.param_shape()[0] // 2) * logstd
            ], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                name='final',
                kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []   # no recurrent state
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # stochastic=True -> sample; False -> distribution mode.
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
          gaussian_fixed_var=True, num_options=2, dc=0):
    """Build a two-option switching policy.

    The observation is assumed to contain the previous action u[k-1] in its
    LAST `ac_space_dim` entries. Option 0 replays that previous action
    (gradient-stopped); option 1 runs a learned Gaussian policy over the
    "pure" observation (previous action stripped off). Separate value and
    option-probability MLPs are built for each option.

    Args:
        ob_space / ac_space: gym Box spaces.
        hid_size / num_hid_layers: MLP architecture for every head.
        gaussian_fixed_var: Box actions use a tanh-mean Gaussian with
            per-option trainable logstd when True.
        num_options: number of options (heads are hard-wired for 2).
        dc: stored on self (presumably a deliberation cost; used elsewhere).
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])
    # create a filter for the pure shape, meaning excluding u[k-1]
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)
    # Normalized+clipped full observation, and pure observation (u[k-1] removed).
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                           -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
        self.ob_rms_only.std, -5.0, 5.0)
    # Per-option value networks.
    last_out0 = obz       # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(
            U.dense(last_out0, hid_size, "vffc0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(
            U.dense(last_out1, hid_size, "vffc1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0",
                        weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1",
                        weight_init=U.normc_initializer(1.0))
    # self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]
    # last_out0 = tf.Print(last_out0,[tf.size(last_out0[:,0])])
    # Select the active option's value prediction.
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]
    # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
    # Per-option logits for the policy-over-options softmax.
    last_out0 = obz       # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(
            U.dense(last_out0, hid_size, "oppi0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(
            U.dense(last_out1, hid_size, "oppi1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0",
                        weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1",
                        weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    self.op_pi = tf.nn.softmax(last_out)
    # Termination head (probability only; sampling is disabled below).
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
    # Terminate every step (options are re-selected each step).
    termination_sample = tf.constant([True])
    # define the angle (disabled experiment):
    # ctrl_in = tf.reshape([(tf.math.atan2(ob[:,1],ob[:,0])),(ob[:,2])], [-1,2])
    # last_out = ctrl_in
    # Option-1 action policy over the pure observation.
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                        option, num_options=num_options,
                        weight_init=U.normc_initializer(0.01), bias=False)
        mean = tf.nn.tanh(mean)  # squash the mean into [-1, 1]
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                axis=1)
    else:
        # NOTE(review): in this branch `logstd` is never defined, but the
        # `_act` function below references it — would raise NameError.
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []   # no recurrent state
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # ac = tf.Print (ac, [ac,option,ob], "action and option before selecting: ")
    # Option 0 replays the previous action stored in the observation tail.
    ac = U.switch(option[0], ac,
                  tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)
    # ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
    # ac = tf.Print (ac, [ac], "action after selection: ")
    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, kind, atom_type_num, args):
    """Build a GCPN-style graph policy for molecule generation.

    Node embeddings are produced by a stack of GCN layers over the adjacency
    tensor; four categorical sub-actions are predicted per step:
    (first node, second node, edge type, stop). The joint distribution used
    for training (`self.pd`) mixes the sampled first-node logits with the
    GROUND-TRUTH-conditioned second/edge logits (teacher forcing via
    `ac_real`).

    Args:
        ob_space: dict with 'adj' (edge-type adjacency) and 'node' features.
        ac_space: action space (unused directly in this body).
        kind: unused in this body.
        atom_type_num: number of appended atom-type scaffold nodes; excluded
            when selecting the first node.
        args: experiment flags (bn, has_concat, has_residual, layer_num_g,
            emb_size, gcn_aggregate, mask_null, graph_emb, stop_shift).
    """
    self.pdtype = MultiCatCategoricalPdType
    ### 0 Get input
    ob = {
        'adj': U.get_placeholder(
            name="adj", dtype=tf.float32,
            shape=[None, ob_space['adj'].shape[0], None, None]),
        'node': U.get_placeholder(
            name="node", dtype=tf.float32,
            shape=[None, 1, None, ob_space['node'].shape[2]])
    }
    # only when evaluating given action, at training time
    self.ac_real = U.get_placeholder(
        name='ac_real', dtype=tf.int64, shape=[None, 4])  # feed ground-truth action
    ob_node = tf.compat.v1.layers.dense(
        ob['node'], 8, activation=None, use_bias=False,
        name='emb')  # embedding layer
    if args.bn == 1:
        ob_node = tf.compat.v1.layers.batch_normalization(ob_node, axis=-1)
    # First GCN layer (optionally concatenated with the raw embedding).
    if args.has_concat == 1:
        emb_node = tf.concat(
            (GCN_batch(ob['adj'], ob_node, args.emb_size, name='gcn1',
                       aggregate=args.gcn_aggregate), ob_node), axis=-1)
    else:
        emb_node = GCN_batch(ob['adj'], ob_node, args.emb_size, name='gcn1',
                             aggregate=args.gcn_aggregate)
    if args.bn == 1:
        emb_node = tf.compat.v1.layers.batch_normalization(emb_node, axis=-1)
    # Middle GCN layers.
    # NOTE(review): `self.emb_node1` is referenced in the residual/concat
    # branches but never assigned in this method — those branches would
    # raise AttributeError unless it is set elsewhere; verify before
    # enabling has_residual/has_concat with layer_num_g > 2.
    for i in range(args.layer_num_g - 2):
        if args.has_residual == 1:
            emb_node = GCN_batch(
                ob['adj'], emb_node, args.emb_size,
                name='gcn1_' + str(i + 1),
                aggregate=args.gcn_aggregate) + self.emb_node1
        elif args.has_concat == 1:
            emb_node = tf.concat(
                (GCN_batch(ob['adj'], emb_node, args.emb_size,
                           name='gcn1_' + str(i + 1),
                           aggregate=args.gcn_aggregate),
                 self.emb_node1), axis=-1)
        else:
            emb_node = GCN_batch(ob['adj'], emb_node, args.emb_size,
                                 name='gcn1_' + str(i + 1),
                                 aggregate=args.gcn_aggregate)
        if args.bn == 1:
            emb_node = tf.compat.v1.layers.batch_normalization(emb_node,
                                                               axis=-1)
    # Final GCN layer (no activation; normalize only when bn is off).
    emb_node = GCN_batch(ob['adj'], emb_node, args.emb_size, is_act=False,
                         is_normalize=(args.bn == 0), name='gcn2',
                         aggregate=args.gcn_aggregate)
    emb_node = tf.squeeze(emb_node, axis=1)  # B*n*f
    ### 1 only keep effective nodes
    # ob_mask = tf.cast(tf.transpose(tf.reduce_sum(ob['node'],axis=-1),[0,2,1]),dtype=tf.bool) # B*n*1
    # Number of non-empty nodes per graph (a node is non-empty when its
    # feature row has any nonzero entry).
    ob_len = tf.reduce_sum(
        tf.squeeze(
            tf.cast(tf.cast(tf.reduce_sum(ob['node'], axis=-1),
                            dtype=tf.bool), dtype=tf.float32),
            axis=-2), axis=-1)  # B
    ob_len_first = ob_len - atom_type_num
    logits_mask = tf.sequence_mask(
        ob_len, maxlen=tf.shape(ob['node'])[2])  # mask all valid entry
    logits_first_mask = tf.sequence_mask(
        ob_len_first,
        maxlen=tf.shape(ob['node'])[2])  # mask valid entry minus the
        # atom-type scaffold nodes (rm isolated nodes)
    if args.mask_null == 1:
        # Zero out embeddings of invalid (padding) nodes.
        emb_node_null = tf.zeros(tf.shape(emb_node))
        emb_node = tf.where(
            condition=tf.tile(
                tf.expand_dims(logits_mask, axis=-1),
                (1, 1, emb_node.get_shape()[-1])),
            x=emb_node, y=emb_node_null)
    ## get graph embedding (sum pooling), optionally appended to every node
    emb_graph = tf.reduce_sum(emb_node, axis=1, keepdims=True)
    if args.graph_emb == 1:
        emb_graph = tf.tile(emb_graph, [1, tf.shape(emb_node)[1], 1])
        emb_node = tf.concat([emb_node, emb_graph], axis=2)
    ### 2 predict stop
    emb_stop = tf.compat.v1.layers.dense(
        emb_node, args.emb_size, activation=tf.nn.relu, use_bias=False,
        name='linear_stop1')
    if args.bn == 1:
        emb_stop = tf.compat.v1.layers.batch_normalization(emb_stop, axis=-1)
    self.logits_stop = tf.reduce_sum(emb_stop, axis=1)
    self.logits_stop = tf.compat.v1.layers.dense(
        self.logits_stop, 2, activation=None, name='linear_stop2_1')  # B*2
    # explicitly show node num (disabled variant):
    # self.logits_stop = tf.concat((tf.reduce_mean(tf.compat.v1.layers.dense(emb_node, 32, activation=tf.nn.relu, name='linear_stop1'),axis=1),tf.reshape(ob_len_first/5,[-1,1])),axis=1)
    # self.logits_stop = tf.compat.v1.layers.dense(self.logits_stop, 2, activation=None, name='linear_stop2') # B*2
    # Bias the "continue" vs "stop" decision by a constant logit shift.
    stop_shift = tf.constant([[0, args.stop_shift]], dtype=tf.float32)
    pd_stop = CategoricalPdType(-1).pdfromflat(flat=self.logits_stop +
                                               stop_shift)
    ac_stop = pd_stop.sample()
    ### 3.1: select first (active) node
    # rules: only select effective nodes
    self.logits_first = tf.compat.v1.layers.dense(
        emb_node, args.emb_size, activation=tf.nn.relu,
        name='linear_select1')
    self.logits_first = tf.squeeze(
        tf.compat.v1.layers.dense(self.logits_first, 1, activation=None,
                                  name='linear_select2'), axis=-1)  # B*n
    # Invalid positions get a large negative logit (effectively prob 0).
    logits_first_null = tf.ones(tf.shape(self.logits_first)) * -1000
    self.logits_first = tf.where(condition=logits_first_mask,
                                 x=self.logits_first,
                                 y=logits_first_null)
    # using own prediction
    pd_first = CategoricalPdType(-1).pdfromflat(flat=self.logits_first)
    ac_first = pd_first.sample()
    mask = tf.one_hot(ac_first, depth=tf.shape(emb_node)[1], dtype=tf.bool,
                      on_value=True, off_value=False)
    emb_first = tf.boolean_mask(emb_node, mask)
    emb_first = tf.expand_dims(emb_first, axis=1)
    # using ground-truth action
    ac_first_real = self.ac_real[:, 0]
    mask_real = tf.one_hot(ac_first_real, depth=tf.shape(emb_node)[1],
                           dtype=tf.bool, on_value=True, off_value=False)
    emb_first_real = tf.boolean_mask(emb_node, mask_real)
    emb_first_real = tf.expand_dims(emb_first_real, axis=1)
    ### 3.2: select second node
    # rules: do not select first node
    # using own prediction
    # mlp over [first-node embedding tiled, node embeddings]
    emb_cat = tf.concat(
        [tf.tile(emb_first, [1, tf.shape(emb_node)[1], 1]), emb_node],
        axis=2)
    self.logits_second = tf.compat.v1.layers.dense(
        emb_cat, args.emb_size, activation=tf.nn.relu,
        name='logits_second1')
    self.logits_second = tf.compat.v1.layers.dense(
        self.logits_second, 1, activation=None, name='logits_second2')
    # # bilinear
    # self.logits_second = tf.transpose(bilinear(emb_first, emb_node, name='logits_second'), [0, 2, 1])
    self.logits_second = tf.squeeze(self.logits_second, axis=-1)
    # Forbid re-selecting the first node.
    ac_first_mask = tf.one_hot(ac_first, depth=tf.shape(emb_node)[1],
                               dtype=tf.bool, on_value=False, off_value=True)
    logits_second_mask = tf.logical_and(logits_mask, ac_first_mask)
    logits_second_null = tf.ones(tf.shape(self.logits_second)) * -1000
    self.logits_second = tf.where(condition=logits_second_mask,
                                  x=self.logits_second,
                                  y=logits_second_null)
    pd_second = CategoricalPdType(-1).pdfromflat(flat=self.logits_second)
    ac_second = pd_second.sample()
    mask = tf.one_hot(ac_second, depth=tf.shape(emb_node)[1], dtype=tf.bool,
                      on_value=True, off_value=False)
    emb_second = tf.boolean_mask(emb_node, mask)
    emb_second = tf.expand_dims(emb_second, axis=1)
    # using ground truth (weights shared via reuse=True)
    # mlp
    emb_cat = tf.concat(
        [tf.tile(emb_first_real, [1, tf.shape(emb_node)[1], 1]), emb_node],
        axis=2)
    self.logits_second_real = tf.compat.v1.layers.dense(
        emb_cat, args.emb_size, activation=tf.nn.relu,
        name='logits_second1', reuse=True)
    self.logits_second_real = tf.compat.v1.layers.dense(
        self.logits_second_real, 1, activation=None, name='logits_second2',
        reuse=True)
    # # bilinear
    # self.logits_second_real = tf.transpose(bilinear(emb_first_real, emb_node, name='logits_second'), [0, 2, 1])
    self.logits_second_real = tf.squeeze(self.logits_second_real, axis=-1)
    ac_first_mask_real = tf.one_hot(ac_first_real,
                                    depth=tf.shape(emb_node)[1],
                                    dtype=tf.bool, on_value=False,
                                    off_value=True)
    logits_second_mask_real = tf.logical_and(logits_mask, ac_first_mask_real)
    self.logits_second_real = tf.where(condition=logits_second_mask_real,
                                       x=self.logits_second_real,
                                       y=logits_second_null)
    ac_second_real = self.ac_real[:, 1]
    mask_real = tf.one_hot(ac_second_real, depth=tf.shape(emb_node)[1],
                           dtype=tf.bool, on_value=True, off_value=False)
    emb_second_real = tf.boolean_mask(emb_node, mask_real)
    emb_second_real = tf.expand_dims(emb_second_real, axis=1)
    ### 3.3 predict edge type
    # using own prediction
    # MLP over the concatenated (first, second) node embeddings
    emb_cat = tf.concat([emb_first, emb_second], axis=-1)
    self.logits_edge = tf.compat.v1.layers.dense(
        emb_cat, args.emb_size, activation=tf.nn.relu, name='logits_edge1')
    self.logits_edge = tf.compat.v1.layers.dense(
        self.logits_edge, ob['adj'].get_shape()[1], activation=None,
        name='logits_edge2')
    self.logits_edge = tf.squeeze(self.logits_edge, axis=1)
    # # bilinear
    # self.logits_edge = tf.reshape(bilinear_multi(emb_first,emb_second,out_dim=ob['adj'].get_shape()[1]),[-1,ob['adj'].get_shape()[1]])
    pd_edge = CategoricalPdType(-1).pdfromflat(self.logits_edge)
    ac_edge = pd_edge.sample()
    # using ground truth (weights shared via reuse=True)
    # MLP
    emb_cat = tf.concat([emb_first_real, emb_second_real], axis=-1)
    self.logits_edge_real = tf.compat.v1.layers.dense(
        emb_cat, args.emb_size, activation=tf.nn.relu, name='logits_edge1',
        reuse=True)
    self.logits_edge_real = tf.compat.v1.layers.dense(
        self.logits_edge_real, ob['adj'].get_shape()[1], activation=None,
        name='logits_edge2', reuse=True)
    self.logits_edge_real = tf.squeeze(self.logits_edge_real, axis=1)
    # # bilinear
    # self.logits_edge_real = tf.reshape(bilinear_multi(emb_first_real, emb_second_real, out_dim=ob['adj'].get_shape()[1]),
    #                                    [-1, ob['adj'].get_shape()[1]])
    # ncat_list = [tf.shape(logits_first),ob_space['adj'].shape[-1],ob_space['adj'].shape[0]]
    # Joint training distribution: own first-node logits, ground-truth-
    # conditioned second/edge logits, stop logits.
    self.pd = self.pdtype(-1).pdfromflat([
        self.logits_first, self.logits_second_real, self.logits_edge_real,
        self.logits_stop
    ])
    # Value head: relu projection, max-pool over nodes, scalar output.
    self.vpred = tf.compat.v1.layers.dense(
        emb_node, args.emb_size, use_bias=False, activation=tf.nn.relu,
        name='value1')
    if args.bn == 1:
        self.vpred = tf.compat.v1.layers.batch_normalization(self.vpred,
                                                             axis=-1)
    self.vpred = tf.reduce_max(self.vpred, axis=1)
    self.vpred = tf.compat.v1.layers.dense(self.vpred, 1, activation=None,
                                           name='value2')
    self.state_in = []   # no recurrent state
    self.state_out = []
    # Sampled composite action: (first, second, edge, stop), shape B*4.
    self.ac = tf.concat(
        (tf.expand_dims(ac_first, axis=1),
         tf.expand_dims(ac_second, axis=1),
         tf.expand_dims(ac_edge, axis=1),
         tf.expand_dims(ac_stop, axis=1)), axis=1)
    # Debug fetches returned alongside action/value by _act.
    debug = {}
    debug['ob_node'] = tf.shape(ob['node'])
    debug['ob_adj'] = tf.shape(ob['adj'])
    debug['emb_node'] = emb_node
    debug['logits_stop'] = self.logits_stop
    debug['logits_second'] = self.logits_second
    debug['ob_len'] = ob_len
    debug['logits_first_mask'] = logits_first_mask
    debug['logits_second_mask'] = logits_second_mask
    # debug['pd'] = self.pd.logp(self.ac)
    debug['ac'] = self.ac
    stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
    self._act = U.function(
        [stochastic, ob['adj'], ob['node']],
        [self.ac, self.vpred, debug])  # add debug in second arg if needed
def __init__(self, scope, *, ob_space, ac_space, hiddens, reuse=False, normalize=False):
    """Recurrent (LSTM) diagonal-Gaussian policy with separate value and
    policy recurrent towers.

    Args:
        scope: variable-scope name under which all variables live.
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: continuous action space; ``ac_space.shape[0]`` sizes the
            Gaussian mean/logstd head.
        hiddens: layer widths; ``hiddens[:-1]`` are feed-forward widths and
            ``hiddens[-1]`` is the LSTM cell size of each tower.
        reuse: variable-reuse flag passed to ``tf.variable_scope`` and cells.
        normalize: normalization mode. Falsy disables normalization; any
            truthy value enables observation normalization; a truthy value
            other than the string ``'ob'`` additionally denormalizes value
            predictions through a running return filter.
    """
    self.recurrent = True
    # BUGFIX: honor the caller's `normalize` argument. Previously this was
    # hard-coded to False, silently ignoring `normalize` even though the
    # body branches on self.normalized in four places. Default (False)
    # preserves prior behavior.
    self.normalized = normalize
    with tf.variable_scope(scope, reuse=reuse):
        self.scope = tf.get_variable_scope().name

        self.pdtype = pdtype = make_pdtype(ac_space)
        assert isinstance(ob_space, gym.spaces.Box)

        # Placeholders: observations are [batch, time] + ob shape;
        # taken_action is [batch, time, action_dim].
        self.observation_ph = U.get_placeholder(
            name="observation", dtype=tf.float32,
            shape=[None, None] + list(ob_space.shape))
        self.stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        self.taken_action_ph = tf.placeholder(
            dtype=tf.float32, shape=[None, None, ac_space.shape[0]],
            name="taken_action")

        if self.normalized:
            if self.normalized != 'ob':
                self.ret_rms = RunningMeanStd(scope="retfilter")
            self.ob_rms = RunningMeanStd(shape=ob_space.shape, scope="obsfilter")

        obz = self.observation_ph
        if self.normalized:
            # Standardize observations with running mean/std, clipped to [-5, 5].
            obz = tf.clip_by_value(
                (self.observation_ph - self.ob_rms.mean) / self.ob_rms.std,
                -5.0, 5.0)

        # ---- value tower: MLP -> LSTM ("lstmv") -> scalar prediction ----
        last_out = obz
        for hidden in hiddens[:-1]:
            last_out = tf.contrib.layers.fully_connected(last_out, hidden)

        self.zero_state = []
        self.state_in_ph = []
        self.state_out = []
        cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
        size = cell.state_size
        self.zero_state.append(np.zeros(size.c, dtype=np.float32))
        self.zero_state.append(np.zeros(size.h, dtype=np.float32))
        self.state_in_ph.append(
            tf.placeholder(tf.float32, [None, size.c], name="lstmv_c"))
        self.state_in_ph.append(
            tf.placeholder(tf.float32, [None, size.h], name="lstmv_h"))
        initial_state = tf.contrib.rnn.LSTMStateTuple(
            self.state_in_ph[-2], self.state_in_ph[-1])
        last_out, state_out = tf.nn.dynamic_rnn(
            cell, last_out, initial_state=initial_state, scope="lstmv")
        self.state_out.append(state_out)

        # [:, :, 0] drops the singleton feature dim -> [batch, time].
        self.vpredz = tf.contrib.layers.fully_connected(
            last_out, 1, activation_fn=None)[:, :, 0]
        self.vpred = self.vpredz
        if self.normalized and self.normalized != 'ob':
            # vpredz predicts standardized returns; denormalize to raw scale.
            self.vpred = self.vpredz * self.ret_rms.std + self.ret_rms.mean

        # ---- policy tower: MLP -> LSTM ("lstmp") -> diagonal Gaussian ----
        last_out = obz
        for hidden in hiddens[:-1]:
            last_out = tf.contrib.layers.fully_connected(last_out, hidden)
        cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
        size = cell.state_size
        self.zero_state.append(np.zeros(size.c, dtype=np.float32))
        self.zero_state.append(np.zeros(size.h, dtype=np.float32))
        self.state_in_ph.append(
            tf.placeholder(tf.float32, [None, size.c], name="lstmp_c"))
        self.state_in_ph.append(
            tf.placeholder(tf.float32, [None, size.h], name="lstmp_h"))
        initial_state = tf.contrib.rnn.LSTMStateTuple(
            self.state_in_ph[-2], self.state_in_ph[-1])
        last_out, state_out = tf.nn.dynamic_rnn(
            cell, last_out, initial_state=initial_state, scope="lstmp")
        self.state_out.append(state_out)

        # State-dependent mean, state-independent learned log-std.
        mean = tf.contrib.layers.fully_connected(
            last_out, ac_space.shape[0], activation_fn=None)
        logstd = tf.get_variable(
            name="logstd", shape=[1, ac_space.shape[0]],
            initializer=tf.zeros_initializer())
        self.pd = DiagonalGaussian(mean, logstd)
        self.sampled_action = switch(
            self.stochastic_ph, self.pd.sample(), self.pd.mode())

        self.zero_state = np.array(self.zero_state)
        self.state_in_ph = tuple(self.state_in_ph)
        self.state = self.zero_state

        # L2 penalty on every trainable variable of this policy.
        for p in self.get_trainable_variables():
            tf.add_to_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES, tf.reduce_sum(tf.square(p)))
def _build(self):
    """Build the primitive policy graph.

    Creates one observation placeholder per observation component, a
    per-component running-mean/std normalizer, a value-function MLP head
    ("vf" scope), a policy MLP head ("pol" scope) parameterizing
    ``self.pdtype``, and the ``self._act`` / ``self._value`` callables.
    """
    ac_space = self._ac_space
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size
    gaussian_fixed_var = self._gaussian_fixed_var

    # obs: one float32 placeholder per observation component, batch-first.
    self._obs = {}
    for ob_name, ob_shape in self._ob_shape.items():
        self._obs[ob_name] = U.get_placeholder(
            name="ob_{}_primitive".format(ob_name),
            dtype=tf.float32,
            shape=[None] + self._ob_shape[ob_name])

    # obs normalization: standardize each component with its own running
    # mean/std, clip to [-5, 5], then concatenate along the last axis.
    self.ob_rms = {}
    for ob_name in self.ob_type:
        with tf.variable_scope("ob_rms_{}".format(ob_name)):
            self.ob_rms[ob_name] = RunningMeanStd(
                shape=self._ob_shape[ob_name])
    obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) / self.ob_rms[ob_name].std
           for ob_name in self.ob_type]
    obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
    obz = tf.concat(obz, -1)

    # value function: MLP -> scalar; [:, 0] squeezes the singleton
    # output dim so vpred has shape [batch].
    with tf.variable_scope("vf"):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = self._activation(
                tf.layers.dense(
                    last_out,
                    hid_size,
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(
            last_out,
            1,
            name="final",
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # primitive policy
    self.pdtype = pdtype = make_pdtype(ac_space)
    with tf.variable_scope("pol"):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = self._activation(
                tf.layers.dense(
                    last_out,
                    hid_size,
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # Gaussian with state-independent learned log-std: the flat
            # pd parameter vector is [mean, logstd] halves.
            mean = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0] // 2,
                name="final",
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd broadcasts logstd to the batch shape.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                name="final",
                kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(pdparam)

    # sample action: stochastic sample or deterministic mode, switched
    # at run time by the boolean placeholder.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
    self._act = U.function([stochastic] + self.obs, [ac, self.vpred])
    self._value = U.function(self.obs, self.vpred)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, lstm_hid_size, kind):
    """Build a recurrent sensor-fusion policy.

    A CNN (selected by ``kind``) encodes image frames, an LSTM gives the
    CNN features memory across steps, and the LSTM output concatenated
    with the normalized physics observation feeds separate value ("vf")
    and policy ("pol") MLP heads.

    Args:
        ob_space: tuple of (physics Box space, frames Box space).
        ac_space: action space, flattened into pd parameters via pdtype.
        hid_size: width of each hidden layer in the vf/pol heads.
        num_hid_layers: number of hidden layers in each head.
        lstm_hid_size: LSTM cell size (feature width fed to the heads).
        kind: 'small' (A3C-style CNN) or 'large' (Nature-DQN CNN);
            anything else raises NotImplementedError.
    """
    print("This is lstm policy for only sensors.")
    assert isinstance(ob_space, tuple)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32,
                             shape=[sequence_length] + list(ob_space[0].shape))
    ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32,
                             shape=[sequence_length] + list(ob_space[1].shape))

    # process ob_p: standardize with a running mean/std, clip to [-5, 5]
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
    obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # process ob_f: scale pixels to [0, 1], then encode with a CNN
    x = ob_f / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    # LSTM layer for memory over the CNN features; the recurrent state is
    # fed in and read out through placeholders so the caller can carry it
    # across environment steps.
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_hid_size, state_is_tuple=True, name="rnn")
    c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
    h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
    self.state_init = (c_init, h_init)
    c_in = U.get_placeholder(name="state_c", dtype=tf.float32,
                             shape=(None, lstm_cell.state_size.c))
    h_in = U.get_placeholder(name="state_h", dtype=tf.float32,
                             shape=(None, lstm_cell.state_size.h))
    self.state_in = (c_in, h_in)
    state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
    lstm_outputs, lstm_states = lstm_cell(x, state_in)
    lstm_c, lstm_h = lstm_states
    self.state_out = (lstm_c, lstm_h)
    rnn_out = tf.reshape(lstm_outputs, (-1, lstm_hid_size))

    # conjugate sensor and physics features into one vector per step
    ob_last = tf.concat((rnn_out, obpz), axis=-1)

    # value network: relu MLP -> scalar ([:, 0] squeezes to [batch])
    with tf.variable_scope("vf"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                                  kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final',
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # policy network (tanh hidden activations here, vs relu in the value net)
    with tf.variable_scope("pol"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                                  kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits',
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    # act function: stochastic sample or deterministic mode, plus the value
    # estimate and the new LSTM state for the caller to feed back next step
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob_p, ob_f, c_in, h_in],
                           [ac, self.vpred, lstm_c, lstm_h])