def nature_cnn(scaled_images, **kwargs):
    """
    Convolutional feature extractor following the architecture of the Nature paper:
    three ReLU-activated convolutions followed by one ReLU-activated dense layer of 512 units.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters forwarded to every convolutional layer
    :return: (TensorFlow Tensor) The output of the final dense layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    # (scope name, number of filters, filter size, stride) for each convolution, in build order
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )

    features = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        features = relu(conv(features, scope, n_filters=n_filters, filter_size=filter_size, stride=stride,
                             init_scale=gain, **kwargs))

    # Reshape the convolutional output so it can feed the dense layer
    flattened = conv_to_fc(features)
    return relu(linear(flattened, 'fc1', n_hidden=512, init_scale=gain))
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_mem=64, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, layer_norm=False, feature_extraction="mlp", **kwargs):
    """
    Build the memory policy graph: flattened observations are fed through ``mlpstack`` (scope 'mem1'),
    whose output drives both the value head and the action distribution.

    :param sess: forwarded to the parent constructor
    :param ob_space: observation space; its shape must have exactly one dimension
    :param ac_space: forwarded to the parent constructor
    :param n_env: (int) forwarded to the parent constructor and used to split the batch into sequences
    :param n_steps: (int) forwarded to the parent constructor and used to split the batch into sequences
    :param n_batch: (int) forwarded to the parent constructor
    :param n_mem: (int) multiplier of the state size: the state has ``n_mem * ob_space.shape[0]`` entries
    :param reuse: (bool) forwarded to the parent constructor and to the "model" variable scope
    :param layers: deprecated; passing a value only triggers a warning
    :param net_arch: not supported by this policy; must be left to None
    :param act_fun: (tf function) activation function handed to ``mlpstack``
    :param layer_norm: (bool) forwarded to ``mlpstack``
    :param feature_extraction: (str) only compared against "cnn" to set the parent's ``scale`` flag
    :param kwargs: (dict) accepted but not used by this constructor
    :raises AssertionError: if ``ob_space.shape`` does not have exactly one dimension
    :raises NotImplementedError: if ``net_arch`` is not None
    """
    assert len(ob_space.shape) == 1
    n_features = ob_space.shape[0]

    super(MemoryPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                       state_shape=(n_mem * n_features, ), reuse=reuse,
                                       scale=(feature_extraction == "cnn"))

    # Only the legacy configuration (no net_arch) is implemented
    if net_arch is not None:
        raise NotImplementedError

    # ``layers`` has no effect on the graph built below; it is only checked for the deprecation warning
    if layers is not None:
        warnings.warn("The layers parameter is deprecated. Use the net_arch parameter instead.")

    with tf.compat.v1.variable_scope("model", reuse=reuse):
        # NOTE(review): this reads the raw ``obs_ph`` whereas the sibling policies in this file read
        # ``processed_obs`` -- confirm this is intended.
        flat_obs = tf.compat.v1.layers.flatten(self.obs_ph)
        obs_sequence = batch_to_seq(flat_obs, self.n_env, n_steps)
        done_masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)

        mem_output, self.snew = mlpstack(obs_sequence, done_masks, self.states_ph, 'mem1', n_hidden=64,
                                         layer_norm=layer_norm, act_fun=act_fun)
        mem_output = seq_to_batch(mem_output)

        value_head = linear(mem_output, 'vf', 1)
        # The same latent is used for both the policy and the q-value heads
        distribution_outputs = self.pdtype.proba_distribution_from_latent(mem_output, mem_output)
        self._proba_distribution, self._policy, self.q_value = distribution_outputs

    self._value_fn = value_head
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None, net_arch=None,
             act_fun=tf.tanh, cnn_extractor=nature_cnn, feature_extraction="cnn", **kwargs):
    """
    Build the feed-forward actor-critic graph: a feature extractor (CNN or MLP) followed by the
    value head and the action distribution.

    :param sess: forwarded to the parent constructor
    :param ob_space: forwarded to the parent constructor
    :param ac_space: forwarded to the parent constructor
    :param n_env: (int) forwarded to the parent constructor
    :param n_steps: (int) forwarded to the parent constructor
    :param n_batch: (int) forwarded to the parent constructor
    :param reuse: (bool) forwarded to the parent constructor and to the "model" variable scope
    :param layers: ([int]) deprecated; when ``net_arch`` is None it is used as the layer sizes of both
        the policy and the value network (defaults to [64, 64])
    :param net_arch: ([int or dict]) network specification handed to ``mlp_extractor``; takes
        precedence over ``layers``
    :param act_fun: (tf function) activation function handed to ``mlp_extractor``
    :param cnn_extractor: (function) called as ``cnn_extractor(processed_obs, **kwargs)`` when
        ``feature_extraction`` is "cnn"
    :param feature_extraction: (str) "cnn" selects ``cnn_extractor``; any other value selects the MLP
    :param kwargs: (dict) extra keyword arguments, checked by ``_kwargs_check`` and passed to the CNN
    """
    super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                            scale=(feature_extraction == "cnn"))

    self._kwargs_check(feature_extraction, kwargs)

    layers_given = layers is not None
    if layers_given:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                          DeprecationWarning)

    if net_arch is None:
        # Translate the legacy `layers` list into two separate, identically sized networks
        hidden_sizes = layers if layers_given else [64, 64]
        net_arch = [{"vf": hidden_sizes, "pi": hidden_sizes}]

    with tf.compat.v1.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            # The CNN output is shared by the policy and the value function
            shared_latent = cnn_extractor(self.processed_obs, **kwargs)
            pi_latent, vf_latent = shared_latent, shared_latent
        else:
            flat_obs = tf.compat.v1.layers.flatten(self.processed_obs)
            pi_latent, vf_latent = mlp_extractor(flat_obs, net_arch, act_fun)

        self._value_fn = linear(vf_latent, 'vf', 1)

        distribution_outputs = self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)
        self._proba_distribution, self._policy, self.q_value = distribution_outputs

    self._setup_init()
def mlp_extractor(flat_observations, net_arch, act_fun):
    """
    Build an MLP on top of flat observations and return one latent tensor for the policy and one
    for the value function.

    ``net_arch`` describes the hidden layers and which of them are shared. It is a list made of:

    1. Any number (possibly zero) of integers, each giving the number of units of a shared layer.
    2. Optionally a dict ``dict(vf=[<value layer sizes>], pi=[<policy layer sizes>])`` giving the
       non-shared layers of each network. A missing key (pi or vf) means no non-shared layer for
       that network. Entries after the dict are ignored.

    Example: ``[55, dict(vf=[255, 255], pi=[128])]`` builds one shared layer of size 55, then two
    value-only layers of size 255 and one policy-only layer of size 128. ``[128, 128]`` builds a
    fully shared network with two layers of size 128.

    :param flat_observations: (tf.Tensor) The observations to base policy and value function on.
    :param net_arch: ([int or dict]) The specification of the policy and value networks.
        See above for details on its formatting.
    :param act_fun: (tf function) The activation function to use for the networks.
    :return: (tf.Tensor, tf.Tensor) latent_policy, latent_value of the specified network.
        If all layers are shared, then ``latent_policy == latent_value``
    """
    def _dense(inputs, scope, n_units):
        # One fully connected layer followed by the activation function
        return act_fun(linear(inputs, scope, n_units, init_scale=np.sqrt(2)))

    shared_latent = flat_observations
    pi_sizes = []  # Sizes of the layers that belong to the policy network only
    vf_sizes = []  # Sizes of the layers that belong to the value network only

    # Shared part: consume integers until the (optional) dict is reached
    for position, spec in enumerate(net_arch):
        if not isinstance(spec, int):
            assert isinstance(spec, dict), "Error: the net_arch list can only contain ints and dicts"
            if 'pi' in spec:
                assert isinstance(spec['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                pi_sizes = spec['pi']
            if 'vf' in spec:
                assert isinstance(spec['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                vf_sizes = spec['vf']
            # The dict marks the point where the policy and value networks split
            break
        shared_latent = _dense(shared_latent, "shared_fc{}".format(position), spec)

    # Non-shared part: layers of both networks are created in lockstep (pi first, then vf);
    # zip_longest pads the shorter list with None
    latent_policy = shared_latent
    latent_value = shared_latent
    for position, (pi_units, vf_units) in enumerate(zip_longest(pi_sizes, vf_sizes)):
        if pi_units is not None:
            assert isinstance(pi_units, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = _dense(latent_policy, "pi_fc{}".format(position), pi_units)

        if vf_units is not None:
            assert isinstance(vf_units, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = _dense(latent_value, "vf_fc{}".format(position), vf_units)

    return latent_policy, latent_value
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn",
             **kwargs):
    """
    Build the recurrent actor-critic graph: a feature extractor, one LSTM layer (scope 'lstm1'),
    and the value head and action distribution on top.

    Two configurations are supported:

    * Legacy mode (``net_arch`` is None): CNN features, or flattened observations passed through
      dense layers sized by ``layers``, feed the LSTM; the LSTM output is used directly by the
      value head and by the action distribution.
    * ``net_arch`` mode: ``net_arch`` is a list of ints (shared dense layers), the string "lstm"
      (exactly one occurrence required, in the shared part) and an optional trailing dict
      ``dict(vf=[...], pi=[...])`` with the non-shared layer sizes. Entries after the dict are ignored.

    :param sess: forwarded to the parent constructor
    :param ob_space: forwarded to the parent constructor
    :param ac_space: forwarded to the parent constructor
    :param n_env: (int) forwarded to the parent constructor and used to split the batch into sequences
    :param n_steps: (int) forwarded to the parent constructor and used to split the batch into sequences
    :param n_batch: (int) forwarded to the parent constructor
    :param n_lstm: (int) number of hidden units of the LSTM; the state has ``2 * n_lstm`` entries
    :param reuse: (bool) forwarded to the parent constructor and to the "model" variable scope
    :param layers: ([int]) deprecated; dense layer sizes used in legacy mode without CNN
        (defaults to [64, 64])
    :param net_arch: (list) network specification, see above; takes precedence over ``layers``
    :param act_fun: (tf function) activation function applied after every dense layer
    :param cnn_extractor: (function) called as ``cnn_extractor(processed_obs, **kwargs)`` in legacy
        mode when ``feature_extraction`` is "cnn"
    :param layer_norm: (bool) forwarded to the LSTM layer
    :param feature_extraction: (str) "cnn" selects ``cnn_extractor``; any other value flattens the
        observations
    :param kwargs: (dict) extra keyword arguments, checked by ``_kwargs_check`` and passed to the CNN
    :raises NotImplementedError: if ``net_arch`` is given together with ``feature_extraction="cnn"``,
        or if "lstm" appears in the non-shared ``pi`` / ``vf`` lists
    :raises ValueError: if the shared part of ``net_arch`` contains "lstm" more than once, or if no
        LSTM layer was built
    :raises AssertionError: if ``net_arch`` contains an entry of an unsupported type
    """
    # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
    super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                     state_shape=(2 * n_lstm, ), reuse=reuse,
                                     scale=(feature_extraction == "cnn"))

    self._kwargs_check(feature_extraction, kwargs)

    if net_arch is None:  # Legacy mode
        if layers is None:
            layers = [64, 64]
        else:
            warnings.warn("The layers parameter is deprecated. Use the net_arch parameter instead.")

        # tf.compat.v1 is used (as in the other policies of this file) so that the graph-mode API
        # is resolved under both TensorFlow 1.x and 2.x
        with tf.compat.v1.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                extracted_features = cnn_extractor(self.processed_obs, **kwargs)
            else:
                extracted_features = tf.compat.v1.layers.flatten(self.processed_obs)
                for i, layer_size in enumerate(layers):
                    extracted_features = act_fun(linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size,
                                                        init_scale=np.sqrt(2)))
            input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
            masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                         layer_norm=layer_norm)
            rnn_output = seq_to_batch(rnn_output)
            value_fn = linear(rnn_output, 'vf', 1)

            # The LSTM output is the latent for both the policy and the q-value heads
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

        self._value_fn = value_fn
    else:  # Use the new net_arch parameter
        if layers is not None:
            warnings.warn("The new net_arch parameter overrides the deprecated layers parameter.")
        if feature_extraction == "cnn":
            raise NotImplementedError()

        with tf.compat.v1.variable_scope("model", reuse=reuse):
            latent = tf.compat.v1.layers.flatten(self.processed_obs)
            policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
            value_only_layers = []  # Layer sizes of the network that only belongs to the value network

            # Iterate through the shared layers and build the shared parts of the network
            lstm_layer_constructed = False
            for idx, layer in enumerate(net_arch):
                if isinstance(layer, int):  # Check that this is a shared layer
                    layer_size = layer
                    latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
                elif layer == "lstm":
                    if lstm_layer_constructed:
                        raise ValueError("The net_arch parameter must only contain one occurrence of 'lstm'!")
                    input_sequence = batch_to_seq(latent, self.n_env, n_steps)
                    masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                    rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                                 layer_norm=layer_norm)
                    latent = seq_to_batch(rnn_output)
                    lstm_layer_constructed = True
                else:
                    assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts"
                    if 'pi' in layer:
                        assert isinstance(layer['pi'],
                                          list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                        policy_only_layers = layer['pi']

                    if 'vf' in layer:
                        assert isinstance(layer['vf'],
                                          list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                        value_only_layers = layer['vf']
                    break  # From here on the network splits up in policy and value network

            # Build the non-shared part of the policy-network
            latent_policy = latent
            for idx, pi_layer_size in enumerate(policy_only_layers):
                if pi_layer_size == "lstm":
                    raise NotImplementedError("LSTMs are only supported in the shared part of the policy network.")
                assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
                latent_policy = act_fun(
                    linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))

            # Build the non-shared part of the value-network
            latent_value = latent
            for idx, vf_layer_size in enumerate(value_only_layers):
                if vf_layer_size == "lstm":
                    raise NotImplementedError("LSTMs are only supported in the shared part of the value function "
                                              "network.")
                assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
                latent_value = act_fun(
                    linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))

            # Checked after the layers are built, so a missing 'lstm' is reported only once the
            # rest of the specification has been validated
            if not lstm_layer_constructed:
                raise ValueError("The net_arch parameter must contain at least one occurrence of 'lstm'!")

            self._value_fn = linear(latent_value, 'vf', 1)
            # TODO: why not init_scale = 0.001 here like in the feedforward
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)
    self._setup_init()
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Create the distribution parameters and the q-values from the two latent vectors.

    Both heads are linear layers with ``self.size`` outputs: 'pi' on top of the policy latent
    and 'q' on top of the value latent.

    :param pi_latent_vector: latent input of the 'pi' layer
    :param vf_latent_vector: latent input of the 'q' layer
    :param init_scale: (float) initialisation scale passed to both linear layers
    :param init_bias: (float) initial bias passed to both linear layers
    :return: the distribution built by ``proba_distribution_from_flat``, the flat distribution
        parameters and the q-values
    """
    head_options = {"init_scale": init_scale, "init_bias": init_bias}

    flat_params = linear(pi_latent_vector, 'pi', self.size, **head_options)
    q_head = linear(vf_latent_vector, 'q', self.size, **head_options)

    distribution = self.proba_distribution_from_flat(flat_params)
    return distribution, flat_params, q_head
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Create the distribution parameters and the q-values from the two latent vectors.

    The mean comes from a linear layer 'pi' on the policy latent; the log standard deviation is
    a separate variable 'pi/logstd' of shape ``[1, self.size]`` initialised to zero.

    :param pi_latent_vector: latent input of the 'pi' layer
    :param vf_latent_vector: latent input of the 'q' layer
    :param init_scale: (float) initialisation scale passed to both linear layers
    :param init_bias: (float) initial bias passed to both linear layers
    :return: the distribution built by ``proba_distribution_from_flat``, the mean and the q-values
    """
    head_options = {"init_scale": init_scale, "init_bias": init_bias}

    mean = linear(pi_latent_vector, 'pi', self.size, **head_options)
    logstd = tf.compat.v1.get_variable(name='pi/logstd', shape=[1, self.size],
                                       initializer=tf.compat.v1.zeros_initializer())

    # Adding logstd to a zeroed copy of the mean expands the [1, size] variable to the shape of
    # the mean (relies on broadcasting) before both are concatenated along axis 1
    expanded_logstd = mean * 0.0 + logstd
    flat_params = tf.concat([mean, expanded_logstd], axis=1)

    q_head = linear(vf_latent_vector, 'q', self.size, **head_options)

    distribution = self.proba_distribution_from_flat(flat_params)
    return distribution, mean, q_head