def build_two_input_critic_graph(obs_t_ph: tf.Tensor, obs_tPrime_ph: tf.Tensor,
                                 exp_spec: ExperimentSpec) -> (tf.Tensor, tf.Tensor):
    """
    Critic network phi

        input: the observations collected at timestep t and tPrime
        output: the estimated state value V(s) for each observation

    :return: (critic_t, critic_tPrime)
    """
    with tf.name_scope(vocab.critic_network) as scope:

        """ ---- Build parameter PHI as a multilayer perceptron ---- """
        critic_t = build_MLP_computation_graph(
            obs_t_ph, 1, exp_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
            output_layers_activation=exp_spec.theta_output_layers_activation,
            name=vocab.phi_NeuralNet)

        # Reuse the same PHI parameters for the tPrime observations
        critic_tPrime = build_MLP_computation_graph(
            obs_tPrime_ph, 1, exp_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
            output_layers_activation=exp_spec.theta_output_layers_activation,
            reuse=True,
            name=vocab.phi_NeuralNet)

    return critic_t, critic_tPrime
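
# ---- Usage sketch (illustration only, not part of the library) ----
# A minimal sketch of how the two-input critic could feed a TD(0)-style value
# target. The placeholder names, the helper name `_sketch_two_input_critic_usage`
# and the 0.99 discount factor are assumptions for illustration; both critic
# heads share the same PHI weights via reuse=True on the second call above.
def _sketch_two_input_critic_usage(exp_spec: ExperimentSpec, obs_dim: int) -> tf.Tensor:
    obs_t_ph = tf_cv1.placeholder(tf.float32, shape=(None, obs_dim), name='obs_t_ph')
    obs_tPrime_ph = tf_cv1.placeholder(tf.float32, shape=(None, obs_dim), name='obs_tPrime_ph')
    reward_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='reward_ph')

    V_t, V_tPrime = build_two_input_critic_graph(obs_t_ph, obs_tPrime_ph, exp_spec)

    # TD(0) target: r + gamma * V(s') with the bootstrapped term held fixed
    gamma = 0.99  # <-- assumed discount factor
    target = reward_ph + gamma * tf_cv1.squeeze(V_tPrime, axis=-1)
    critic_loss = tf_cv1.reduce_mean(
        (tf_cv1.squeeze(V_t, axis=-1) - tf_cv1.stop_gradient(target)) ** 2)
    return critic_loss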
def test_build_MLP_computation_graph_with_DISCRETE_adapter(gym_discrete_setup):
    _, playground = gym_discrete_setup
    input_placeholder, out_placeholder, Q_values_ph = bloc.gym_playground_to_tensorflow_graph_adapter(
        playground, action_shape_constraint=(1,))
    bloc.build_MLP_computation_graph(input_placeholder, playground.ACTION_CHOICES,
                                     hidden_layer_topology=(2, 2))
def test_build_MLP_computation_graph_io(tf_setup, gym_discrete_setup):
    _, out_p, nn_shape = tf_setup
    exp_spec, playground = gym_discrete_setup
    keras_input = keras.Input(shape=(12,))
    mlp_hidden_ops = bloc.build_MLP_computation_graph(
        keras_input, playground.ACTION_CHOICES, nn_shape)
    print("\n\n>>> {}\n\n".format(mlp_hidden_ops))
def REINFORCE_policy(observation_placeholder: tf.Tensor, action_placeholder: tf.Tensor,
                     Q_values_placeholder: tf.Tensor, experiment_spec: ExperimentSpec,
                     playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The learning agent: REINFORCE (aka Basic Policy Gradient)

    Based on: Williams, R. J., "Simple statistical gradient-following algorithms
    for connectionist reinforcement learning" (1992)

    Policy gradient is an on-policy method that seeks to directly optimize the policy π_θ
    by using sampled trajectories τ as weights. Those weights indicate how well the policy
    performed. Based on that knowledge, the algorithm updates the policy parameter θ so that
    actions leading to similar good trajectories become more likely and actions leading to
    similar bad trajectories become less likely. In Deep Reinforcement Learning, the policy
    parameter θ is a neural net.

    :type observation_placeholder: tf.Tensor
    :type action_placeholder: tf.Tensor
    :type Q_values_placeholder: tf.Tensor
    :type playground: GymPlayground
    :type experiment_spec: ExperimentSpec
    :return: (sampled_action, theta_mlp, pseudo_loss)
    :rtype: (tf.Tensor, tf.Tensor, tf.Tensor)
    """
    with tf.name_scope(vocab.REINFORCE) as scope:

        """ ---- Build parameter theta as a multilayer perceptron ---- """
        theta_mlp = build_MLP_computation_graph(
            observation_placeholder, playground.ACTION_CHOICES,
            experiment_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=experiment_spec.theta_hidden_layers_activation,
            output_layers_activation=experiment_spec.theta_output_layers_activation,
            name=vocab.theta_NeuralNet)

        # ::Discrete case
        if isinstance(playground.env.action_space, gym.spaces.Discrete):

            """ ---- Assess the input shape compatibility ---- """
            are_compatible = observation_placeholder.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
            assert are_compatible, ("the observation_placeholder is incompatible with environment, "
                                    "{} != {}").format(observation_placeholder.shape.as_list()[-1],
                                                       playground.OBSERVATION_SPACE.shape[0])

            """ ---- Build the policy for discrete space ---- """
            sampled_action, log_p_all = policy_theta_discrete_space(theta_mlp, playground)

            """ ---- Build the pseudo loss function ---- """
            pseudo_loss = discrete_pseudo_loss(log_p_all, action_placeholder, Q_values_placeholder,
                                               playground, vocab.pseudo_loss)

        # ::Continuous case
        elif isinstance(playground.env.action_space, gym.spaces.Box):
            raise NotImplementedError  # (Ice-Boxed) todo:implement --> policy for continuous space

        # ::Other gym environment
        else:
            print("\n>>> The agent implementation does not support that environment space "
                  "{} yet.\n\n".format(playground.env.action_space))
            raise NotImplementedError

    return sampled_action, theta_mlp, pseudo_loss
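
# ---- Usage sketch (illustration only, not part of the library) ----
# The pseudo loss built above is the standard policy gradient surrogate
#     L(θ) = - (1/N) Σ_τ Σ_t log π_θ(a_t|s_t) · Q(s_t, a_t)
# whose gradient matches the REINFORCE estimator
#     ∇_θ J(θ) ≈ E[ ∇_θ log π_θ(a|s) · Q(s,a) ].
# A minimal sketch of one training step on that surrogate. The helper name,
# the placeholder names and the 1e-3 learning rate are assumptions; the batch
# arrays (obs_batch, act_batch, Q_batch) are assumed to come from collected
# trajectories.
def _sketch_REINFORCE_training_step(experiment_spec: ExperimentSpec, playground: GymPlayground,
                                    obs_batch, act_batch, Q_batch):
    obs_dim = playground.OBSERVATION_SPACE.shape[0]
    obs_ph = tf_cv1.placeholder(tf.float32, shape=(None, obs_dim), name='obs_ph')
    act_ph = tf_cv1.placeholder(tf.int32, shape=(None,), name='act_ph')
    Q_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='Q_ph')

    sampled_action, theta_mlp, pseudo_loss = REINFORCE_policy(obs_ph, act_ph, Q_ph,
                                                              experiment_spec, playground)

    # One gradient step on the surrogate loss
    train_op = tf_cv1.train.AdamOptimizer(learning_rate=1e-3).minimize(pseudo_loss)
    with tf_cv1.Session() as sess:
        sess.run(tf_cv1.global_variables_initializer())
        _, loss = sess.run([train_op, pseudo_loss],
                           feed_dict={obs_ph: obs_batch, act_ph: act_batch, Q_ph: Q_batch})
    return loss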
def test_policy_theta_discrete_space_PARAM_FAIL(gym_and_tf_discrete_setup):
    _, act_p, exp_spec, playground = gym_and_tf_discrete_setup
    obs_p_wrong_shape = tf_cv1.placeholder(tf.float32, shape=(None, 3))
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p_wrong_shape, playground.ACTION_CHOICES, exp_spec.theta_nn_h_layer_topo)

    # Pass the raw (wrong-shape) placeholder instead of the logits so the
    # shape assertion fires
    with pytest.raises(AssertionError):
        bloc.policy_theta_discrete_space(obs_p_wrong_shape, playground)
def test_policy_theta_continuous_space_ENV_NOT_DISCRETE(gym_and_tf_discrete_setup):
    obs_p, act_p, exp_spec, discrete_playground = gym_and_tf_discrete_setup
    obs_p_wrong_shape = tf_cv1.placeholder(tf.float32, shape=(None, 43))
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p, discrete_playground.ACTION_CHOICES, exp_spec.theta_nn_h_layer_topo)
    with pytest.raises(AssertionError):
        bloc.policy_theta_continuous_space(theta_mlp, discrete_playground)
def build_actor_policy_graph(observation_placeholder: tf.Tensor, experiment_spec: ExperimentSpec,
                             playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The ACTOR graph (aka the policy network)

        1. Actor network theta
            input: the observations collected
            output: the logits of each action in the action space

        2. Policy
            input: the actor network
            output: a selected action & the log probabilities of each action in the action space

    :return: sampled_action, log_pi_all, theta_mlp
    """
    with tf.name_scope(vocab.actor_network) as scope:

        # ::Discrete case
        if isinstance(playground.env.action_space, gym.spaces.Discrete):

            """ ---- Assess the input shape compatibility ---- """
            are_compatible = observation_placeholder.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
            assert are_compatible, ("the observation_placeholder is incompatible with environment, "
                                    "{} != {}").format(observation_placeholder.shape.as_list()[-1],
                                                       playground.OBSERVATION_SPACE.shape[0])

            """ ---- Build parameter THETA as a multilayer perceptron ---- """
            theta_mlp = build_MLP_computation_graph(
                observation_placeholder, playground.ACTION_CHOICES,
                experiment_spec.theta_nn_h_layer_topo,
                hidden_layers_activation=experiment_spec.theta_hidden_layers_activation,
                output_layers_activation=experiment_spec.theta_output_layers_activation,
                name=vocab.theta_NeuralNet)

            """ ---- Build the policy for discrete space ---- """
            sampled_action, log_pi_all = policy_theta_discrete_space(theta_mlp, playground)

        # ::Continuous case
        elif isinstance(playground.env.action_space, gym.spaces.Box):
            raise NotImplementedError  # (Ice-Boxed) todo:implement --> policy for continuous space

        # ::Other gym environment
        else:
            print("\n>>> The agent implementation does not support that environment space "
                  "{} yet.\n\n".format(playground.env.action_space))
            raise NotImplementedError

    return sampled_action, log_pi_all, theta_mlp
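
# ---- Usage sketch (illustration only, not part of the library) ----
# A minimal sketch showing how the actor graph could be used to pick an action
# for a single observation. The helper name, the placeholder name and the
# all-ones observation are assumptions for illustration.
def _sketch_actor_policy_sampling(experiment_spec: ExperimentSpec, playground: GymPlayground):
    obs_dim = playground.OBSERVATION_SPACE.shape[0]
    obs_ph = tf_cv1.placeholder(tf.float32, shape=(None, obs_dim), name='obs_ph')

    sampled_action, log_pi_all, _ = build_actor_policy_graph(obs_ph, experiment_spec, playground)

    with tf_cv1.Session() as sess:
        sess.run(tf_cv1.global_variables_initializer())
        # Feed a single (fake) observation and sample one action from π_θ
        single_obs = np.ones((1, obs_dim))
        action = sess.run(sampled_action, feed_dict={obs_ph: single_obs})
    return action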
def test_integration_Playground_to_adapter_to_build_graph(gym_continuous_setup):
    exp_spec, playground = gym_continuous_setup

    # (!) fake input data
    input_data = np.ones((20, *playground.OBSERVATION_SPACE.shape))

    input_placeholder, out_placeholder, Q_values_ph = bloc.gym_playground_to_tensorflow_graph_adapter(
        playground, action_shape_constraint=(1,))

    """Build a Multi Layer Perceptron (MLP) as the policy parameter theta using a computation graph"""
    theta = bloc.build_MLP_computation_graph(input_placeholder, playground.ACTION_CHOICES,
                                             exp_spec.theta_nn_h_layer_topo)

    writer = tf_cv1.summary.FileWriter('./graph', tf_cv1.get_default_graph())
    with tf_cv1.Session() as sess:
        # initialize random variables in the computation graph
        sess.run(tf_cv1.global_variables_initializer())

        # execute the MLP computation graph with the fake input data
        a = sess.run(theta, feed_dict={input_placeholder: input_data})
        # print("\n\n>>>run theta:\n{}\n\n".format(a))

    writer.close()
def test_policy_theta_discrete_space_PASS(gym_and_tf_discrete_setup):
    obs_p, act_p, exp_spec, playground = gym_and_tf_discrete_setup
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p, playground.ACTION_CHOICES, exp_spec.theta_nn_h_layer_topo)
    bloc.policy_theta_discrete_space(theta_mlp, playground)
def build_actor_critic_shared_graph(obs_ph: tf.Tensor, exp_spec: ExperimentSpec,
                                    playground: GymPlayground
                                    ) -> (tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The ACTOR-CRITIC shared network variant architecture

        1. Actor network theta
            input: the observations collected
            output: the logits of each action in the action space

        2. Policy
            input: the actor network
            output: a selected action & the log probabilities of each action in the action space

        3. Critic network phi
            input: the observations collected
            output: the estimated state value V(s)

    :return: sampled_action, log_pi_all, theta_shared_MLP, critic
    """

    """ ---- Assess the input shape compatibility ---- """
    are_compatible = obs_ph.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
    assert are_compatible, ("the observation_placeholder is incompatible with environment, "
                            "{} != {}").format(obs_ph.shape.as_list()[-1],
                                               playground.OBSERVATION_SPACE.shape[0])

    # ::Discrete case
    if isinstance(playground.env.action_space, gym.spaces.Discrete):

        """ ---- Build parameter THETA as a multilayer perceptron ---- """
        theta_shared_MLP = build_MLP_computation_graph(
            obs_ph, playground.ACTION_CHOICES,
            exp_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
            output_layers_activation=exp_spec.theta_output_layers_activation,
            reuse=None,  # <-- (!) create the shared variables on this first call
            name=vocab.shared_network)

        """ ---- Build the policy for discrete space ---- """
        sampled_action, log_pi_all = policy_theta_discrete_space(theta_shared_MLP, playground)

    # ::Continuous case
    elif isinstance(playground.env.action_space, gym.spaces.Box):
        raise NotImplementedError  # (Ice-Boxed) todo:implement --> policy for continuous space

    # ::Other gym environment
    else:
        print("\n>>> The agent implementation does not support that environment space "
              "{} yet.\n\n".format(playground.env.action_space))
        raise NotImplementedError

    """ ---- Build the Critic on top of the shared network ---- """
    phi_shared_MLP = build_MLP_computation_graph(
        obs_ph, playground.ACTION_CHOICES,
        exp_spec.theta_nn_h_layer_topo,
        hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
        output_layers_activation=exp_spec.theta_output_layers_activation,
        reuse=True,  # <-- (!) reuse the variables created by the actor pass
        name=vocab.shared_network)

    critic = build_MLP_computation_graph(
        phi_shared_MLP, 1, (),
        hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
        output_layers_activation=exp_spec.theta_output_layers_activation,
        name=vocab.V_estimate)

    return sampled_action, log_pi_all, theta_shared_MLP, critic
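
# ---- Usage sketch (illustration only, not part of the library) ----
# A minimal sketch of how the shared-graph outputs could be combined into the
# two actor-critic objectives. The helper name and the placeholder names
# (act_ph, advantage_ph, target_ph) are assumptions; the library's own loss
# helpers are not shown here.
def _sketch_shared_actor_critic_losses(exp_spec: ExperimentSpec, playground: GymPlayground):
    obs_dim = playground.OBSERVATION_SPACE.shape[0]
    obs_ph = tf_cv1.placeholder(tf.float32, shape=(None, obs_dim), name='obs_ph')
    act_ph = tf_cv1.placeholder(tf.int32, shape=(None,), name='act_ph')
    advantage_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='advantage_ph')
    target_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='target_ph')

    sampled_action, log_pi_all, _, critic = build_actor_critic_shared_graph(
        obs_ph, exp_spec, playground)

    # Actor objective: log-probability of the taken action, weighted by the advantage
    action_mask = tf_cv1.one_hot(act_ph, playground.ACTION_CHOICES)
    log_pi_taken = tf_cv1.reduce_sum(log_pi_all * action_mask, axis=-1)
    actor_loss = -tf_cv1.reduce_mean(log_pi_taken * advantage_ph)

    # Critic objective: mean-squared error between the V estimate and a bootstrapped target
    critic_loss = tf_cv1.reduce_mean((tf_cv1.squeeze(critic, axis=-1) - target_ph) ** 2)
    return actor_loss, critic_loss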