def build_two_input_critic_graph(obs_t_ph: tf.Tensor, obs_tPrime_ph: tf.Tensor,
                                 exp_spec: ExperimentSpec) -> (tf.Tensor, tf.Tensor):
    """
    Critic network phi
            input: the observations collected at timestep t and at timestep tPrime
            output: the estimated state value V(s) for each of the two observation batches
                    (both heads share the same PHI parameters)

    :return: (critic_t, critic_tPrime)
    """

    with tf.name_scope(vocab.critic_network) as scope:
        """ ---- Build parameter PHI as a multilayer perceptron ---- """
        critic_t = build_MLP_computation_graph(
            obs_t_ph,
            1,
            exp_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
            output_layers_activation=exp_spec.theta_output_layers_activation,
            name=vocab.phi_NeuralNet)

        critic_tPrime = build_MLP_computation_graph(
            obs_tPrime_ph,
            1,
            exp_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
            output_layers_activation=exp_spec.theta_output_layers_activation,
            reuse=True,
            name=vocab.phi_NeuralNet)

    return critic_t, critic_tPrime
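
# --- Hedged usage sketch (not part of the original module) ---------------------------------------
# One plausible way to wire the two-input critic into a one-step bootstrap target
# r_t + gamma * V(s_tPrime). The placeholder names, the reward placeholder and the fixed gamma
# value are illustrative assumptions, not taken from the source.
def example_two_input_critic_usage(exp_spec: ExperimentSpec, playground: GymPlayground):
    obs_shape = playground.OBSERVATION_SPACE.shape
    obs_t_ph = tf_cv1.placeholder(tf.float32, shape=(None, *obs_shape), name='obs_t')
    obs_tPrime_ph = tf_cv1.placeholder(tf.float32, shape=(None, *obs_shape), name='obs_tPrime')

    # both value heads share the PHI parameters (the second MLP is built with reuse=True)
    V_t, V_tPrime = build_two_input_critic_graph(obs_t_ph, obs_tPrime_ph, exp_spec)

    # illustrative one-step TD target: r_t + gamma * V(s_tPrime)
    reward_ph = tf_cv1.placeholder(tf.float32, shape=(None, 1), name='reward')
    td_target = reward_ph + 0.99 * V_tPrime
    return V_t, td_target
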
def test_build_MLP_computation_graph_with_DISCRETE_adapter(gym_discrete_setup):
    _, playground = gym_discrete_setup
    input_placeholder, out_placeholder, Q_values_ph = bloc.gym_playground_to_tensorflow_graph_adapter(
        playground, action_shape_constraint=(1, ))
    bloc.build_MLP_computation_graph(input_placeholder,
                                     playground.ACTION_CHOICES,
                                     hidden_layer_topology=(2, 2))
def test_build_MLP_computation_graph_io(tf_setup, gym_discrete_setup):
    _, out_p, nn_shape = tf_setup
    exp_spec, playground = gym_discrete_setup
    keras_input = keras.Input(shape=(12, ))

    mlp_hidden_ops = bloc.build_MLP_computation_graph(
        keras_input, playground.ACTION_CHOICES, nn_shape)
    print("\n\n>>> {}\n\n".format(mlp_hidden_ops))
def REINFORCE_policy(observation_placeholder: tf.Tensor, action_placeholder: tf.Tensor, Q_values_placeholder: tf.Tensor,
                     experiment_spec: ExperimentSpec, playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The learning agent: REINFORCE (aka: Basic Policy Gradient)
    Based on the paper by Williams, R. J.
         Simple statistical gradient-following algorithms for connectionist reinforcement learning. (1992)

    Policy gradient is an on-policy method that seeks to directly optimize the policy π_θ by using sampled
    trajectories τ as weights. Those weights indicate how well the policy performed.
    Based on that knowledge, the algorithm updates the parameters θ of its policy to make actions leading to
    similarly good trajectories more likely and actions leading to similarly bad trajectories less likely.
    In the case of Deep Reinforcement Learning, the policy parameter θ is a neural net.

    :type observation_placeholder: tf.Tensor
    :type action_placeholder: tf.Tensor
    :type Q_values_placeholder: tf.Tensor
    :type playground: GymPlayground
    :type experiment_spec: ExperimentSpec
    :return: (sampled_action, theta_mlp, pseudo_loss)
    :rtype: (tf.Tensor, tf.Tensor, tf.Tensor)
    """
    with tf.name_scope(vocab.REINFORCE) as scope:

        """ ---- Build parameter theta as a multilayer perceptron ---- """
        theta_mlp = build_MLP_computation_graph(observation_placeholder, playground.ACTION_CHOICES,
                                                experiment_spec.theta_nn_h_layer_topo,
                                                hidden_layers_activation=experiment_spec.theta_hidden_layers_activation,
                                                output_layers_activation=experiment_spec.theta_output_layers_activation,
                                                name=vocab.theta_NeuralNet)

        # ::Discrete case
        if isinstance(playground.env.action_space, gym.spaces.Discrete):

            """ ---- Assess the input shape compatibility ---- """
            are_compatible = observation_placeholder.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
            assert are_compatible, ("the observation_placeholder is incompatible with environment, "
                                    "{} != {}").format(observation_placeholder.shape.as_list()[-1],
                                                       playground.OBSERVATION_SPACE.shape[0])

            """ ---- Build the policy for discrete space ---- """
            sampled_action, log_p_all = policy_theta_discrete_space(theta_mlp, playground)

            """ ---- Build the pseudo loss function ---- """
            pseudo_loss = discrete_pseudo_loss(log_p_all, action_placeholder, Q_values_placeholder, playground,
                                               vocab.pseudo_loss)

        # ::Continuous case
        elif isinstance(playground.env.action_space, gym.spaces.Box):
            raise NotImplementedError   # (Ice-Boxed) TODO: implement policy for continuous action space

        # ::Other gym environment
        else:
            print("\n>>> The agent implementation does not support that environment space "
                  "{} yet.\n\n".format(playground.env.action_space))
            raise NotImplementedError

    return sampled_action, theta_mlp, pseudo_loss
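
# --- Hedged illustration (not part of the original module) ---------------------------------------
# `discrete_pseudo_loss` is referenced above but not shown in this listing. The sketch below
# spells out the usual REINFORCE surrogate it stands for: the log-probability of the action
# actually taken, weighted by the sampled return Q(s,a), negated and averaged over the batch so
# that minimizing it performs gradient ascent on J(theta). Names, shapes and the exact reduction
# are assumptions made for illustration only.
def example_discrete_pseudo_loss(log_p_all: tf.Tensor, action_ph: tf.Tensor,
                                 Q_values_ph: tf.Tensor, action_choices: int) -> tf.Tensor:
    # assumes action_ph has shape (batch,) and log_p_all has shape (batch, action_choices)
    action_mask = tf.one_hot(tf.cast(action_ph, tf.int32), depth=action_choices)
    log_p_taken = tf.reduce_sum(action_mask * log_p_all, axis=-1)

    # weight each log-probability by its sampled return and negate the batch average
    return -tf.reduce_mean(Q_values_ph * log_p_taken)
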
def test_policy_theta_discrete_space_PARAM_FAIL(gym_and_tf_discrete_setup):

    _, act_p, exp_spec, playground = gym_and_tf_discrete_setup
    obs_p_wrong_shape = tf_cv1.placeholder(tf.float32, shape=(None, 3))
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p_wrong_shape, playground.ACTION_CHOICES,
        exp_spec.theta_nn_h_layer_topo)

    with pytest.raises(AssertionError):
        bloc.policy_theta_discrete_space(obs_p_wrong_shape, playground)
def test_policy_theta_continuous_space_ENV_NOT_DISCRETE(
        gym_and_tf_discrete_setup):

    obs_p, act_p, exp_spec, discrete_playground = gym_and_tf_discrete_setup
    obs_p_wrong_shape = tf_cv1.placeholder(tf.float32, shape=(None, 43))
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p, discrete_playground.ACTION_CHOICES,
        exp_spec.theta_nn_h_layer_topo)

    with pytest.raises(AssertionError):
        bloc.policy_theta_continuous_space(theta_mlp, discrete_playground)
def build_actor_policy_graph(
        observation_placeholder: tf.Tensor, experiment_spec: ExperimentSpec,
        playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The ACTOR graph (a.k.a. the policy network)

        1. Actor network theta
            input: the observations collected
            output: the logits of each action in the action space

        2. Policy
            input: the actor network
            output: a selected action & the log-probabilities of each action in the action space

    :return: sampled_action, log_pi_all, theta_mlp
    """
    with tf.name_scope(vocab.actor_network) as scope:

        # ::Discrete case
        if isinstance(playground.env.action_space, gym.spaces.Discrete):
            """ ---- Assess the input shape compatibility ---- """
            are_compatible = observation_placeholder.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
            assert are_compatible, ("the observation_placeholder is incompatible with environment, "
                                    "{} != {}").format(observation_placeholder.shape.as_list()[-1],
                                                       playground.OBSERVATION_SPACE.shape[0])
            """ ---- Build parameter THETA as a multilayer perceptron ---- """
            theta_mlp = build_MLP_computation_graph(
                observation_placeholder,
                playground.ACTION_CHOICES,
                experiment_spec.theta_nn_h_layer_topo,
                hidden_layers_activation=experiment_spec.theta_hidden_layers_activation,
                output_layers_activation=experiment_spec.theta_output_layers_activation,
                name=vocab.theta_NeuralNet)
            """ ---- Build the policy for discrete space ---- """
            sampled_action, log_pi_all = policy_theta_discrete_space(
                theta_mlp, playground)

        # ::Continuous case
        elif isinstance(playground.env.action_space, gym.spaces.Box):
            raise NotImplementedError  # (Ice-Boxed) TODO: implement policy for continuous action space

        # ::Other gym environment
        else:
            print(
                "\n>>> The agent implementation does not support that environment space "
                "{} yet.\n\n".format(playground.env.action_space))
            raise NotImplementedError

    return sampled_action, log_pi_all, theta_mlp
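
# --- Hedged usage sketch (not part of the original module) ---------------------------------------
# Minimal end-to-end wiring of the actor graph: build the placeholders with the adapter (call
# signature taken from the tests in this listing and assumed importable here), build the graph,
# then sample one action from a fake observation. The fake data and session boilerplate are
# illustrative.
def example_actor_policy_forward_pass(exp_spec: ExperimentSpec, playground: GymPlayground):
    obs_ph, act_ph, Q_values_ph = gym_playground_to_tensorflow_graph_adapter(
        playground, action_shape_constraint=(1,))

    sampled_action, log_pi_all, theta_mlp = build_actor_policy_graph(obs_ph, exp_spec, playground)

    fake_obs = np.ones((1, *playground.OBSERVATION_SPACE.shape))
    with tf_cv1.Session() as sess:
        sess.run(tf_cv1.global_variables_initializer())
        action = sess.run(sampled_action, feed_dict={obs_ph: fake_obs})
    return action
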
def test_integration_Playground_to_adapter_to_build_graph(
        gym_continuous_setup):
    exp_spec, playground = gym_continuous_setup

    # (!) fake input data
    input_data = np.ones((20, *playground.OBSERVATION_SPACE.shape))

    input_placeholder, out_placeholder, Q_values_ph = bloc.gym_playground_to_tensorflow_graph_adapter(
        playground, action_shape_constraint=(1, ))
    """Build a Multi Layer Perceptron (MLP) as the policy parameter theta using a computation graph"""
    theta = bloc.build_MLP_computation_graph(input_placeholder,
                                             playground.ACTION_CHOICES,
                                             exp_spec.theta_nn_h_layer_topo)

    writer = tf_cv1.summary.FileWriter('./graph', tf_cv1.get_default_graph())
    with tf_cv1.Session() as sess:
        # initialize random variable in the computation graph
        sess.run(tf_cv1.global_variables_initializer())

        # execute mlp computation graph with input data
        a = sess.run(theta, feed_dict={input_placeholder: input_data})

        # print("\n\n>>>run theta:\n{}\n\n".format(a))
    writer.close()
def test_policy_theta_discrete_space_PASS(gym_and_tf_discrete_setup):

    obs_p, act_p, exp_spec, playground = gym_and_tf_discrete_setup
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p, playground.ACTION_CHOICES, exp_spec.theta_nn_h_layer_topo)
    bloc.policy_theta_discrete_space(theta_mlp, playground)
def build_actor_critic_shared_graph(
        obs_ph: tf.Tensor, exp_spec: ExperimentSpec, playground: GymPlayground
) -> (tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The ACTOR-CRITIC shared network variant architecture

        1. Actor network theta
            input: the observations collected
            output: the logits of each action in the action space

        2. Policy
            input: the actor network
            output: a selected action & the log-probabilities of each action in the action space

        3. Critic network phi
            input: the observations collected
            output: the estimated value V(s) of the observed state

    :return: sampled_action, log_pi_all, theta_shared_MLP, critic
    """
    """ ---- Assess the input shape compatibility ---- """
    are_compatible = obs_ph.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
    assert are_compatible, ("the observation_placeholder is incompatible with environment, "
                            "{} != {}").format(obs_ph.shape.as_list()[-1],
                                               playground.OBSERVATION_SPACE.shape[0])

    # ::Discrete case
    if isinstance(playground.env.action_space, gym.spaces.Discrete):
        """ ---- Build parameter THETA as a multilayer perceptron ---- """
        theta_shared_MLP = build_MLP_computation_graph(
            obs_ph,
            playground.ACTION_CHOICES,
            exp_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
            output_layers_activation=exp_spec.theta_output_layers_activation,
            reuse=None,  # <-- (!)
            name=vocab.shared_network)
        """ ---- Build the policy for discrete space ---- """
        sampled_action, log_pi_all = policy_theta_discrete_space(
            theta_shared_MLP, playground)

    # ::Continuous case
    elif isinstance(playground.env.action_space, gym.spaces.Box):
        raise NotImplementedError  # (Ice-Boxed) TODO: implement policy for continuous action space

    # ::Other gym environment
    else:
        print(
            "\n>>> The agent implementation does not support that environment space "
            "{} yet.\n\n".format(playground.env.action_space))
        raise NotImplementedError
    """ ---- Build the Critic ---- """
    phi_shared_MLP = build_MLP_computation_graph(
        obs_ph,
        playground.ACTION_CHOICES,
        exp_spec.theta_nn_h_layer_topo,
        hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
        output_layers_activation=exp_spec.theta_output_layers_activation,
        reuse=True,  # <-- (!)
        name=vocab.shared_network)

    critic = build_MLP_computation_graph(
        phi_shared_MLP,
        1, (),
        hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
        output_layers_activation=exp_spec.theta_output_layers_activation,
        name=vocab.V_estimate)

    return sampled_action, log_pi_all, theta_shared_MLP, critic
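
# --- Hedged usage sketch (not part of the original module) ---------------------------------------
# One conventional way to turn the shared-network outputs into actor and critic losses.
# The advantage / target placeholders and the mean-squared critic loss are standard actor-critic
# choices added purely for illustration; they are not taken from the original source.
def example_actor_critic_shared_losses(obs_ph: tf.Tensor, exp_spec: ExperimentSpec,
                                       playground: GymPlayground):
    sampled_action, log_pi_all, theta_shared_MLP, critic = build_actor_critic_shared_graph(
        obs_ph, exp_spec, playground)

    # quantities computed outside the graph (illustrative placeholder names)
    action_ph = tf_cv1.placeholder(tf.int32, shape=(None,), name='action')
    advantage_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='advantage')
    target_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='V_target')

    # actor surrogate: -E[ A(s,a) * log pi(a|s) ]
    action_mask = tf.one_hot(action_ph, depth=playground.ACTION_CHOICES)
    log_pi_taken = tf.reduce_sum(action_mask * log_pi_all, axis=-1)
    actor_loss = -tf.reduce_mean(advantage_ph * log_pi_taken)

    # critic regression toward the bootstrap target
    critic_loss = tf.reduce_mean(tf.square(tf.squeeze(critic, axis=-1) - target_ph))
    return actor_loss, critic_loss
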