Example #1
    def __init__(self, continuous, ob_dim, action_dim, n_layers):
        self.continuous = continuous
        self.state = tf.placeholder(shape=[None, ob_dim], name="observations", dtype=tf.float32)

        with tf.variable_scope('policy'):
            self.pi = pi = build_mlp(2, self.state, action_dim)
            self.dist, self.sample, self.log_prob = dist_continuous(pi)

        with tf.variable_scope('v_pred'):
            vpred = build_mlp(2, self.state, 1)
            self.vpred = tf.squeeze(vpred, axis=1)
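Example #1 calls a `build_mlp` helper that is not shown, with the positional form `build_mlp(n_layers, input, output_size)`. A minimal sketch of such a helper, assuming TF1 dense layers; the hidden width and activation are assumptions and may differ in the original:

import tensorflow as tf

def build_mlp(n_layers, mlp_input, output_size, size=64, activation=tf.nn.relu):
    # Hypothetical helper matching build_mlp(2, self.state, action_dim):
    # n_layers hidden dense layers followed by a linear output layer.
    out = mlp_input
    for _ in range(n_layers):
        out = tf.layers.dense(out, size, activation=activation)
    return tf.layers.dense(out, output_size, activation=None)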
Example #2
 def add_critic_network_op(self):
     """
     Build critic network. Assign it to self.q, self.target_q.
     :param scope: variable scope used for parameters in this network
     :return: None
     """
     self.q_scope = "q"
     self.target_q_scope = "target_q"
     with tf.variable_scope(self.critic_network_scope):
         input = tf.concat([tf.layers.flatten(self.state_placeholder), tf.layers.flatten(self.actions_n_placeholder)],
                           axis=1)
         self.q = build_mlp(input, 1, self.q_scope, self.config.n_layers, self.config.layer_size)
         self.target_q = build_mlp(input, 1, self.target_q_scope, self.config.n_layers, self.config.layer_size)
         if self.config.debug_logging:
             self.q = tf.Print(self.q, [self.q], message="q", summarize=20)
             self.target_q = tf.Print(self.target_q, [self.target_q], message="target_q", summarize=20)
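Examples #2, #6, #8 and #10 use a different, scoped variant of `build_mlp`. A minimal sketch of that signature, assuming plain TF1 dense layers; the hidden activation and the exact batch-normalization placement are assumptions:

import tensorflow as tf

def build_mlp(mlp_input, output_size, scope, n_layers, size,
              output_activation=None, use_batch_normalization=False):
    # Hypothetical scoped helper: n_layers hidden layers of width `size`
    # inside tf.variable_scope(scope), then an output layer.
    with tf.variable_scope(scope):
        out = mlp_input
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=tf.nn.relu)
            if use_batch_normalization:
                out = tf.layers.batch_normalization(out)
        return tf.layers.dense(out, output_size, activation=output_activation)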
Example #3
    def __init__(self,
                 state_shape,
                 hidden_units=(64, 64),
                 hidden_activation=nn.Tanh()):
        super().__init__()

        self.net = build_mlp(input_dim=state_shape[0],
                             output_dim=1,
                             hidden_units=hidden_units,
                             hidden_activation=hidden_activation)
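Examples #3, #4, #5, #7 and #9 call a PyTorch-style `build_mlp` with keyword arguments. A minimal sketch of such a helper; the original's weight initialization and any extra options are not shown here:

import torch.nn as nn

def build_mlp(input_dim, output_dim, hidden_units=(64, 64),
              hidden_activation=nn.Tanh(), output_activation=None):
    # Hypothetical helper: Linear + activation for each entry in hidden_units,
    # then a final Linear (optionally followed by output_activation).
    layers = []
    units = input_dim
    for next_units in hidden_units:
        layers.append(nn.Linear(units, next_units))
        layers.append(hidden_activation)
        units = next_units
    layers.append(nn.Linear(units, output_dim))
    if output_activation is not None:
        layers.append(output_activation)
    return nn.Sequential(*layers)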
Example #4
 def __init__(self,
              state_shape,
              action_shape,
              hidden_units=(256, 256),
              hidden_activation=nn.ReLU(inplace=True)):
     super().__init__()
     self.net = build_mlp(input_dim=state_shape[0],
                          output_dim=2 * action_shape[0],
                          hidden_units=hidden_units,
                          hidden_activation=hidden_activation)
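The `2 * action_shape[0]` output in Example #4 is typically split into the mean and log standard deviation of a squashed Gaussian policy (SAC-style). A hedged method sketch; the method name, the clamp range and the tanh squashing are assumptions, not taken from the original class:

import torch

def sample(self, states):
    # Hypothetical: split the head into mean / log-std, reparameterize,
    # and squash the action into [-1, 1] with tanh.
    means, log_stds = self.net(states).chunk(2, dim=-1)
    stds = log_stds.clamp(-20, 2).exp()
    actions = means + torch.randn_like(means) * stds
    return torch.tanh(actions)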
Example #5
    def __init__(self,
                 state_shape,
                 gamma,
                 hidden_units_r=(64, 64),
                 hidden_units_v=(64, 64),
                 hidden_activation_r=nn.ReLU(inplace=True),
                 hidden_activation_v=nn.ReLU(inplace=True)):
        super().__init__()

        self.g = build_mlp(input_dim=state_shape[0],
                           output_dim=1,
                           hidden_units=hidden_units_r,
                           hidden_activation=hidden_activation_r)
        self.h = build_mlp(input_dim=state_shape[0],
                           output_dim=1,
                           hidden_units=hidden_units_v,
                           hidden_activation=hidden_activation_v)

        self.gamma = gamma
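The two heads in Example #5, together with `gamma`, have the shape of an AIRL-style disentangled reward, f(s, s') = g(s) + gamma * h(s') - h(s). A hedged sketch of how they might be combined; the method name and the `dones` masking are assumptions:

def f(self, states, dones, next_states):
    # Hypothetical reward head: reward term g(s) plus the shaping term
    # gamma * h(s') - h(s), with h(s') masked out on terminal transitions.
    rs = self.g(states)
    vs = self.h(states)
    next_vs = self.h(next_states)
    return rs + self.gamma * (1 - dones) * next_vs - vs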
Example #6
    def add_actor_loss_op(self):
        slice_1 = tf.slice(self.actions_n_placeholder, [0, 0, 0], [self.config.batch_size, self.agent_idx, self.action_dim])
        slice_2 = tf.slice(self.actions_n_placeholder, [0, self.agent_idx+1, 0], [self.config.batch_size, self.env.n - self.agent_idx - 1, self.action_dim])
        action_logits = tf.expand_dims(self.mu_noise, axis=1)
        actions_n = tf.concat([slice_1, action_logits, slice_2], axis=1)
        input = tf.concat([tf.layers.flatten(self.state_placeholder), tf.layers.flatten(actions_n)],
                          axis=1)

        combined_q_scope = self.critic_network_scope + "/" + self.q_scope
        self.q_reuse = build_mlp(input, 1, combined_q_scope, self.config.n_layers, self.config.layer_size)
Example #7
    def __init__(self,
                 state_shape,
                 action_shape,
                 hidden_units=(64, 64),
                 hidden_activation=nn.Tanh()):
        super().__init__()

        self.net = build_mlp(input_dim=state_shape[0],
                             output_dim=action_shape[0],
                             hidden_units=hidden_units,
                             hidden_activation=hidden_activation)
        self.log_stds = nn.Parameter(torch.zeros(1, action_shape[0]))
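In Example #7, `self.log_stds` is a learned, state-independent log standard deviation, as is common for on-policy Gaussian policies. A hedged sampling sketch; the method name and the reparameterized sampling are assumptions:

import torch

def sample(self, states):
    # Hypothetical: Gaussian actions with a state-dependent mean and a
    # shared, state-independent standard deviation (broadcast over the batch).
    means = self.net(states)
    stds = self.log_stds.exp()
    return means + torch.randn_like(means) * stds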
Example #8
    def build_policy_network_op(self):
        """
            Builds the policy network.
        """
        self.mu_scope = "mu"
        self.target_mu_scope = "target_mu"
        with tf.variable_scope(self.actor_network_scope):
            self.mu = build_mlp(self.observation_placeholder, self.action_dim, self.mu_scope,
                                n_layers=self.config.n_layers, size=self.config.layer_size,
                                output_activation=None, use_batch_normalization=self.config.use_batch_normalization)
            if self.config.debug_logging: self.mu = tf.Print(self.mu, [self.mu], message="mu", summarize=20)
            self.target_mu = build_mlp(self.observation_placeholder, self.action_dim, self.target_mu_scope,
                                       n_layers=self.config.n_layers, size=self.config.layer_size,
                                       output_activation=None,
                                       use_batch_normalization=self.config.use_batch_normalization)

            self.mu_normalized = tf.nn.softmax(self.mu, axis=-1)
            self.target_mu_normalized = tf.nn.softmax(self.target_mu, axis=-1)

            if self.config.param_noise:
                self.setup_param_noise(self.observation_placeholder)
                self.mu_noise = tf.nn.softmax(self.mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.mu)))), axis=-1)
                if self.config.debug_logging: self.mu_noise = tf.Print(self.mu_noise, [self.mu_noise], summarize=10,
                                                                   message="action logits")
                self.target_mu_noise = tf.nn.softmax(self.target_mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.target_mu)))), axis=-1)

            elif self.config.random_process_exploration == 0:
                self.mu_noise = tf.nn.softmax(self.mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.mu)))), axis=-1)
                if self.config.debug_logging: self.mu_noise = tf.Print(self.mu_noise, [self.mu_noise], summarize=10,
                                                                   message="action logits")
                self.target_mu_noise = tf.nn.softmax(self.target_mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.target_mu)))), axis=-1)
            elif self.config.random_process_exploration == 1:
                self.mu_noise = self.mu_normalized
                self.target_mu_noise = self.target_mu_normalized
            elif self.config.random_process_exploration == 2:
                log_std = tf.get_variable("random_process_log_std", shape=[self.action_dim], dtype=tf.float32)
                std = tf.exp(log_std)
                dist = tf.contrib.distributions.MultivariateNormalDiag(self.mu_normalized, std)
                self.mu_noise = dist.sample()
                self.target_mu_noise = self.target_mu_normalized
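The expression `softmax(mu - log(-log(uniform)))` in Example #8 is the Gumbel-softmax trick: adding Gumbel noise g = -log(-log(U)), U ~ Uniform(0, 1), to the logits and renormalizing gives a relaxed sample from the categorical distribution defined by `mu` (temperature 1). A small NumPy illustration of the same computation:

import numpy as np

def gumbel_softmax_sample(logits, rng=np.random):
    # Same trick as above: perturb the logits with Gumbel noise, then softmax.
    gumbel = -np.log(-np.log(rng.uniform(size=logits.shape)))
    z = logits + gumbel
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)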
Example #9
 def __init__(self,
              state_shape,
              action_shape,
              hidden_units=(256, 256),
              hidden_activation=nn.ReLU(inplace=True)):
     super().__init__()
     # print("Network shape")
     # print(state_shape[0])
     # print(state_shape[1])
     # print(action_shape[0])
     self.net1 = build_mlp(
         #input_dim=state_shape[0] + state_shape[1] + action_shape[0],
         input_dim=state_shape[0] + action_shape[0],
         output_dim=1,
         hidden_units=hidden_units,
         hidden_activation=hidden_activation)
     self.net2 = build_mlp(
         #input_dim=state_shape[0] + state_shape[1] + action_shape[0],
         input_dim=state_shape[0] + action_shape[0],
         output_dim=1,
         hidden_units=hidden_units,
         hidden_activation=hidden_activation)
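Example #9 builds two independent critics over the same state-action input, as in clipped double-Q methods (TD3/SAC). A hedged forward-pass sketch; the method name is an assumption:

import torch

def forward(self, states, actions):
    # Hypothetical: both critics score the concatenated state-action pair.
    x = torch.cat([states, actions], dim=-1)
    return self.net1(x), self.net2(x)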
Example #10
    def build_policy_approx_networks(self):
        """
        Build one approximation network per other agent to estimate what that agent would do.
        :return: None
        """
        policy_approximate_logits = []
        policy_approximate_actions = []
        with tf.variable_scope(self.policy_approx_networks_scope):
            for i in range(self.env.n):
                if i == self.agent_idx:
                    policy_approximate_logits.append(None)
                    policy_approximate_actions.append(None)
                    continue
                scope = "agent_" + str(i)
                logits = build_mlp(self.observation_placeholder, self.action_dim, scope, self.config.n_layers,
                                   self.config.layer_size, output_activation=None)
                policy_approximate_logits.append(logits)
                policy_approximate_actions.append(tf.nn.softmax(logits, axis=-1))

        self.policy_approximate_logits = policy_approximate_logits
        self.policy_approximate_actions = policy_approximate_actions