Example #1
        def val(state_input):
            """State-dependent V(s), used in place of a bias for the last layer."""
            with tf.variable_scope("val_for_bias"):
                val0 = tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=tf.nn.relu)
                val2 = tf.layers.dense(inputs=val0, units=1, activation=None)
                return val2
Example #2
    def build_actor_graph(self):
        """Build explorer graph with minimum principle."""
        with self.graph.as_default():
            with tf.variable_scope("explore_agent"):
                self.agent_outs, self.hidden_outs = self.build_agent_net(
                    inputs_obs=self.ph_obs,
                    seq_max=1,  # seq_max=1 is important for single-step inference
                    obs_lengths=[1 for _ in range(self.n_agents)],
                    hidden_state_in=self.ph_hidden_states_in,
                )

            self._explore_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")
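A minimal usage sketch for the explorer sub-graph built above, assuming a tf.Session (self.sess) created on self.graph; the placeholder and output names come from the snippet, while the helper name explore_step is hypothetical:

    # Hypothetical helper (not part of the original example): run one
    # single-step inference pass on the explorer sub-graph.
    def explore_step(self, obs, hidden_state):
        agent_outs, next_hidden = self.sess.run(
            [self.agent_outs, self.hidden_outs],
            feed_dict={
                self.ph_obs: obs,
                self.ph_hidden_states_in: hidden_state,
            },
        )
        return agent_outs, next_hidden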
Example #3
        def hyper_w1(hyper_w1_input):
            """
            Create hyper_w1.

            Input shape: (None, state_dim).
            """
            with tf.variable_scope("hyper_w1"):
                hw0 = tf.layers.dense(inputs=hyper_w1_input,
                                      units=hypernet_embed,
                                      activation=tf.nn.relu)
                hw1 = tf.layers.dense(inputs=hw0,
                                      units=self.embed_dim * self.n_agents,
                                      activation=None)
                return hw1
Example #4
        def hyper_w_final(hyper_w_final_input):
            """
            Create hyper_w_final.

            Input shape: (None, state_dim).
            """
            with tf.variable_scope("hyper_w_final"):
                hw_f0 = tf.layers.dense(
                    inputs=hyper_w_final_input,
                    units=hypernet_embed,
                    activation=tf.nn.relu,
                )
                hw_f1 = tf.layers.dense(inputs=hw_f0,
                                        units=self.embed_dim,
                                        activation=None)
                return hw_f1
Example #5
    def build_train_graph(self):
        """
        Build the train graph.

        Because seq_max differs between exploration (1) and training
        (the fixed episode limit), the train graph cannot be wired up
        to the actor graph directly. Hence we build separate explore
        and train sub-graphs and keep them in sync with tf.assign ops
        between the two variable collections.
        :return:
        """
        with self.graph.as_default():
            with tf.variable_scope("eval_agent"):
                trajectory_agent_outs, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    seq_max=self.fix_seq_length + 1,  # important: full sequence, unlike explore (seq_max=1)
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,  # full trajectory, no need to carry hidden state
                )

            with tf.variable_scope("target_agent"):
                tar_agent_outs_tmp, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    # fixed value, differs between the explore and train graphs
                    seq_max=self.fix_seq_length + 1,
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,
                )
                target_trajectory_agent_outs = tf.stop_gradient(tar_agent_outs_tmp)

            _eval_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
            _target_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

            with tf.variable_scope("soft_replacement"):
                self.agent_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_agent_paras,
                                                    _eval_agent_paras)]

                self.agent_explore_replace_op = [
                    tf.assign(t, e) for t, e in zip(self._explore_paras,
                                                    _eval_agent_paras)
                ]

            self._print_trainable_var_name(
                _eval_agent_paras=_eval_agent_paras,
                _target_agent_paras=_target_agent_paras,
                _explore_paras=self._explore_paras,
            )

            # agent out to max q values
            # Calculate estimated Q-Values ----------------
            mac_out = tf.reshape(
                trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            logging.debug("mac_out: {}".format(mac_out))
            chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                     self.ph_actions)

            # Calculate the Q-Values necessary for the target -----------
            target_mac_out = tf.reshape(
                target_trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            target_mac_out = target_mac_out[:, 1:]

            # Mask out unavailable actions
            # target_mac_out[avail_actions[:, 1:] == 0] = -9999999
            indices = tf.equal(self.ph_avail_action[:, 1:], 0)
            mask_val = tf.tile(
                [[[[-999999.0]]]],
                [
                    self.batch_size,
                    self.fix_seq_length,
                    self.n_agents,
                    self.avail_action_num,
                ],
            )
            logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
                indices, mask_val, target_mac_out))

            target_mac_out = tf.where(indices, mask_val, target_mac_out)

            if self.use_double_q:
                # Get actions that maximise live Q (for double q-learning)
                mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
                mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
                cur_max_actions = tf.expand_dims(
                    tf.argmax(mac_out_detach, axis=-1), -1)
                target_max_qvals = self.gather_custom(target_mac_out,
                                                      cur_max_actions)
            else:
                target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

            # eval mixer ---------------
            with tf.variable_scope("eval_mixer"):
                self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                                  self.ph_train_states)

            with tf.variable_scope("target_mixer"):
                q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                                 self.ph_train_target_states)
                self.target_q_tot = tf.stop_gradient(q_tot_tmp)

            _eval_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
            _target_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

            with tf.variable_scope("soft_replacement"):
                self.mix_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_mix_paras,
                                                    _eval_mix_paras)]

            self._print_trainable_var_name(_eval_mix_paras=_eval_mix_paras,
                                           _target_mix_paras=_target_mix_paras)

            # Calculate 1-step Q-Learning targets
            targets = (self.ph_rewards +
                       self.gamma * (1.0 - self.ph_terminated) * self.target_q_tot)

            # Td-error
            td_error = self.q_tot - tf.stop_gradient(targets)

            # mask = mask.expand_as(td_error)  # fixme: ph_mask is assumed to already match td_error's shape

            # 0-out the targets that came from padded data
            masked_td_error = tf.multiply(td_error, self.ph_mask)

            self.loss = tf.reduce_sum(masked_td_error**2) / tf.reduce_sum(self.ph_mask)

            # Optimise
            optimizer = tf.train.RMSPropOptimizer(
                self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
            grads_and_vars = optimizer.compute_gradients(self.loss)
            capped_gvs = [(
                grad if grad is None else tf.clip_by_norm(
                    grad, clip_norm=self.grad_norm_clip),
                var,
            ) for grad, var in grads_and_vars]
            self.grad_update = optimizer.apply_gradients(capped_gvs)
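The snippet above relies on self.gather_custom, which is not shown. Below is only a sketch of what such a helper typically does, a torch.gather-style per-action selection via a one-hot mask; the name gather_custom_sketch and the handling of a trailing index dimension are assumptions:

    # Hypothetical sketch (not the original gather_custom): select
    # inputs[..., a] for each integer action index a along the last axis.
    def gather_custom_sketch(self, inputs, indices):
        idx = tf.cast(indices, tf.int32)
        if idx.shape.ndims == inputs.shape.ndims:  # drop a trailing [..., 1] index dim
            idx = tf.squeeze(idx, axis=-1)
        one_hot = tf.one_hot(idx, depth=self.avail_action_num, dtype=inputs.dtype)
        return tf.reduce_sum(inputs * one_hot, axis=-1)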
Example #6
        def hyper_b1(state_input):
            """State-dependent bias for the hidden layer."""
            with tf.variable_scope("hyper_b1"):
                return tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=None)
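Examples #1, #3, #4 and #6 are the hypernetwork heads of a QMIX-style monotonic mixer. The actual _build_mix_net2 used in Example #5 is not included here, so the following is only a sketch of how such helpers are typically combined, assuming flattened (batch*time, ...) inputs; taking tf.abs of the hyper weights keeps Q_tot monotonic in the per-agent Q-values, and val(state) plays the role of the final bias:

        # Hypothetical sketch (not the original _build_mix_net2): combine the
        # hypernetwork helpers above into a monotonic mixing of per-agent Q-values.
        def mix_net_sketch(agent_qs, states):
            bs_t = tf.shape(states)[0]                        # flattened batch * time
            w1 = tf.reshape(tf.abs(hyper_w1(states)),         # abs() => monotonicity
                            [bs_t, self.n_agents, self.embed_dim])
            b1 = tf.reshape(hyper_b1(states), [bs_t, 1, self.embed_dim])
            qs = tf.reshape(agent_qs, [bs_t, 1, self.n_agents])
            hidden = tf.nn.elu(tf.matmul(qs, w1) + b1)        # (bs_t, 1, embed_dim)

            w_final = tf.reshape(tf.abs(hyper_w_final(states)),
                                 [bs_t, self.embed_dim, 1])
            v = tf.reshape(val(states), [bs_t, 1, 1])         # V(s) instead of a bias
            q_tot = tf.matmul(hidden, w_final) + v            # (bs_t, 1, 1)
            return tf.reshape(q_tot, [bs_t, 1])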
Example #7
    def create_model(self, model_info):
        """Create the actor (explore_agent) network, the V-trace loss and the optimizer."""
        self.ph_state = tf.placeholder(self.input_dtype,
                                       shape=(None, *self.state_dim),
                                       name="state_input")

        with tf.variable_scope("explore_agent"):
            state_input = Lambda(self._transform)(self.ph_state)
            last_layer = state_input

            for (out_size, kernel, stride) in self.filter_arch[:-1]:
                last_layer = Conv2D(
                    out_size,
                    (kernel, kernel),
                    strides=(stride, stride),
                    activation="relu",
                    padding="same",
                )(last_layer)

            # last convolution
            (out_size, kernel, stride) = self.filter_arch[-1]
            convolution_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="valid",
            )(last_layer)

            self.pi_logic_outs = tf.squeeze(
                Conv2D(self.action_dim, (1, 1),
                       padding="same")(convolution_layer),
                axis=[1, 2],
            )

            baseline_flat = Flatten()(convolution_layer)
            self.baseline = tf.squeeze(
                tf.layers.dense(
                    inputs=baseline_flat,
                    units=1,
                    activation=None,
                    kernel_initializer=custom_norm_initializer(0.01),
                ),
                1,
            )
            self.out_actions = tf.squeeze(
                tf.multinomial(self.pi_logic_outs,
                               num_samples=1,
                               output_dtype=tf.int32),
                1,
                name="out_action",
            )

        # create learner
        self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                               shape=(None, self.action_dim),
                                               name="ph_b_logits")

        self.ph_actions = tf.placeholder(tf.int32,
                                         shape=(None, ),
                                         name="ph_action")
        self.ph_dones = tf.placeholder(tf.bool,
                                       shape=(None, ),
                                       name="ph_dones")
        self.ph_rewards = tf.placeholder(self.dtype,
                                         shape=(None, ),
                                         name="ph_rewards")

        # Split the tensor into batches at known episode cut boundaries.
        # [batch_count * batch_step] -> [batch_step, batch_count]
        batch_step = self.sample_batch_steps

        def split_batches(tensor, drop_last=False):
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step],
                           tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap B and T axes
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res

        self.loss = vtrace_loss(
            bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
            tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
            actions=split_batches(self.ph_actions, drop_last=True),
            discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) *
                                    GAMMA,
                                    drop_last=True),
            rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                                  drop_last=True),
            values=split_batches(self.baseline, drop_last=True),
            bootstrap_value=split_batches(self.baseline)[-1],
        )

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
        if self.opt_type == "adam":
            if self.lr_schedule:
                learning_rate = self._get_lr(global_step)
            else:
                learning_rate = LR
            optimizer = AdamOptimizer(learning_rate)
        elif self.opt_type == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(LR,
                                                  decay=0.99,
                                                  epsilon=0.1,
                                                  centered=True)
        else:
            raise KeyError("invalid opt_type: {}".format(self.opt_type))

        grads_and_vars = optimizer.compute_gradients(self.loss)

        # global norm
        grads, var = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
        clipped_gvs = list(zip(grads, var))

        self.train_op = optimizer.apply_gradients(clipped_gvs,
                                                  global_step=global_step)

        # fixme: expose the learning rate so it can be logged during training
        self.lr = optimizer._lr

        self.actor_var = TFVariables(self.out_actions, self.sess)

        self.sess.run(global_variables_initializer())

        self.explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

        self.saver = Saver({t.name: t
                            for t in self.explore_paras},
                           max_to_keep=self.max_to_keep)

        return True
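The split_batches helper above reorders batch-major samples of shape [batch_count * batch_step, ...] into the time-major [batch_step, batch_count, ...] layout that the V-trace loss expects. A small NumPy check of the intended transform (purely illustrative, not part of the model):

# Illustrative NumPy equivalent of split_batches for a 1-D tensor.
import numpy as np

batch_step, batch_count = 4, 2              # 2 trajectories of 4 steps each
flat = np.arange(batch_count * batch_step)  # [e0_t0, e0_t1, ..., e1_t3]
time_major = flat.reshape(batch_count, batch_step).swapaxes(0, 1)
print(time_major.shape)                     # (4, 2): rows = time, cols = trajectories
print(time_major[:, 0])                     # first trajectory: [0 1 2 3]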