Exemplo n.º 1
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the Q-function training graph for one agent.

    A Q network and a separately-scoped target Q network are created on the
    same input. The Q network is regressed (mean squared error) onto a fed
    "target" placeholder.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent
            (used as-is; it is list-concatenated with the action
            placeholders).
        act_space_n: per-agent action spaces; ``len(space)`` sizes each
            ``SoftCategoricalPdType``.
        q_index: index of the agent whose observation/action are used when
            ``local_q_func`` is true.
        q_func: network builder called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: passed through to ``U.minimize_and_clip``.
        grad_norm_clipping: passed through to ``U.minimize_and_clip``.
        local_q_func: if true, the Q input is only agent ``q_index``'s
            observation and action instead of everybody's.
        scope: enclosing variable scope name.
        reuse: ``reuse`` flag for the enclosing variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``train`` is the
        ``U.function`` that outputs the loss and runs the optimisation op,
        ``update_target_q`` is whatever ``make_update_exp`` returns for the
        (Q vars, target Q vars) pair, and ``debug`` maps ``'q_values'`` and
        ``'target_q_values'`` to ``U.function`` callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions: one per agent, sized by its action space
        pdtype_n = [SoftCategoricalPdType(len(space)) for space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(agent))
            for agent, pdtype in enumerate(pdtype_n)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # The joint input is always built first; the local variant then
        # replaces it.
        critic_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            own_pair = [obs_ph_n[q_index], act_ph_n[q_index]]
            critic_input = tf.concat(own_pair, 1)

        q_out = q_func(critic_input, 1, scope="q_func", num_units=num_units)
        q = q_out[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        td_error = q - target_ph
        q_loss = tf.reduce_mean(tf.square(td_error))

        # viscosity solution to Bellman differential equation in place of an
        # initial condition (regulariser is built but currently not added to
        # the loss)
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network: same input, separate variables
        target_out = q_func(critic_input,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q = target_out[:, 0]
        target_scope = U.absolute_scope_name("target_q_func")
        target_q_func_vars = U.scope_vars(target_scope)
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
        return train, update_target_q, debug
Exemplo n.º 2
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy training graph for one agent.

    The policy of agent ``p_index`` maps its own observation to the flat
    parameters of its action distribution. Its loss is the negated mean Q
    value (from the already-created ``"q_func"`` scope, opened with
    ``reuse=True``) of a sampled action, plus ``1e-3`` times the mean squared
    distribution parameters.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: per-agent action spaces; ``len(space)`` sizes each
            ``SoftCategoricalPdType``.
        p_index: index of the agent whose policy is built.
        p_func: policy network builder, called as
            ``p_func(obs, n_out, scope=..., num_units=...)``.
        q_func: Q network builder, called with ``reuse=True``.
        optimizer: passed through to ``U.minimize_and_clip``.
        grad_norm_clipping: passed through to ``U.minimize_and_clip``.
        local_q_func: if true, the Q input is only this agent's observation
            and sampled action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: enclosing variable scope name.
        reuse: ``reuse`` flag for the enclosing variable scope.

    Returns:
        ``(act, train, update_target_p, debug)``: ``act`` samples an action
        from this agent's observation, ``train`` outputs the loss and runs
        the optimisation op, ``update_target_p`` is the result of
        ``make_update_exp`` for (policy vars, target policy vars), and
        ``debug`` maps ``'p_values'`` and ``'target_act'`` to ``U.function``
        callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions: one per agent, sized by its action space
        pdtype_n = [SoftCategoricalPdType(len(space)) for space in act_space_n]
        own_pdtype = pdtype_n[p_index]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(agent))
            for agent, pdtype in enumerate(pdtype_n)
        ]

        own_obs_ph = obs_ph_n[p_index]

        p = p_func(own_obs_ph,
                   int(own_pdtype.param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = own_pdtype.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the fed actions, then substitute this agent's slot with a
        # fresh sample from its policy (a second sample op, distinct from
        # `act_sample`).
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()  # act_pd.mode() #

        critic_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            own_pair = [obs_ph_n[p_index], act_input_n[p_index]]
            critic_input = tf.concat(own_pair, 1)
        q_out = q_func(critic_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q = q_out[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network: same observation, separate variables
        target_p = p_func(own_obs_ph,
                          int(own_pdtype.param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_scope = U.absolute_scope_name("target_p_func")
        target_p_func_vars = U.scope_vars(target_scope)
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_pd = own_pdtype.pdfromflat(target_p)
        target_act_sample = target_pd.sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        debug = {
            'p_values': p_values,
            'target_act': target_act
        }
        return act, train, update_target_p, debug
Exemplo n.º 3
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build one training graph holding a Q network for every agent.

    Every agent gets its own ``q_func_<i>`` and ``target_q_func_<i>`` scope;
    all of them read the same concatenation of every observation and action
    placeholder. The loss is the sum over agents of the mean squared
    difference between that agent's Q output and its ``target_<i>``
    placeholder, and a single optimisation op updates all Q variables.

    Args:
        make_obs_ph_n: list of observation placeholders; its length is the
            number of agents.
        act_space_n: per-agent action spaces, passed to ``make_pdtype``.
        q_func: network builder called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: passed through to ``U.minimize_and_clip``.
        grad_norm_clipping: passed through to ``U.minimize_and_clip``.
        local_q_func: accepted but not read by this function.
        scope: enclosing variable scope name.
        reuse: ``reuse`` flag for the enclosing variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)``. ``train`` takes the observation,
        action and target placeholders followed by two boolean flags and
        outputs the summed loss; ``debug`` maps ``'q_values'`` and
        ``'target_q_values'`` to ``U.function`` callables returning the
        per-agent lists.
    """
    with tf.variable_scope(scope, reuse=reuse):
        num_agents = len(make_obs_ph_n)

        # create distributions
        pdtype_n = [make_pdtype(space) for space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action_{}".format(agent))
            for agent, pdtype in enumerate(pdtype_n)
        ]
        target_ph_n = []
        for agent in range(num_agents):
            target_ph_n.append(
                tf.placeholder(tf.float32, [None],
                               name="target_{}".format(agent)))
        # Boolean flags are part of every callable's inputs, although no op
        # built in this function consumes them.
        is_norm_training = tf.placeholder(tf.bool)
        is_inference = tf.placeholder(tf.bool)

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)

        q_n = []
        for agent in range(num_agents):
            critic_out = q_func(q_input,
                                1,
                                scope="q_func_{}".format(agent),
                                num_units=num_units)
            q_n.append(critic_out[:, 0])

        q_func_vars = []
        for agent in range(num_agents):
            critic_scope = U.absolute_scope_name("q_func_{}".format(agent))
            q_func_vars.append(U.scope_vars(critic_scope))

        q_loss_n = []
        for q_agent, target_agent in zip(q_n, target_ph_n):
            q_loss_n.append(tf.reduce_mean(tf.square(q_agent - target_agent)))

        # viscosity solution to Bellman differential equation in place of an
        # initial condition (regulariser currently disabled)
        # q_reg = tf.reduce_mean(tf.square(q))
        q_loss = tf.reduce_sum(q_loss_n)
        loss = q_loss  # + 1e-3 * q_reg

        # Flatten the per-agent variable lists into one list.
        var_list = list(itertools.chain.from_iterable(q_func_vars))
        optimize_expr = U.minimize_and_clip(optimizer, loss, var_list,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + target_ph_n +
                           [is_norm_training, is_inference],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(
            obs_ph_n + act_ph_n + [is_norm_training, is_inference], q_n)

        # target network
        target_q_n = []
        for agent in range(num_agents):
            target_out = q_func(q_input,
                                1,
                                scope="target_q_func_{}".format(agent),
                                num_units=num_units)
            target_q_n.append(target_out[:, 0])

        target_q_func_vars = []
        for agent in range(num_agents):
            target_scope = U.absolute_scope_name(
                "target_q_func_{}".format(agent))
            target_q_func_vars.append(U.scope_vars(target_scope))

        target_var_list = list(
            itertools.chain.from_iterable(target_q_func_vars))
        update_target_q = make_update_exp(var_list, target_var_list)

        target_q_values = U.function(
            obs_ph_n + act_ph_n + [is_norm_training, is_inference], target_q_n)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
        return train, update_target_q, debug
Exemplo n.º 4
0
    def p_train_function(self,
                         make_obs_ph_n,
                         act_space_n,
                         before_com_func,
                         channel,
                         after_com_func,
                         q_func,
                         optimizer,
                         grad_norm_clipping=None,
                         local_q_func=False,
                         num_units=64,
                         scope="trainer",
                         reuse=None,
                         beta=0.05,
                         ibmac_com=True):
        """Build the policy training graph for all agents, with a channel.

        Per agent the graph is ``before_com_<i>`` (observation -> hidden),
        optionally plus a message produced by the shared ``channel`` network,
        followed by ``p_func_<i>`` (hidden -> action distribution
        parameters). The channel reads the concatenated, gradient-stopped
        hiddens of all agents. A parallel set of ``target_*`` scopes is built
        on the same observations.

        The loss is the sum over agents of ``-mean(Q_i)`` (``q_func_<i>``
        scopes opened with ``reuse=True``) plus ``1e-3`` times the summed
        parameter regulariser, plus ``beta`` times the mean message KL term
        when communication is enabled.

        Args:
            make_obs_ph_n: list of observation placeholders; its length is
                the number of agents.
            act_space_n: per-agent action spaces, passed to ``make_pdtype``.
            before_com_func: network builder for the pre-communication
                encoder.
            channel: network builder whose result unpacks into three tensors
                (message, mean, log-scale), each split per agent on axis 1.
            after_com_func: network builder for the post-communication
                policy head.
            q_func: Q network builder, called with ``reuse=True``.
            optimizer: passed through to ``U.minimize_and_clip``.
            grad_norm_clipping: passed through to ``U.minimize_and_clip``.
            local_q_func: accepted but not read by this method.
            num_units: hidden width forwarded to every network builder.
            scope: enclosing variable scope name.
            reuse: ``reuse`` flag for the enclosing variable scope.
            beta: weight of the message KL term in the loss.
            ibmac_com: when false, messages are not added to the hiddens,
                the KL term is dropped from the loss, and channel variables
                are neither optimised nor included in the target update.

        Returns:
            ``(act, train, update_target_p, debug)``. ``debug`` maps names
            (``'p_values'``, ``'target_act'``, ``'kl_loss'``,
            ``'check_values'``, ``'channel_com'``, ``'check_mu'``,
            ``'check_log'``, ``'check_message_n'``, ``'check_hiddens_n'``,
            ``'check_entropy'``) to callables. With ``ibmac_com=False`` the
            four communication probes are stubs that return ``0`` for any
            arguments.
        """
        with tf.variable_scope(scope, reuse=reuse):
            clip_threshold = 1  # 1, 5, 10
            is_norm_training = tf.placeholder(tf.bool)
            is_inference = tf.placeholder(tf.bool)

            num_agents = len(make_obs_ph_n)

            # create distributions
            act_pdtype_n = [
                make_pdtype(act_space) for act_space in act_space_n
            ]

            # set up placeholders
            obs_ph_n = make_obs_ph_n
            act_ph_n = [
                act_pdtype_n[i].sample_placeholder([None],
                                                   name="action" + str(i))
                for i in range(num_agents)
            ]

            def obs_function(outputs):
                # Callable over the observations plus the two boolean flags;
                # a fresh input list is built for every callable.
                return U.function(inputs=obs_ph_n +
                                  [is_norm_training, is_inference],
                                  outputs=outputs)

            hiddens_n = [
                before_com_func(obs_ph_n[i],
                                num_units,
                                scope="before_com_{}".format(i),
                                num_units=num_units) for i in range(num_agents)
            ]
            before_com_vars_n = [
                U.scope_vars(U.absolute_scope_name("before_com_{}".format(i)))
                for i in range(num_agents)
            ]

            # Second pass through the same encoders (reuse=True); gradients
            # are stopped so the channel loss does not reach the encoders
            # through this path.
            hiddens_n_for_message = tf.concat([
                before_com_func(obs_ph_n[i],
                                num_units,
                                scope="before_com_{}".format(i),
                                reuse=True,
                                num_units=num_units) for i in range(num_agents)
            ],
                                              axis=1)
            hiddens_n_for_message = tf.stop_gradient(hiddens_n_for_message)
            channel_output = channel(hiddens_n_for_message,
                                     num_units * num_agents,
                                     scope="channel",
                                     num_units=num_units * num_agents)
            message_n, mu_message_n, logvar_message_n = [
                tf.split(item, num_or_size_splits=num_agents, axis=1)
                for item in channel_output
            ]
            logvar_message_n = [
                tf.clip_by_value(log, -10, 10) for log in logvar_message_n
            ]  # constrain kl_loss not to be too large

            message_n = [
                clip_message(message, clip_threshold, is_norm_training,
                             is_inference) for message in message_n
            ]

            channel_vars_n = [U.scope_vars(U.absolute_scope_name("channel"))]

            if ibmac_com:
                # Hidden state plus received message; also exposed through
                # the 'check_values' probe.
                check_n = [
                    hiddens_n[i] + message_n[i] for i in range(num_agents)
                ]
                policy_input_n = check_n
            else:
                print('no_com')
                check_n = None
                policy_input_n = hiddens_n
            p_n = [
                after_com_func(policy_input_n[i],
                               int(act_pdtype_n[i].param_shape()[0]),
                               scope="p_func_{}".format(i),
                               num_units=num_units)
                for i in range(num_agents)
            ]
            p_func_vars = [
                U.scope_vars(U.absolute_scope_name("p_func_{}".format(i)))
                for i in range(num_agents)
            ]

            # wrap parameters in distribution
            act_pd_n = [
                act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)
            ]

            act_sample_n = [act_pd.sample() for act_pd in act_pd_n]
            p_reg_n = [
                tf.reduce_mean(tf.square(act_pd.flatparam()))
                for act_pd in act_pd_n
            ]

            # For agent i: everybody's fed action, except slot i which is a
            # fresh sample from agent i's own policy.
            act_input_n_n = [list(act_ph_n) for _ in range(num_agents)]
            for i in range(num_agents):
                act_input_n_n[i][i] = act_pd_n[i].sample()
            q_input_n = [
                tf.concat(obs_ph_n + act_input_n, 1)
                for act_input_n in act_input_n_n
            ]

            q_n = [
                q_func(q_input_n[i],
                       1,
                       scope="q_func_{}".format(i),
                       reuse=True,
                       num_units=num_units)[:, 0] for i in range(num_agents)
            ]
            pg_loss_n = [-tf.reduce_mean(q) for q in q_n]

            # KL( N(mu, s^2) || N(0, alpha^2) ) with s = exp(log):
            #   (mu^2 + s^2) / (2 * alpha^2) - log(s) + log(alpha) - 1/2
            # self.alpha is the prior bandwidth. 0.5 (not 1 / 2) keeps the
            # factor a float even under integer division, and log(alpha)
            # enters with a plus sign as in the closed form.
            inv_two_alpha_sq = 0.5 / tf.pow(self.alpha, 2)
            log_alpha = np.log(self.alpha)
            kl_loss_message_n = [
                inv_two_alpha_sq *
                (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + log_alpha -
                0.5 for mu, log in zip(mu_message_n, logvar_message_n)
            ]

            # NOTE(review): the differential entropy of a Gaussian would be
            # log(s) + 1.4189; here exp(log) is used instead - confirm intent.
            entropy = [tf.exp(log) + 1.4189 for log in logvar_message_n]

            pg_loss = tf.reduce_sum(pg_loss_n)
            p_reg = tf.reduce_sum(p_reg_n)
            kl_loss_message = tf.reduce_mean(kl_loss_message_n)

            loss = pg_loss + p_reg * 1e-3
            if ibmac_com:
                loss = loss + beta * kl_loss_message

            kl_loss = U.function(inputs=obs_ph_n + act_ph_n +
                                 [is_norm_training, is_inference],
                                 outputs=kl_loss_message)

            # Trainable variables, as per-scope lists first; the channel is
            # only trained when communication is enabled.
            var_groups = []
            var_groups.extend(before_com_vars_n)
            if ibmac_com:
                var_groups.extend(channel_vars_n)
            var_groups.extend(p_func_vars)
            var_list = list(itertools.chain.from_iterable(var_groups))
            optimize_expr = U.minimize_and_clip(optimizer, loss, var_list,
                                                grad_norm_clipping)

            # Create callable functions
            train = U.function(inputs=obs_ph_n + act_ph_n +
                               [is_norm_training, is_inference],
                               outputs=loss,
                               updates=[optimize_expr])
            act = obs_function(act_sample_n)
            p_values = obs_function(p_n)
            if ibmac_com:
                check_values = obs_function(check_n)
                channel_com = obs_function(channel_output)
                check_mu = obs_function(mu_message_n)
                check_log = obs_function(logvar_message_n)
            else:

                def _disabled_probe(*args, **kwargs):
                    # Accepts the same call shape as the real probes
                    # (all observations plus the two flags).
                    return 0

                check_values = _disabled_probe
                channel_com = _disabled_probe
                check_mu = _disabled_probe
                check_log = _disabled_probe

            # target network
            target_hiddens_n = [
                before_com_func(obs_ph_n[i],
                                num_units,
                                scope="target_before_com_{}".format(i),
                                num_units=num_units) for i in range(num_agents)
            ]
            target_before_com_vars = [
                U.scope_vars(
                    U.absolute_scope_name("target_before_com_{}".format(i)))
                for i in range(num_agents)
            ]

            target_hiddens_n_for_message = tf.concat([
                before_com_func(obs_ph_n[i],
                                num_units,
                                scope="target_before_com_{}".format(i),
                                reuse=True,
                                num_units=num_units) for i in range(num_agents)
            ],
                                                     axis=1)
            target_hiddens_n_for_message = tf.stop_gradient(
                target_hiddens_n_for_message)
            target_channel_output = channel(target_hiddens_n_for_message,
                                            num_units * num_agents,
                                            scope="target_channel",
                                            num_units=num_units * num_agents)
            # Only the message part is used below; the mean and log-scale
            # parts of the target channel are discarded.
            target_message_n, _, _ = [
                tf.split(item, num_or_size_splits=num_agents, axis=1)
                for item in target_channel_output
            ]
            target_channel_vars = [
                U.scope_vars(U.absolute_scope_name("target_channel"))
            ]
            # NOTE(review): unlike message_n, target messages are not passed
            # through clip_message - confirm this asymmetry is intended.
            if ibmac_com:
                target_policy_input_n = [
                    target_hiddens_n[i] + target_message_n[i]
                    for i in range(num_agents)
                ]
            else:
                target_policy_input_n = target_hiddens_n
            target_p_n = [
                after_com_func(target_policy_input_n[i],
                               int(act_pdtype_n[i].param_shape()[0]),
                               scope="target_p_func_{}".format(i),
                               num_units=num_units)
                for i in range(num_agents)
            ]
            target_p_func_vars = [
                U.scope_vars(
                    U.absolute_scope_name("target_p_func_{}".format(i)))
                for i in range(num_agents)
            ]

            # Same grouping and order as var_list so the two lists pair up
            # for make_update_exp.
            target_var_groups = []
            target_var_groups.extend(target_before_com_vars)
            if ibmac_com:
                target_var_groups.extend(target_channel_vars)
            target_var_groups.extend(target_p_func_vars)
            target_var_list = list(
                itertools.chain.from_iterable(target_var_groups))
            update_target_p = make_update_exp(var_list, target_var_list)

            target_act_sample_n = [
                act_pdtype_n[i].pdfromflat(target_p_n[i]).sample()
                for i in range(num_agents)
            ]
            target_act = obs_function(target_act_sample_n)

            check_message_n = obs_function(message_n)
            check_hiddens_n = obs_function(hiddens_n)
            check_entropy = obs_function(entropy)

            return act, train, update_target_p, {
                'p_values': p_values,
                'target_act': target_act,
                'kl_loss': kl_loss,
                'check_values': check_values,
                'channel_com': channel_com,
                'check_mu': check_mu,
                'check_log': check_log,
                'check_message_n': check_message_n,
                'check_hiddens_n': check_hiddens_n,
                'check_entropy': check_entropy
            }