Exemplo n.º 1
0
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    """Build the critic (Q) training graph together with its target network.

    Everything is created inside ``tf.variable_scope(scope, reuse=reuse)``.

    Args:
        make_obs_ph_n: List of observation placeholders, one per agent.
            Despite the ``make_`` prefix it is used directly as that list.
        act_space_n: Per-agent action spaces; each one is handed to
            ``make_pdtype`` to obtain the matching action placeholder.
        q_index: Index of the agent whose observation and action feed the
            critic when ``local_q_func`` is true.
        q_func: Network builder, called as
            ``q_func(inputs, 1, scope=..., num_units=...)``; column 0 of its
            output is used as the Q value.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic only sees agent ``q_index``'s
            observation and action; otherwise the observations and actions
            of all agents are concatenated.
        scope: Name of the variable scope to build in.
        reuse: Reuse flag for that variable scope.
        num_units: Forwarded to ``q_func``.

    Returns:
        A tuple ``(train, update_target_q, debug)``: ``train`` is a
        ``U.function`` fed with observations, actions and the target that
        outputs the loss with the optimizer step attached as an update;
        ``update_target_q`` is whatever ``make_update_exp`` returns for the
        online/target variable lists; ``debug`` maps ``'q_values'`` and
        ``'target_q_values'`` to ``U.function`` evaluators.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders: observations are supplied by the caller, actions and
        # the regression target are created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(agent_id))
            for agent_id, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Critic input: a single agent's (obs, action) pair, or everyone's.
        if local_q_func:
            q_parts = [obs_ph_n[q_index], act_ph_n[q_index]]
        else:
            q_parts = obs_ph_n + act_ph_n
        q_input = tf.concat(q_parts, 1)

        q_out = q_func(q_input, 1, scope="q_func", num_units=num_units)
        q = q_out[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error between the predicted Q and the fed-in target.
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Squared-Q regulariser ("viscosity solution to the Bellman
        # differential equation in place of an initial condition"). The op is
        # still created, but it is currently not added to the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers around the online critic.
        train = U.function(
            inputs=obs_ph_n + act_ph_n + [target_ph],
            outputs=loss,
            updates=[optimize_expr],
        )
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target critic: same input tensor, separate "target_q_func" scope.
        target_out = q_func(
            q_input, 1, scope="target_q_func", num_units=num_units)
        target_q = target_out[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values,
        }
        return train, update_target_q, debug
Exemplo n.º 2
0
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    """Build the actor (policy) training graph together with its target policy.

    Everything is created inside ``tf.variable_scope(scope, reuse=reuse)``.
    The critic is not created here: ``q_func`` is called with ``reuse=True``
    under the scope name ``"q_func"``.

    Args:
        make_obs_ph_n: List of observation placeholders, one per agent.
            Despite the ``make_`` prefix it is used directly as that list.
        act_space_n: Per-agent action spaces; each one is handed to
            ``make_pdtype``.
        p_index: Index of the agent whose policy is being built.
        p_func: Policy network builder, called as
            ``p_func(obs, num_outputs, scope=..., num_units=...)``.
        q_func: Critic network builder (see above).
        optimizer: Forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic input is only this agent's
            observation and sampled action.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Name of the variable scope to build in.
        reuse: Reuse flag for that variable scope.

    Returns:
        A tuple ``(act, train, update_target_p, debug)``: ``act`` maps this
        agent's observation to a sampled action; ``train`` outputs the loss
        with the optimizer step attached as an update; ``update_target_p`` is
        whatever ``make_update_exp`` returns for the online/target variable
        lists; ``debug`` maps ``'p_values'`` and ``'target_act'`` to
        ``U.function`` evaluators.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent; `own_pdtype` is this agent's.
        pdtypes = [make_pdtype(space) for space in act_space_n]
        own_pdtype = pdtypes[p_index]

        # Placeholders: observations come from the caller, actions are
        # created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(agent_id))
            for agent_id, pdtype in enumerate(pdtypes)
        ]

        # The policy only sees its own agent's observation.
        p_input = obs_ph_n[p_index]

        p = p_func(
            p_input,
            int(own_pdtype.param_shape()[0]),
            scope="p_func",
            num_units=num_units,
        )
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Interpret the flat network output as distribution parameters.
        act_pd = own_pdtype.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the action placeholders and substitute this agent's slot with
        # a fresh sample from its policy, so the critic value depends on the
        # policy output.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat(
                [obs_ph_n[p_index], act_input_n[p_index]], 1)
        q_out = q_func(
            q_input, 1, scope="q_func", reuse=True, num_units=num_units)
        q = q_out[:, 0]

        # Maximise Q (minimise -Q) plus a small penalty on the parameters.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers around the online policy.
        train = U.function(
            inputs=obs_ph_n + act_ph_n,
            outputs=loss,
            updates=[optimize_expr],
        )
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # Target policy: same input, separate "target_p_func" scope.
        target_p = p_func(
            p_input,
            int(own_pdtype.param_shape()[0]),
            scope="target_p_func",
            num_units=num_units,
        )
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(
            inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Exemplo n.º 3
0
Arquivo: ops.py Projeto: mxxhcm/code
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            lstm_model,
            optimizer,
            args,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            use_lstm=True,
            session=None,
            lstm_scope=None):
    """Build the critic (Q) training graph with an optional LSTM front end.

    Args:
        make_obs_ph_n: List of observation placeholders, one per agent
            (used directly as that list).
        act_space_n: Per-agent action spaces; each one is handed to
            ``make_pdtype``.
        q_index: Index of the agent used when ``local_q_func`` is true.
        q_func: Critic network builder, called as
            ``q_func(inputs, 1, scope=..., num_units=...)``; column 0 of its
            output is used as the Q value.
        lstm_model: Callable applied to ``obs_ph_n`` as
            ``lstm_model(obs_ph_n, scope="lstm", reuse=...)``; its result is
            indexed and concatenated like a per-agent list.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Object providing ``shared_lstm``; only read when ``use_lstm``
            is true.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic only sees agent ``q_index``'s
            processed observation and action.
        scope: Variable scope for the critic (and for a non-shared LSTM).
        reuse: Reuse flag for ``scope``.
        num_units: Forwarded to ``q_func``.
        use_lstm: If false, each observation is ``tf.squeeze``-d on axis 1
            instead of going through ``lstm_model``.
        session: Forwarded to ``U.function`` and ``make_update_exp``.
        lstm_scope: Variable scope holding the shared LSTM. It is entered
            whenever ``use_lstm`` and ``args.shared_lstm`` are both true, so
            it must be provided in that case.

    Returns:
        A tuple ``(train, update_target_q, debug)``: ``train`` outputs the
        loss with the optimizer step attached as an update;
        ``update_target_q`` is whatever ``make_update_exp`` returns;
        ``debug`` maps ``'q_values'`` and ``'target_q_values'`` to
        ``U.function`` evaluators.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types and the action placeholders.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Observation placeholders are supplied by the caller.
        obs_ph_n = make_obs_ph_n
        # Regression target: computed by the caller at run time and fed in;
        # it only enters the loss.
        target_ph = tf.placeholder(tf.float32, [None], name="target")

    # Observation preprocessing (dimension reduction).
    #
    # The LSTM variables are collected while still inside the scope that
    # built (or reused) the LSTM. U.absolute_scope_name is presumably
    # resolved against the current variable scope (TODO confirm), so looking
    # "lstm" up later under `scope` would miss a shared LSTM that lives under
    # `lstm_scope` and silently drop it from the optimizer's variable list.
    lstm_func_vars = []
    if use_lstm:
        if args.shared_lstm:
            with tf.variable_scope(lstm_scope):
                observation_n = lstm_model(obs_ph_n, scope="lstm", reuse=True)
                lstm_func_vars = U.scope_vars(U.absolute_scope_name("lstm"))
        else:
            with tf.variable_scope(scope, reuse=reuse):
                observation_n = lstm_model(obs_ph_n, scope="lstm", reuse=reuse)
                lstm_func_vars = U.scope_vars(U.absolute_scope_name("lstm"))
    else:
        # No LSTM: just drop axis 1 of every observation placeholder.
        observation_n = [tf.squeeze(o, 1) for o in obs_ph_n]

    with tf.variable_scope(scope, reuse=reuse):
        # =================== online critic ===================
        if local_q_func:
            q_input = tf.concat([observation_n[q_index], act_ph_n[q_index]], 1)
        else:
            # Processed observations and actions of all agents.
            q_input = tf.concat(observation_n + act_ph_n, 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error against the fed-in target. A squared-Q penalty
        # (1e-3 * mean(q ** 2)) used to be sketched here but was never part
        # of the loss.
        loss = tf.reduce_mean(tf.square(q - target_ph))

        # lstm_func_vars is empty when the LSTM is disabled, so one call
        # covers both configurations.
        optimize_expr = U.minimize_and_clip(optimizer, loss,
                                            q_func_vars + lstm_func_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the online critic.
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr],
                           session=session)
        q_values = U.function(obs_ph_n + act_ph_n, q, session=session)

        # =================== target critic ===================
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # Callable wrappers around the target critic.
        update_target_q = make_update_exp(q_func_vars,
                                          target_q_func_vars,
                                          session=session)
        target_q_values = U.function(obs_ph_n + act_ph_n,
                                     target_q,
                                     session=session)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
Exemplo n.º 4
0
Arquivo: ops.py Projeto: mxxhcm/code
def p_train(make_obs_ph_n,
            act_space_n,
            p_scope,
            p_index,
            p_func,
            q_func,
            lstm_model,
            optimizer,
            args,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            use_lstm=True,
            session=None):
    """Build the actor (policy) training graph with an optional LSTM front end.

    The policy is looked up rather than created: ``p_func`` is called with
    ``reuse=True`` under ``p_scope``, and the target-policy variables are
    only collected from ``p_scope`` (no target network is built here). The
    critic is likewise reused from ``scope`` via ``reuse=True``.

    Args:
        make_obs_ph_n: List of observation placeholders, one per agent
            (used directly as that list).
        act_space_n: Per-agent action spaces; each one is handed to
            ``make_pdtype``.
        p_scope: Variable scope that holds the policy (``"p_func"`` and
            ``"target_p_func"``) and, with ``args.shared_lstm``, the LSTM.
        p_index: Index of the agent whose policy is trained.
        p_func: Policy network builder.
        q_func: Critic network builder.
        lstm_model: Callable applied to ``obs_ph_n`` as
            ``lstm_model(obs_ph_n, reuse=..., scope="lstm")``; its result is
            indexed and concatenated like a per-agent list.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Object providing ``shared_lstm``; only read when ``use_lstm``
            is true.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic input is only this agent's
            processed observation and sampled action.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Variable scope of the critic (and of a non-shared LSTM).
        reuse: Reuse flag used for ``scope`` and ``p_scope``.
        use_lstm: If false, each observation is ``tf.squeeze``-d on axis 1
            instead of going through ``lstm_model``.
        session: Forwarded to ``U.function`` and ``make_update_exp``.

    Returns:
        A tuple ``(train, update_target_p)``: ``train`` outputs the loss with
        the optimizer step attached as an update; ``update_target_p`` is
        whatever ``make_update_exp`` returns for the online/target policy
        variable lists.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types (used for sampling) and the action
        # placeholders.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Observation placeholders are supplied by the caller.
        obs_ph_n = make_obs_ph_n

    # Observation preprocessing. The LSTM variables are collected inside the
    # scope in which the LSTM was built or reused.
    lstm_vars = []
    if use_lstm:
        if args.shared_lstm:
            with tf.variable_scope(p_scope):
                observation_n = lstm_model(obs_ph_n, reuse=True, scope="lstm")
                lstm_vars = U.scope_vars(U.absolute_scope_name("lstm"))
        else:
            with tf.variable_scope(scope):
                observation_n = lstm_model(obs_ph_n, reuse=reuse, scope="lstm")
                lstm_vars = U.scope_vars(U.absolute_scope_name("lstm"))
    else:
        with tf.variable_scope(scope, reuse=reuse):
            # No LSTM: just drop axis 1 of every observation placeholder.
            observation_n = [tf.squeeze(o, 1) for o in obs_ph_n]

    # This agent's processed local observation.
    p_input = observation_n[p_index]

    # The policy is shared between actors, whereas every critic is separate.
    with tf.variable_scope(p_scope, reuse=reuse):
        # Flat distribution parameters later used to produce the action.
        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   reuse=True,
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        # Wrap the flat parameters in a distribution object.
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))
        # Copy the action placeholders and replace this agent's slot with a
        # sample from its policy so the critic value depends on the policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Target-policy variables are only looked up here.
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars,
                                          target_p_func_vars,
                                          session=session)

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        if local_q_func:
            # Use the *processed* observation, exactly like every other
            # consumer in this function. Feeding the raw placeholder
            # (obs_ph_n) would hand the reused critic an input that differs
            # from the one built from observation_n.
            q_input = tf.concat(
                [observation_n[p_index], act_input_n[p_index]], 1)
        else:
            # Processed observations and actions of all agents.
            q_input = tf.concat(observation_n + act_input_n, 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # Maximise Q (minimise -Q) plus a small penalty on the distribution
        # parameters. The loss differs per critic.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        # lstm_vars is empty when the LSTM is disabled, so one call covers
        # both configurations.
        optimize_expr = U.minimize_and_clip(optimizer, loss,
                                            p_func_vars + lstm_vars,
                                            grad_norm_clipping)

        # Callable used to feed data in; its inputs must be a list.
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr],
                           session=session)
        return train, update_target_p
Exemplo n.º 5
0
Arquivo: ops.py Projeto: mxxhcm/code
def q_train(make_common_obs_ph, make_sep_obs_ph_n, act_space_n, optimizer, args,
            q_index, q_func,
            lstm_model, cnn_model,
            lstm_scope=None, cnn_scope=None,
            use_lstm=True, use_cnn=True,
            reuse=None, session=None, scope="trainer",
            local_q_func=False, num_units=64, grad_norm_clipping=None):
    """Build the critic (Q) training graph with optional CNN and LSTM front ends.

    Args:
        make_common_obs_ph: Placeholder for the observation shared by all
            agents (used directly). With ``use_cnn`` it is sliced along
            axis 1 and every slice goes through ``cnn_model``.
        make_sep_obs_ph_n: List of per-agent observation placeholders
            (used directly).
        act_space_n: Per-agent action spaces; each one is handed to
            ``make_pdtype``.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Not read by this function.
        q_index: Index of the agent used when ``local_q_func`` is true.
        q_func: Critic network builder; column 0 of its output is the Q value.
        lstm_model: Called as
            ``lstm_model(common, sep_obs_ph_n, reuse=..., scope="lstm")``.
        cnn_model: Called as ``cnn_model(slice, scope="cnn")``.
        lstm_scope: Variable scope for the LSTM; defaults to ``scope``.
        cnn_scope: Variable scope for the CNN; defaults to ``scope``.
        use_lstm: Enable the LSTM stage.
        use_cnn: Enable the CNN stage.
        reuse: Reuse flag for ``scope`` (also forwarded to ``lstm_model``).
        session: Forwarded to ``U.function`` and ``make_update_exp``.
        scope: Variable scope of the critic.
        local_q_func: If true, the critic only sees agent ``q_index``'s
            processed observation and action.
        num_units: Forwarded to ``q_func``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.

    Returns:
        A tuple ``(train, update_target_q, debug)``: ``train`` outputs the
        loss with the optimizer step attached as an update;
        ``update_target_q`` is whatever ``make_update_exp`` returns;
        ``debug`` maps ``'q_values'`` and ``'target_q_values'`` to
        ``U.function`` evaluators.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types and the action placeholders.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Regression target: computed by the caller at run time and fed in;
        # it only enters the loss.
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Observation placeholders are supplied by the caller.
        common_obs_ph = make_common_obs_ph
        sep_obs_ph_n = make_sep_obs_ph_n

    # CNN stage. AUTO_REUSE lets several trainers share one CNN when they
    # pass the same cnn_scope.
    cnn_vars = []
    if use_cnn:
        if cnn_scope is None:
            cnn_scope = scope

        length = common_obs_ph.shape[1]
        with tf.variable_scope(cnn_scope, reuse=tf.AUTO_REUSE):
            # Apply the CNN to every slice along axis 1, then stack the
            # results along a new leading axis (tf.stack's default axis 0).
            new_common_obs = tf.stack([
                cnn_model(common_obs_ph[:, i], scope="cnn")
                for i in range(length)
            ])
            cnn_vars = U.scope_vars(U.absolute_scope_name("cnn"))  # CNN parameters
    else:
        new_common_obs = common_obs_ph

    # LSTM stage. AUTO_REUSE lets several trainers share one LSTM when they
    # pass the same lstm_scope.
    lstm_func_vars = []
    if use_lstm:
        if lstm_scope is None:
            lstm_scope = scope
        with tf.variable_scope(lstm_scope, reuse=tf.AUTO_REUSE):
            observation_n = lstm_model(new_common_obs, sep_obs_ph_n,
                                       reuse=reuse, scope="lstm")
            lstm_func_vars = U.scope_vars(U.absolute_scope_name("lstm"))  # LSTM parameters
    else:
        with tf.variable_scope(scope, reuse=reuse):
            # NOTE(review): this adds a tensor to a Python list and squeezes
            # the result, yet the code below indexes/concatenates
            # observation_n like a per-agent list. Left as-is; confirm the
            # intended shapes before relying on the no-LSTM path.
            observation_n = tf.squeeze(new_common_obs + sep_obs_ph_n, 1)

    with tf.variable_scope(scope, reuse=reuse):
        # =================== online critic ===================
        if local_q_func:
            q_input = tf.concat([observation_n[q_index], act_ph_n[q_index]], 1)
        else:
            q_input = tf.concat(observation_n + act_ph_n, 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error against the fed-in target. A squared-Q penalty
        # (1e-3 * mean(q ** 2)) used to be sketched here but was never part
        # of the loss.
        loss = tf.reduce_mean(tf.square(q - target_ph))

        # Train every enabled feature extractor together with the critic.
        # Previously the LSTM/CNN variables were only included when *both*
        # stages were enabled, so enabling just one left it untrained. Each
        # list is empty when its stage is disabled.
        trainable_vars = q_func_vars + lstm_func_vars + cnn_vars
        optimize_expr = U.minimize_and_clip(optimizer, loss, trainable_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the online critic.
        train = U.function(inputs=[common_obs_ph] + sep_obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr],
                           session=session)
        q_values = U.function([common_obs_ph] + sep_obs_ph_n + act_ph_n, q,
                              session=session)

        # =================== target critic ===================
        target_q = q_func(q_input, 1, scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # Callable wrappers around the target critic.
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars,
                                          session=session)
        target_q_values = U.function([common_obs_ph] + sep_obs_ph_n + act_ph_n,
                                     target_q, session=session)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}