def run_policy(policy_type, episodes):
    import gym
    import tensorflow as tf
    from algorithms import advantage_actor_critic as aac
    from algorithms import simple_policy_optimization as spo
    from algorithms import simple_policy_optimization_with_entropy as spowe
    from algorithms import simple_policy_optimization_rnn as spornn
    from algorithms import beta_advantage_actor_critic as baac
    import numpy as np
    ks = tf.keras
    SEED = 420
    tf.set_random_seed(SEED)

    def create_policy_model_beta():
        inp = ks.Input((8, ))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
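        # Beta-distribution parameters per action dimension; softplus keeps both heads strictly positive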
        alphas = ks.layers.Dense(2, activation='softplus')(x)
        betas = ks.layers.Dense(2, activation='softplus')(x)
        model = ks.Model(inputs=inp, outputs=[alphas, betas])
        return model

    def create_policy_model_entropy():
        inp = ks.Input((8, ))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        scales = ks.layers.Dense(2, activation='sigmoid')(x)
        model = ks.Model(inputs=inp, outputs=[means, scales])
        return model

    def create_policy_model_no_entropy():
        inp = ks.Input((8, ))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        model = ks.Model(inputs=inp, outputs=means)
        return model

    def create_value_model():
        inp = ks.Input((8, ))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
        value = ks.layers.Dense(1, activation='linear')(x)
        model = ks.Model(inputs=inp, outputs=value)
        return model

    def make_rnn_model():
        inp = ks.Input((None, 8))
        state_inp = ks.Input((32, ))

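        # GRU over the observation sequence; the extra tensor in the input list is used as the initial hidden state,
        # and the final hidden state is returned alongside the per-step outputs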
        mem_out, new_rnn_state = ks.layers.GRU(
            32, return_sequences=True, return_state=True)([inp, state_inp])
        mem_out = ks.layers.TimeDistributed(
            ks.layers.Dense(32, activation='selu'))(mem_out)
        action_means = ks.layers.TimeDistributed(
            ks.layers.Dense(2, activation='tanh'))(mem_out)
        model = ks.models.Model(inputs=[inp, state_inp],
                                outputs=[action_means, new_rnn_state])
        return model

    if policy_type == "ac":
        policy = aac.AdvantageActorCritic(create_policy_model_entropy(),
                                          create_value_model(),
                                          2,
                                          entropy_factor=0.001,
                                          gamma=0.99,
                                          lr=0.001)
    elif policy_type == "spowe":
        policy = spowe.SimplePolicyOptimizerWithEntropy(
            create_policy_model_entropy(),
            2,
            0.001,
            gamma=0.99,
            entropy_factor=0.01)
    elif policy_type == "spornn":
        initial_rnn_state = np.zeros((1, 32))
        policy = spornn.SimplePolicyOptimizerRNN(make_rnn_model(),
                                                 2,
                                                 initial_rnn_state,
                                                 scale_value=0.6,
                                                 gamma=0.99,
                                                 lr=0.0001)
    elif policy_type == "baac":
        policy = baac.BetaAdvantageActorCritic(create_policy_model_beta(),
                                               create_value_model(),
                                               2,
                                               entropy_factor=0.001,
                                               gamma=0.99,
                                               lr=0.001)
    elif policy_type == "ppo":
        policy = baac.BetaAdvantageActorCritic(create_policy_model_beta(),
                                               create_value_model(),
                                               2,
                                               entropy_factor=0.001,
                                               gamma=0.99,
                                               ppo_eps=0.2,
                                               lr=0.001)
    else:
        policy = spo.SimplePolicyOptimizer(create_policy_model_no_entropy(),
                                           2,
                                           0.001,
                                           gamma=0.99,
                                           scale_value=0.6)

    scores = []

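    # hide all GPUs so the session runs on CPU only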
    config = tf.ConfigProto(device_count={'GPU': 0})

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        env = gym.make("LunarLanderContinuous-v2")
        for episode in range(episodes):
            state = env.reset()
            done = False
            trajectory = []
            score = 0
            while not done:
                actions = policy.get_actions(sess, state)
                new_state, r, done, _ = env.step(actions)
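                # scale the reward down so returns stay in a small numeric range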
                r /= 1000
                score += r

                trajectory.append((state, actions, r))
                state = new_state
            policy.train(sess, trajectory)
            scores.append(score)
    return scores
Example #2
import tensorflow as tf
from algorithms import simple_policy_optimization as spo
from environments import multiplayer_car_env  # assumed module path (cf. environments.car_env in the example below)

ks = tf.keras

SEED = 420  # seed value taken from the sibling examples
tf.set_random_seed(SEED)
multiplayer_car_env.set_random_seed(SEED)

model1 = ks.models.Sequential()
model1.add(ks.layers.Dense(24, activation='tanh', input_shape=(7, )))
model1.add(ks.layers.Dense(12, activation='tanh'))
model1.add(ks.layers.Dense(2, activation='tanh'))

model2 = ks.models.Sequential()
model2.add(ks.layers.Dense(24, activation='tanh', input_shape=(7, )))
model2.add(ks.layers.Dense(12, activation='tanh'))
model2.add(ks.layers.Dense(2, activation='tanh'))

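# one independently trained policy per car in the two-player environment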
policy_1 = spo.SimplePolicyOptimizer(model1,
                                     2,
                                     scale_value=0.003,
                                     gamma=0.9,
                                     lr=0.001)
policy_2 = spo.SimplePolicyOptimizer(model2,
                                     2,
                                     scale_value=0.001,
                                     gamma=0.9,
                                     lr=0.001)
env = multiplayer_car_env.MPCarEnv()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    scale = 1.5
    episode = 0
    while True:
        state_1, state_2 = env.reset()
Example #3
#     aac.AdvantageActorCritic(create_policy_model_entropy(), create_value_model(), 2, lr=0.0001, gamma=0.97,
#                              entropy_factor=0.001, value_loss_scale=0.03, scale_multiplier=2.0, lambd=0.95),
#     soccer_agent.SoccerAgent(2),
#     5000
# )


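# Fragment: assumes numpy, the algorithm modules (baac, aac, spornn, spo, dummy_agent, random_agent)
# and the model constructors from the first example above are already in scope.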
initial_rnn_state = np.zeros((1, 32))
agents = [
    baac.BetaAdvantageActorCritic(create_policy_model_beta(), create_value_model(), 2, lr=0.0003, gamma=0.997,
                                  entropy_factor=0.007, log=True, value_loss_scale=0.05, lambd=0.95),
    aac.AdvantageActorCritic(create_policy_model_entropy(), create_value_model(), 2, lr=0.0001, gamma=0.997,
                             entropy_factor=0.0001, value_loss_scale=0.03, scale_multiplier=5.0, lambd=0.95),
    spornn.SimplePolicyOptimizerRNN(make_rnn_model(), 2, initial_rnn_state, scale_value=3.0, gamma=0.997,
                                    lr=0.0001),
    spo.SimplePolicyOptimizer(create_policy_model_no_entropy(), 2, 0.0001, gamma=0.997, scale_value=0.6),
    dummy_agent.DummyAgent(2),
    random_agent.RandomAgent(2),
    # soccer_agent.SoccerAgent(2),
    # lin_agent_1
]

pids = {agent: i for i, agent in enumerate(agents)}

pids_in_queue = set(pids.values())

def name(agent):
    return str(agent.__class__.__name__) + str(id(agent))


#env = multiplayer_car_env.MPCarEnv(force_fair_game=False, max_steps=500)
Example #4
def run_policy(policy_type):
    from algorithms import dummy_agent
    import tensorflow as tf
    from algorithms import advantage_actor_critic as aac
    from algorithms import simple_policy_optimization as spo
    from algorithms import simple_policy_optimization_with_entropy as spowe
    from algorithms import simple_policy_optimization_rnn as spornn
    import numpy as np
    from algorithms import random_agent
    from competition_system.player import Player
    ks = tf.keras

    # SEED = 420
    # tf.set_random_seed(SEED)
    # car_env.set_random_seed(SEED)

    def create_policy_model_entropy():
        inp = ks.Input((12,))
        x = inp
        x = ks.layers.Dense(64, activation='tanh')(x)
        x = ks.layers.Dense(32, activation='tanh')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        scales = ks.layers.Dense(2, activation='sigmoid')(x)
        model = ks.Model(inputs=inp, outputs=[means, scales])
        return model

    def create_policy_model_no_entropy():
        inp = ks.Input((12,))
        x = inp
        x = ks.layers.Dense(64, activation='tanh')(x)
        x = ks.layers.Dense(32, activation='tanh')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        model = ks.Model(inputs=inp, outputs=means)
        return model

    def create_value_model():
        inp = ks.Input((12,))
        x = inp
        x = ks.layers.Dense(64, activation='tanh')(x)
        x = ks.layers.Dense(32, activation='tanh')(x)
        value = ks.layers.Dense(1, activation='linear')(x)
        model = ks.Model(inputs=inp, outputs=value)
        return model

    def make_rnn_model():
        inp = ks.Input((None, 12))
        state_inp = ks.Input((32,))

        mem_out, new_rnn_state = ks.layers.GRU(32, return_sequences=True, return_state=True)([inp, state_inp])
        mem_out = ks.layers.TimeDistributed(ks.layers.Dense(32, activation='tanh'))(mem_out)
        action_means = ks.layers.TimeDistributed(ks.layers.Dense(2, activation='tanh'))(mem_out)
        model = ks.models.Model(inputs=[inp, state_inp], outputs=[action_means, new_rnn_state])
        return model

    if policy_type == "ac":
        policy = aac.AdvantageActorCritic(create_policy_model_entropy(), create_value_model(), 2, entropy_factor=0.01,
                                          gamma=0.997,
                                          lr=0.0001)
    elif policy_type == "spowe":
        policy = spowe.SimplePolicyOptimizerWithEntropy(create_policy_model_entropy(), 2, 0.0001, gamma=0.997,
                                                        entropy_factor=0.01)
    elif policy_type == "spornn":
        initial_rnn_state = np.zeros((1, 32))
        policy = spornn.SimplePolicyOptimizerRNN(make_rnn_model(), 2, initial_rnn_state, scale_value=0.6, gamma=0.997,
                                                 lr=0.0001)
    elif policy_type == "spo":
        policy = spo.SimplePolicyOptimizer(create_policy_model_no_entropy(), 2, 0.0001, gamma=0.997, scale_value=0.6)

    elif policy_type == "random":
        policy = random_agent.RandomAgent(2)

    else:
        policy = dummy_agent.DummyAgent(2)

    scores = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # PORT is expected to be defined at module level (not shown in this fragment)
        env = Player(str(policy), port=PORT)
        while True:
            state = env.reset()
            done = False
            trajectory = []
            score = 0
            while not done:
                actions = policy.get_actions(sess, state)
                new_state, r, done, _ = env.step(actions)

                score += r

                trajectory.append((state, actions, r))
                state = new_state
            policy.train(sess, trajectory)
            scores.append(score)
Example #5
import tensorflow as tf
from algorithms import simple_policy_optimization as spo
from environments import car_env

ks = tf.keras

SEED = 420
tf.set_random_seed(SEED)
car_env.set_random_seed(SEED)

model = ks.models.Sequential()
model.add(ks.layers.Dense(24, activation='tanh', input_shape=(6,)))
model.add(ks.layers.Dense(12, activation='tanh'))
model.add(ks.layers.Dense(2, activation='tanh'))

policy = spo.SimplePolicyOptimizer(model, 2, scale_value=0.3, gamma=0.9, lr=0.001)
env = car_env.CarEnv(True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    scale = 3.0
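    # exploration scale passed to get_actions below (presumably the width of the action-sampling noise)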
    episode = 0
    while True:
        state = env.reset()
        done = False
        trajectory = []
        while not done:
            actions = policy.get_actions(sess, state, scale)
            new_state, r, done, _ = env.step(actions)

            trajectory.append((state, actions, r))