def run_policy(policy_type, episodes):
    # Imports are kept local so the function is fully self-contained
    # (e.g. when it is launched in its own process with its own TF graph).
    import gym
    import tensorflow as tf
    from algorithms import advantage_actor_critic as aac
    from algorithms import simple_policy_optimization as spo
    from algorithms import simple_policy_optimization_with_entropy as spowe
    from algorithms import simple_policy_optimization_rnn as spornn
    from algorithms import beta_advantage_actor_critic as baac
    import numpy as np

    ks = tf.keras

    SEED = 420
    tf.set_random_seed(SEED)

    def create_policy_model_beta():
        # Beta policy: two heads emit the alpha and beta parameters of a Beta
        # distribution per action dimension (softplus keeps them positive).
        inp = ks.Input((8,))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
        alphas = ks.layers.Dense(2, activation='softplus')(x)
        betas = ks.layers.Dense(2, activation='softplus')(x)
        model = ks.Model(inputs=inp, outputs=[alphas, betas])
        return model

    def create_policy_model_entropy():
        # Gaussian policy: one head for the action means, one for the scales.
        inp = ks.Input((8,))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        scales = ks.layers.Dense(2, activation='sigmoid')(x)
        model = ks.Model(inputs=inp, outputs=[means, scales])
        return model

    def create_policy_model_no_entropy():
        # Mean-only policy head; the exploration scale is handled by the
        # optimizer's scale_value instead of a learned scale output.
        inp = ks.Input((8,))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        model = ks.Model(inputs=inp, outputs=means)
        return model

    def create_value_model():
        # State-value estimator used by the actor-critic variants.
        inp = ks.Input((8,))
        x = inp
        x = ks.layers.Dense(64, activation='selu')(x)
        x = ks.layers.Dense(32, activation='selu')(x)
        value = ks.layers.Dense(1, activation='linear')(x)
        model = ks.Model(inputs=inp, outputs=value)
        return model

    def make_rnn_model():
        # Recurrent policy: a GRU consumes the observation sequence plus an
        # explicit initial state and returns per-step action means and the
        # updated recurrent state.
        inp = ks.Input((None, 8))
        state_inp = ks.Input((32,))
        mem_out, new_rnn_state = ks.layers.GRU(
            32, return_sequences=True, return_state=True)([inp, state_inp])
        mem_out = ks.layers.TimeDistributed(
            ks.layers.Dense(32, activation='selu'))(mem_out)
        action_means = ks.layers.TimeDistributed(
            ks.layers.Dense(2, activation='tanh'))(mem_out)
        model = ks.models.Model(inputs=[inp, state_inp],
                                outputs=[action_means, new_rnn_state])
        return model

    # Select the training algorithm by name; anything unrecognised falls back
    # to the plain policy optimizer.
    if policy_type == "ac":
        policy = aac.AdvantageActorCritic(create_policy_model_entropy(), create_value_model(), 2,
                                          entropy_factor=0.001, gamma=0.99, lr=0.001)
    elif policy_type == "spowe":
        policy = spowe.SimplePolicyOptimizerWithEntropy(create_policy_model_entropy(), 2, 0.001,
                                                        gamma=0.99, entropy_factor=0.01)
    elif policy_type == "spornn":
        initial_rnn_state = np.zeros((1, 32))
        policy = spornn.SimplePolicyOptimizerRNN(make_rnn_model(), 2, initial_rnn_state,
                                                 scale_value=0.6, gamma=0.99, lr=0.0001)
    elif policy_type == "baac":
        policy = baac.BetaAdvantageActorCritic(create_policy_model_beta(), create_value_model(), 2,
                                               entropy_factor=0.001, gamma=0.99, lr=0.001)
    elif policy_type == "ppo":
        # Same beta actor-critic, but with the PPO clipping threshold enabled.
        policy = baac.BetaAdvantageActorCritic(create_policy_model_beta(), create_value_model(), 2,
                                               entropy_factor=0.001, gamma=0.99, ppo_eps=0.2, lr=0.001)
    else:
        policy = spo.SimplePolicyOptimizer(create_policy_model_no_entropy(), 2, 0.001,
                                           gamma=0.99, scale_value=0.6)

    scores = []
    config = tf.ConfigProto(device_count={'GPU': 0})  # force CPU-only execution
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        env = gym.make("LunarLanderContinuous-v2")
        for episode in range(episodes):
            state = env.reset()
            done = False
            trajectory = []
            score = 0
            while not done:
                actions = policy.get_actions(sess, state)
                new_state, r, done, _ = env.step(actions)
                r /= 1000  # scale rewards down to keep gradient magnitudes small
                score += r
                trajectory.append((state, actions, r))
                state = new_state
            # Train on the full episode, then log its (scaled) return.
            policy.train(sess, trajectory)
            scores.append(score)
    return scores
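# Example usage (illustrative, not part of the original script): run the
# PPO-clipped beta actor-critic for a few hundred episodes and print a simple
# moving average of the per-episode returns. The episode count and window
# size are arbitrary choices for the example.
if __name__ == "__main__":
    scores = run_policy("ppo", 500)
    window = 20
    for i in range(window, len(scores) + 1):
        avg = sum(scores[i - window:i]) / window
        print("episodes %d-%d: mean score %.4f" % (i - window, i - 1, avg))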
# Two-player self-play on the multiplayer car environment: two independent
# SimplePolicyOptimizer agents, each with its own small tanh network.
import tensorflow as tf
from algorithms import simple_policy_optimization as spo
from environments import multiplayer_car_env  # module path assumed to mirror environments.car_env

ks = tf.keras

SEED = 420
tf.set_random_seed(SEED)
multiplayer_car_env.set_random_seed(SEED)

model1 = ks.models.Sequential()
model1.add(ks.layers.Dense(24, activation='tanh', input_shape=(7,)))
model1.add(ks.layers.Dense(12, activation='tanh'))
model1.add(ks.layers.Dense(2, activation='tanh'))

model2 = ks.models.Sequential()
model2.add(ks.layers.Dense(24, activation='tanh', input_shape=(7,)))
model2.add(ks.layers.Dense(12, activation='tanh'))
model2.add(ks.layers.Dense(2, activation='tanh'))

policy_1 = spo.SimplePolicyOptimizer(model1, 2, scale_value=0.003, gamma=0.9, lr=0.001)
policy_2 = spo.SimplePolicyOptimizer(model2, 2, scale_value=0.001, gamma=0.9, lr=0.001)

env = multiplayer_car_env.MPCarEnv()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    scale = 1.5
    episode = 0
    while True:
        state_1, state_2 = env.reset()
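# The episode loop above breaks off after env.reset(). A minimal sketch of how
# it could continue is given below for orientation only; the MPCarEnv.step()
# signature and return values are assumptions, not taken from the environment
# code, so the sketch is left commented out.
#
#        done = False
#        trajectory_1, trajectory_2 = [], []
#        while not done:
#            actions_1 = policy_1.get_actions(sess, state_1, scale)
#            actions_2 = policy_2.get_actions(sess, state_2, scale)
#            # assumed step signature: per-player next states and rewards plus a shared done flag
#            new_state_1, new_state_2, r_1, r_2, done = env.step(actions_1, actions_2)
#            trajectory_1.append((state_1, actions_1, r_1))
#            trajectory_2.append((state_2, actions_2, r_2))
#            state_1, state_2 = new_state_1, new_state_2
#        policy_1.train(sess, trajectory_1)
#        policy_2.train(sess, trajectory_2)
#        episode += 1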
# Competition setup: the agents entered into the tournament, each mapped to a
# player id. The commented-out lines are leftovers of an earlier configuration.
# aac.AdvantageActorCritic(create_policy_model_entropy(), create_value_model(), 2, lr=0.0001, gamma=0.97,
#                          entropy_factor=0.001, value_loss_scale=0.03, scale_multiplier=2.0, lambd=0.95),
# soccer_agent.SoccerAgent(2),
# 5000
# )
initial_rnn_state = np.zeros((1, 32))

agents = [
    baac.BetaAdvantageActorCritic(create_policy_model_beta(), create_value_model(), 2, lr=0.0003, gamma=0.997,
                                  entropy_factor=0.007, log=True, value_loss_scale=0.05, lambd=0.95),
    aac.AdvantageActorCritic(create_policy_model_entropy(), create_value_model(), 2, lr=0.0001, gamma=0.997,
                             entropy_factor=0.0001, value_loss_scale=0.03, scale_multiplier=5.0, lambd=0.95),
    spornn.SimplePolicyOptimizerRNN(make_rnn_model(), 2, initial_rnn_state, scale_value=3.0, gamma=0.997, lr=0.0001),
    spo.SimplePolicyOptimizer(create_policy_model_no_entropy(), 2, 0.0001, gamma=0.997, scale_value=0.6),
    dummy_agent.DummyAgent(2),
    random_agent.RandomAgent(2),
    # soccer_agent.SoccerAgent(2),
    # lin_agent_1
]

# Player id for each agent, and the set of ids currently waiting for a match.
pids = {agent: i for i, agent in enumerate(agents)}
pids_in_queue = set(pids.values())


def name(agent):
    # Human-readable, unique identifier for an agent instance.
    return str(agent.__class__.__name__) + str(id(agent))


# env = multiplayer_car_env.MPCarEnv(force_fair_game=False, max_steps=500)
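# Illustrative helper (not from the original code): one way to draw a pairing
# for the next match from the ids still waiting in pids_in_queue. How matches
# are actually scheduled by the competition system is not shown here, so this
# is only an assumption about its behaviour.
import random


def draw_match_pair():
    # Pick two distinct waiting player ids at random.
    p1, p2 = random.sample(sorted(pids_in_queue), 2)
    return p1, p2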
def run_policy(policy_type):
    # Competition worker: builds one policy and plays it against the
    # competition server through a Player connection. Imports are local so
    # each worker is self-contained.
    from algorithms import dummy_agent
    import tensorflow as tf
    from algorithms import advantage_actor_critic as aac
    from algorithms import simple_policy_optimization as spo
    from algorithms import simple_policy_optimization_with_entropy as spowe
    from algorithms import simple_policy_optimization_rnn as spornn
    import numpy as np
    from algorithms import random_agent
    from competition_system.player import Player

    ks = tf.keras

    # SEED = 420
    # tf.set_random_seed(SEED)
    # car_env.set_random_seed(SEED)

    def create_policy_model_entropy():
        # Gaussian policy: action means and scales for the 12-dimensional observation.
        inp = ks.Input((12,))
        x = inp
        x = ks.layers.Dense(64, activation='tanh')(x)
        x = ks.layers.Dense(32, activation='tanh')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        scales = ks.layers.Dense(2, activation='sigmoid')(x)
        model = ks.Model(inputs=inp, outputs=[means, scales])
        return model

    def create_policy_model_no_entropy():
        # Mean-only policy head; exploration scale comes from the optimizer.
        inp = ks.Input((12,))
        x = inp
        x = ks.layers.Dense(64, activation='tanh')(x)
        x = ks.layers.Dense(32, activation='tanh')(x)
        means = ks.layers.Dense(2, activation='tanh')(x)
        model = ks.Model(inputs=inp, outputs=means)
        return model

    def create_value_model():
        # State-value estimator for the actor-critic variant.
        inp = ks.Input((12,))
        x = inp
        x = ks.layers.Dense(64, activation='tanh')(x)
        x = ks.layers.Dense(32, activation='tanh')(x)
        value = ks.layers.Dense(1, activation='linear')(x)
        model = ks.Model(inputs=inp, outputs=value)
        return model

    def make_rnn_model():
        # Recurrent policy over observation sequences with an explicit GRU state.
        inp = ks.Input((None, 12))
        state_inp = ks.Input((32,))
        mem_out, new_rnn_state = ks.layers.GRU(
            32, return_sequences=True, return_state=True)([inp, state_inp])
        mem_out = ks.layers.TimeDistributed(ks.layers.Dense(32, activation='tanh'))(mem_out)
        action_means = ks.layers.TimeDistributed(ks.layers.Dense(2, activation='tanh'))(mem_out)
        model = ks.models.Model(inputs=[inp, state_inp], outputs=[action_means, new_rnn_state])
        return model

    # Select the competitor by name; anything unrecognised becomes a DummyAgent.
    if policy_type == "ac":
        policy = aac.AdvantageActorCritic(create_policy_model_entropy(), create_value_model(), 2,
                                          entropy_factor=0.01, gamma=0.997, lr=0.0001)
    elif policy_type == "spowe":
        policy = spowe.SimplePolicyOptimizerWithEntropy(create_policy_model_entropy(), 2, 0.0001,
                                                        gamma=0.997, entropy_factor=0.01)
    elif policy_type == "spornn":
        initial_rnn_state = np.zeros((1, 32))
        policy = spornn.SimplePolicyOptimizerRNN(make_rnn_model(), 2, initial_rnn_state,
                                                 scale_value=0.6, gamma=0.997, lr=0.0001)
    elif policy_type == "spo":
        policy = spo.SimplePolicyOptimizer(create_policy_model_no_entropy(), 2, 0.0001,
                                           gamma=0.997, scale_value=0.6)
    elif policy_type == "random":
        policy = random_agent.RandomAgent(2)
    else:
        policy = dummy_agent.DummyAgent(2)

    scores = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # PORT is expected to be defined at module level (the competition
        # server's port); Player exposes the usual reset/step interface.
        env = Player(str(policy), port=PORT)
        while True:
            state = env.reset()
            done = False
            trajectory = []
            score = 0
            while not done:
                actions = policy.get_actions(sess, state)
                new_state, r, done, _ = env.step(actions)
                score += r
                trajectory.append((state, actions, r))
                state = new_state
            policy.train(sess, trajectory)
            scores.append(score)
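# One possible way to attach several of these workers to a running competition
# server (illustrative assumption, not part of the original code): launch each
# policy type in its own process so every worker gets its own TensorFlow graph
# and Player connection. Assumes PORT is defined at module level and the
# server is already listening on it.
if __name__ == "__main__":
    from multiprocessing import Process

    workers = [Process(target=run_policy, args=(pt,))
               for pt in ("ac", "spowe", "spornn", "spo", "random", "dummy")]
    for w in workers:
        w.start()
    for w in workers:
        w.join()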
import tensorflow as tf
from algorithms import simple_policy_optimization as spo
from environments import car_env

ks = tf.keras

SEED = 420
tf.set_random_seed(SEED)
car_env.set_random_seed(SEED)

# Small tanh policy network for the single-player car environment.
model = ks.models.Sequential()
model.add(ks.layers.Dense(24, activation='tanh', input_shape=(6,)))
model.add(ks.layers.Dense(12, activation='tanh'))
model.add(ks.layers.Dense(2, activation='tanh'))

policy = spo.SimplePolicyOptimizer(model, 2, scale_value=0.3, gamma=0.9, lr=0.001)
env = car_env.CarEnv(True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    scale = 3.0  # exploration scale passed to get_actions
    episode = 0
    while True:
        state = env.reset()
        done = False
        trajectory = []
        while not done:
            actions = policy.get_actions(sess, state, scale)
            new_state, r, done, _ = env.step(actions)
            trajectory.append((state, actions, r))
            state = new_state
        # Close out the episode as in the training loops above.
        policy.train(sess, trajectory)
        episode += 1
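# Optional refinement (illustrative assumption, not in the original script):
# anneal the exploration scale towards a smaller value as training progresses,
# e.g. once per episode after policy.train():
#
#        scale = max(0.3, scale * 0.999)  # decay factor chosen arbitrarily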