def __init__(self, env: FiniteActionEnvironment, model: QNetworkSL,
             replay_memory_size: int = 3000, minibatch_size: int = 32,
             epsilon=0.05, epsilon_step_factor=1.0, epsilon_min=0.0):
    """
    Initializes the Deep SARSA(λ) agent
    :param env: The FiniteActionEnvironment that should be learned by the agent
    :param model: The function approximator used to estimate and learn Q(s, a)
    :param replay_memory_size: The size of the replay memory (in trajectories)
    :param minibatch_size: The size of the minibatches sampled from the replay memory each step for training
    :param epsilon: The probability of performing a random move, used for exploration
    :param epsilon_step_factor: The epsilon decay parameter. Epsilon is multiplied by this factor each step.
    :param epsilon_min: The minimum value of epsilon. It will not decay below this.
    """
    super().__init__(env)
    self.epsilon_step_factor = epsilon_step_factor
    self.epsilon_min = epsilon_min
    self.qnetwork = model
    self.epsilon_v = epsilon
    self.policy = model.derive_policy(EpsilonGreedyPolicy,
                                      env.valid_actions_from,
                                      epsilon=self.epsilon)
    self.replay_memory = deque(maxlen=replay_memory_size)
    self.minibatch_size = minibatch_size
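# Since epsilon is multiplied by epsilon_step_factor once per step and floored
# at epsilon_min, the exploration rate after n steps has a closed form. A
# minimal illustrative sketch (epsilon_after is not part of the agent; the
# numbers below are examples only):
def epsilon_after(n, epsilon=0.05, step_factor=1.0, epsilon_min=0.0):
    """Exploration rate after n steps of multiplicative decay."""
    return max(epsilon_min, epsilon * step_factor ** n)

# With the default step_factor=1.0, epsilon stays constant. With the schedule
# used by the CartPole experiments below (epsilon=1.0, step_factor=0.9995,
# epsilon_min=0.0), epsilon halves roughly every 1386 steps, since
# ln(0.5) / ln(0.9995) ≈ 1386.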
def run_a2c_experiment(entropy_reg, run: int):
    """
    Runs a single A2C run on CartPole with the specified parameters
    :param entropy_reg: Entropy regularization weight on the policy loss; higher values yield a more random policy
    :param run: The run number, used in the filename of the output file
    """
    import keras as ks
    import numpy as np

    from agents.actor_critic import ActorCriticAgent
    from environments.cartpole import CartPole
    from q_network_sarsa_lambda import QNetworkSL
    from p_network import PNetwork
    from experiment_util import Logger

    # Value network: maps the 4-dimensional CartPole state to one Q-value per action
    value_network = ks.models.Sequential()
    value_network.add(ks.layers.Dense(150, activation='relu', input_shape=(4,)))
    value_network.add(ks.layers.Dense(50, activation='relu'))
    value_network.add(ks.layers.Dense(2, activation='linear'))
    value_network.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

    # Policy network: same hidden architecture, softmax output over the two actions
    policy_network = ks.models.Sequential()
    policy_network.add(ks.layers.Dense(150, activation='relu', input_shape=(4,)))
    policy_network.add(ks.layers.Dense(50, activation='relu'))
    policy_network.add(ks.layers.Dense(2, activation='softmax'))

    l = Logger(filename="../results/AC_VS_SL_cartpole_a2c_%.5f_%d.h5"
                        % (entropy_reg, run))

    env = CartPole(render=False)
    actions = env.valid_actions()
    dn = QNetworkSL(value_network, actions,
                    lambda x: np.reshape(x.state, newshape=(1, 4)),
                    lambd=0.9,
                    gamma=1.0,
                    reward_factor=0.01,
                    fixed_length=100,
                    lambda_min=1e-2)
    pn = PNetwork(policy_network, actions,
                  lambda x: np.array(x.state),
                  fixed_steps=100,
                  entropy_regularization=entropy_reg,
                  alpha=0.001,
                  use_advantage=True)

    ac = ActorCriticAgent(env, dn, pn, replay_memory_size=1000)

    c = ac.get_configuration()
    experiment = l.start_experiment(c)
    q = ac.learn(num_episodes=250, result_handler=experiment.log)
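# Hedged usage sketch (not part of the original source): run_a2c_experiment is
# self-contained, so sweeping the entropy weight over repeated runs is just a
# nested loop. The grid of values below is illustrative only.
if __name__ == '__main__':
    for run in range(3):
        for entropy_reg in (0.0, 0.01, 0.1):
            run_a2c_experiment(entropy_reg, run)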
def snake_conv_sarsa(episodes=10000, file_name='snek'):
    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session

    # Limit this process's GPU memory so NUM_RUNS concurrent runs fit on one
    # GPU (NUM_RUNS is a module-level constant)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1 / NUM_RUNS - 0.05
    set_session(tf.Session(config=config))

    import keras as ks
    import numpy as np

    from experiment_util import Logger
    from agents.deep_sarsa import DeepSarsa
    from environments.snake import SnakeVisual
    from q_network_sarsa_lambda import QNetworkSL

    logger = Logger(filename=file_name)

    env = SnakeVisual(grid_size=[8, 8], render=False, render_freq=10)
    actions = env.valid_actions()
    size = np.shape(env.reset().state)

    # Convolutional network on the raw grid state; 3 linear outputs, one Q-value per action
    nn = ks.models.Sequential()
    nn.add(ks.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='sigmoid', input_shape=size))
    nn.add(ks.layers.Conv2D(filters=24, kernel_size=(3, 3), activation='sigmoid'))
    nn.add(ks.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='sigmoid'))
    nn.add(ks.layers.Flatten())
    nn.add(ks.layers.Dense(units=16, activation='sigmoid'))
    nn.add(ks.layers.Dense(units=3, activation='linear'))
    nn.compile(optimizer=ks.optimizers.Adam(lr=0.0001), loss='mse')

    def normalize_state(s):
        return np.reshape(s.state, newshape=(1,) + size)

    dqn = QNetworkSL(nn, actions, normalize_state,
                     lambd=0.9,
                     lambda_min=1e-3,
                     gamma=0.9,
                     reward_factor=1,
                     fixed_length=100)

    dql = DeepSarsa(env, dqn,
                    epsilon=0.3,
                    epsilon_step_factor=0.9999,
                    epsilon_min=0.005,
                    replay_memory_size=1000)

    experiment = logger.start_experiment(dql.get_configuration())
    q = dql.learn(num_episodes=episodes, result_handler=experiment.log)
    experiment.save_attribute("weights", nn.get_weights())
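# The fraction 1 / NUM_RUNS - 0.05 gives each worker an equal share of GPU
# memory minus 5% headroom, so NUM_RUNS such processes fit on a single GPU.
# Worked example under a hypothetical NUM_RUNS = 4:
#     1 / 4 - 0.05 = 0.20, i.e. each session may claim at most 20% of GPU memory.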
def run_saraslambda_experiment(epsilon_start, epsilon_min, epsilon_decay, run: int):
    """
    Runs Deep SARSA(λ) on CartPole
    :param epsilon_start: Starting epsilon value
    :param epsilon_min: Minimum epsilon value
    :param epsilon_decay: Factor by which epsilon is multiplied each step
    :param run: Run identifier used in the output filename
    """
    import keras as ks
    import numpy as np

    from agents.deep_sarsa import DeepSarsa
    from environments.cartpole import CartPole
    from q_network_sarsa_lambda import QNetworkSL
    from experiment_util import Logger

    value_network = ks.models.Sequential()
    value_network.add(ks.layers.Dense(150, activation='relu', input_shape=(4,)))
    value_network.add(ks.layers.Dense(50, activation='relu'))
    value_network.add(ks.layers.Dense(2, activation='linear'))
    value_network.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

    l = Logger(filename="../results/AC_VS_SL_cartpole_sl_%.4f_%.4f_%f_%d.h5"
                        % (epsilon_start, epsilon_min, epsilon_decay, run))

    env = CartPole(render=False)
    actions = env.valid_actions()
    dn = QNetworkSL(value_network, actions,
                    lambda x: np.reshape(x.state, newshape=(1, 4)),
                    lambd=0.9,
                    gamma=1.0,
                    reward_factor=0.01,
                    fixed_length=100,
                    lambda_min=1e-2)

    sarsa = DeepSarsa(env, dn,
                      replay_memory_size=1000,
                      epsilon_min=epsilon_min,
                      epsilon_step_factor=epsilon_decay,
                      epsilon=epsilon_start)

    c = sarsa.get_configuration()
    experiment = l.start_experiment(c)
    q = sarsa.learn(num_episodes=250, result_handler=experiment.log)
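# Hedged example call (the schedule mirrors the values used by the other
# CartPole experiments in this file; run=0 is an arbitrary identifier):
#     run_saraslambda_experiment(epsilon_start=1.0, epsilon_min=0.0,
#                                epsilon_decay=0.9995, run=0)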
def snake_deep_sarsa(episodes=10000, file_name='snek'):
    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1 / NUM_RUNS - 0.05
    set_session(tf.Session(config=config))

    import keras as ks
    import numpy as np

    from experiment_util import Logger
    from agents.deep_sarsa import DeepSarsa
    from environments.snake import SnakeContinuous
    from q_network_sarsa_lambda import QNetworkSL

    logger = Logger(filename=file_name)

    neural_network = ks.models.Sequential()
    neural_network.add(ks.layers.Dense(150, activation='relu', input_shape=(9,)))
    neural_network.add(ks.layers.Dense(50, activation='relu'))
    neural_network.add(ks.layers.Dense(3, activation='linear'))
    neural_network.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

    env = SnakeContinuous(grid_size=[8, 8], render=False, render_freq=10)
    actions = env.valid_actions()
    dqn = QNetworkSL(neural_network, actions,
                     lambda x: np.reshape(x.state, newshape=(1, 9)),
                     lambd=0.9,
                     lambda_min=1e-3,
                     gamma=0.9,
                     reward_factor=1,
                     fixed_length=100)

    dql = DeepSarsa(env, dqn,
                    epsilon=0.3,
                    epsilon_step_factor=0.9999,
                    epsilon_min=0.005,
                    replay_memory_size=1000)

    experiment = logger.start_experiment(dql.get_configuration())
    q = dql.learn(num_episodes=episodes, result_handler=experiment.log)
    experiment.save_attribute("weights", neural_network.get_weights())
def normalize_state(s):
    # Scale the Flappy Bird-style state dict into a (1, 8) feature vector,
    # using the module-level screen dimensions height and width
    o = np.zeros(shape=(1, 8))
    o[0, 0] = s.state['player_y'] / height
    o[0, 1] = s.state['player_vel']
    o[0, 2] = s.state['next_pipe_dist_to_player'] / width
    o[0, 3] = s.state['next_pipe_top_y'] / (height / 2)
    o[0, 4] = s.state['next_pipe_bottom_y'] / (height / 2)
    o[0, 5] = s.state['next_next_pipe_dist_to_player'] / width
    o[0, 6] = s.state['next_next_pipe_top_y'] / (height / 2)
    o[0, 7] = s.state['next_next_pipe_bottom_y'] / (height / 2)
    return o

vn = QNetworkSL(neural_network, actions, normalize_state,
                lambd=0.9,
                gamma=0.9,
                reward_factor=1,
                fixed_length=100,
                lambda_min=1e-2)
pn = PNetwork(policy_network, actions,
              lambda x: normalize_state(x)[0],
              fixed_steps=100,
              entropy_regularization=0.1,
              alpha=0.001,
              use_advantage=True)

dql = ActorCriticAgent(env, vn, pn, replay_memory_size=1000)
q = dql.learn()
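# Hedged sketch (not in the original source): normalize_state expects an object
# whose .state dict carries the Flappy Bird-style observation keys used above.
# DummyState and its values are made up purely to show the resulting vector:
class DummyState:
    state = {'player_y': 256, 'player_vel': -4,
             'next_pipe_dist_to_player': 140, 'next_pipe_top_y': 120,
             'next_pipe_bottom_y': 220, 'next_next_pipe_dist_to_player': 280,
             'next_next_pipe_top_y': 90, 'next_next_pipe_bottom_y': 190}

assert normalize_state(DummyState()).shape == (1, 8)  # one row of 8 scaled features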
neural_network = ks.models.Sequential()
neural_network.add(ks.layers.Dense(150, activation='relu', input_shape=(4,)))
neural_network.add(ks.layers.Dense(50, activation='relu'))
neural_network.add(ks.layers.Dense(2, activation='linear'))
neural_network.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

env = CartPole(render=False)
actions = env.valid_actions()
dqn = QNetworkSL(neural_network, actions,
                 lambda x: np.reshape(x.state, newshape=(1, 4)),
                 lambd=lambd[i],
                 lambda_min=1e-3,
                 gamma=1.0,
                 reward_factor=0.01,
                 fixed_length=100)
dql = DeepSarsa(env, dqn,
                epsilon=1.0,
                epsilon_step_factor=0.9995,
                epsilon_min=0.0,
                replay_memory_size=1000)

c = dql.get_configuration()
print(c)
experiment = l.start_experiment(c)
try:
from environments.snake import SnakeContinuous
from q_network_sarsa_lambda import QNetworkSL

neural_network = ks.models.Sequential()
neural_network.add(ks.layers.Dense(150, activation='relu', input_shape=(9,)))
neural_network.add(ks.layers.Dense(50, activation='relu'))
neural_network.add(ks.layers.Dense(3, activation='linear'))
neural_network.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

env = SnakeContinuous(grid_size=[8, 8], render=True, render_freq=10)
actions = env.valid_actions()
dqn = QNetworkSL(neural_network, actions,
                 lambda x: np.reshape(x.state, newshape=(1, 9)),
                 lambd=0.9,
                 gamma=0.9,
                 reward_factor=0.01,
                 fixed_length=100)
dql = DeepSarsa(env, dqn,
                epsilon=0.3,
                epsilon_step_factor=0.999,
                epsilon_min=0.05,
                replay_memory_size=1000)
q = dql.learn()
def experiment(run_n, episodes, sigmas, lambda_parameter):
    """
    Runs a single Deep SARSA(λ) experiment on CartPole for each sigma value
    :param run_n: The run number, used in the filename of the experiment
    :param episodes: Number of episodes to run
    :param sigmas: Values of sigma (noise standard deviation)
    :param lambda_parameter: The lambda value for this experiment
    :return: The filename of the output file
    """
    import tensorflow as tf

    # Stop TensorFlow from allocating all GPU memory at once, which allows for
    # more runs on one GPU. These settings are ignored when running on CPU
    # (which is often faster for this experiment).
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.2
    set_session(tf.Session(config=config))

    import keras as ks
    import numpy as np

    from agents.deep_sarsa import DeepSarsa
    from environments.cartpole import NoisyCartPole
    from q_network_sarsa_lambda import QNetworkSL
    from experiment_util import Logger

    filename = ("results/cartpole_deepsarsalambda_lambda_%1.2f_%d.h5"
                % (lambda_parameter, run_n))
    l = Logger(filename=filename)

    for sigma in sigmas:
        neural_network = ks.models.Sequential()
        neural_network.add(ks.layers.Dense(150, activation='relu', input_shape=(4,)))
        neural_network.add(ks.layers.Dense(50, activation='relu'))
        neural_network.add(ks.layers.Dense(2, activation='linear'))
        neural_network.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

        env = NoisyCartPole(std=sigma, render=False)
        actions = env.valid_actions()
        dqn = QNetworkSL(neural_network, actions,
                         lambda x: np.reshape(x.state, newshape=(1, 4)),
                         lambd=lambda_parameter,
                         lambda_min=1e-3,
                         gamma=1.0,
                         reward_factor=0.01,
                         fixed_length=100)
        dql = DeepSarsa(env, dqn,
                        epsilon=1.0,
                        epsilon_step_factor=0.9995,
                        epsilon_min=0.0,
                        replay_memory_size=1000)

        c = dql.get_configuration()
        experiment = l.start_experiment(c)
        q = dql.learn(num_episodes=episodes, result_handler=experiment.log)
        experiment.save_attribute("weights", neural_network.get_weights())
        print("%s finished sigma=%1.2f, run=%i" % (filename, sigma, run_n))

    return filename
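# Hedged usage sketch (not part of the original source): because experiment()
# imports everything it needs and caps its own GPU share at 0.2, a handful of
# such workers can run in parallel on one GPU. The run count, sigma list, and
# lambda grid below are illustrative only.
if __name__ == '__main__':
    from multiprocessing import Pool
    args = [(run_n, 250, [0.0, 0.1], lambd)
            for run_n in range(2)
            for lambd in (0.0, 0.5, 0.9, 1.0)]
    with Pool(4) as pool:
        files = pool.starmap(experiment, args)
    print(files)  # one output filename per (run, lambda) combination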