def __init__(self,
             input_dim,
             num_actions,
             network_params=None,
             set_device=None,
             gradient_clipping_norm=None,
             reward_to_go=True,
             learning_rate=0.01,
             seed=1364):
    self.seed = seed

    # Training parameters
    self.gamma = 0.99
    self.total_steps_so_far = 0
    self.save_model_frequency = 100
    self.learning_rate = learning_rate
    self.latest_learning_rate = learning_rate
    self.gradient_clipping_norm = gradient_clipping_norm

    # Experience Replay Memory
    self.memory_size = 40000
    self.replay_memory = deque([], maxlen=self.memory_size)

    # ----------------------------------------
    # Make the algorithm outputs reproducible
    make_deterministic(seed)
    # ----------------------------------------

    # If GPU is to be used
    if set_device is None:
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
    else:
        self.device = torch.device(set_device)

    # Policy gradient parameters
    self.episode_return_batch = torch.Tensor()
    self.actions_batch = torch.Tensor()
    self.log_probs_batch = torch.Tensor()
    self.reward_to_go = reward_to_go
    # Batch size for Policy Gradient should be large to reduce variance
    self.batch_size = 4096

    # (Explanation of the network_params in networks/network_builder.py)
    if network_params is None:
        network_params = {
            'input_dim': input_dim,
            'conv_layers': [(3, 16, 5, 2), (16, 32, 5, 2), (32, 32, 5, 2)],
            'dense_layers': [num_actions],
            'conv_bn': True,
            'activation': 'relu'
        }
    self.policy_net = CreateNet(network_params).to(self.device)

    # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.learning_rate)
    self.optimizer = optim.Adam(self.policy_net.parameters(),
                                lr=self.learning_rate,
                                eps=1e-4)
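# NOTE (illustrative sketch, not this repository's implementation): the
# batches initialised above are consumed by learning_step() once enough
# returns have been collected. Assuming self.log_probs_batch and
# self.episode_return_batch live on the same device, a plain REINFORCE
# update over those batches could look like this:

def learning_step(self):
    # Policy-gradient objective: maximise E[log pi(a|s) * G_t],
    # i.e. minimise its negative over the collected batch
    loss = -(self.log_probs_batch * self.episode_return_batch).mean()

    self.optimizer.zero_grad()
    loss.backward()
    if self.gradient_clipping_norm is not None:
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(),
                                       self.gradient_clipping_norm)
    self.optimizer.step()

    # Reset the batches for the next round of episode collection
    self.episode_return_batch = torch.Tensor()
    self.log_probs_batch = torch.Tensor()
    return loss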
def run_single_episode(self, env, episode=None):
    # Make each episode deterministic based on total_steps_so_far
    make_deterministic(self.total_steps_so_far, env.env)

    finished = False
    episode_rewards = []
    log_probs = []

    # Create the first state of the episode
    state_1 = env.get_state(episode_start=True)
    while not finished:
        action, log_prob = self.get_action_and_log_prob(state_1)

        # Take the selected action in the environment
        _, reward, finished, _ = env.env.step(action)

        episode_rewards.append(reward)
        log_probs.append(log_prob)

        # If not finished, fetch the next state of the episode
        if not finished:
            state_1 = env.get_state()

        # One single training iteration has passed
        self.total_steps_so_far += 1

        # If the agent has collected a satisfactory episode reward, stop the episode.
        if sum(episode_rewards) >= env.score_required_to_win:
            finished = True

    ep_len = len(episode_rewards)

    # Compute the episode return for every time step of the episode (G_t).
    # Input:  rewards [x0, x1, x2]
    # Output: [x0 + gamma * x1 + (gamma ** 2) * x2,
    #          x1 + gamma * x2,
    #          x2]
    if self.reward_to_go:
        episode_return = torch.tensor([
            sum(episode_rewards[i:] * (self.gamma ** np.arange(ep_len - i)))
            for i in range(ep_len)
        ])
    else:
        episode_return = torch.ones(ep_len) * sum(episode_rewards)

    self.episode_return_batch = torch.cat(
        [self.episode_return_batch, episode_return])
    self.log_probs_batch = torch.cat(
        [self.log_probs_batch, torch.cat(log_probs)])

    # Policy Network optimisation:
    # ----------------------------
    if len(self.episode_return_batch) >= self.batch_size:
        _ = self.learning_step()

    # Return the rewards collected within this single episode run
    return episode_rewards
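# Worked example of the reward-to-go computation above (illustrative numbers):
# with gamma = 0.5 and episode_rewards = [1.0, 2.0, 3.0],
#   G_0 = 1.0 + 0.5 * 2.0 + 0.25 * 3.0 = 2.75
#   G_1 = 2.0 + 0.5 * 3.0              = 3.50
#   G_2 = 3.0
# so episode_return == tensor([2.75, 3.50, 3.00]); with reward_to_go=False,
# every entry would instead be the undiscounted episode sum, 6.0.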
def run_single_episode(self, env, episode):
    # Make each episode deterministic based on total_steps_so_far
    make_deterministic(self.total_steps_so_far, env.env)

    finished = False
    episode_rewards = []
    episode_losses = []

    # Create the first state of the episode
    state_1 = env.get_state(episode_start=True)
    while not finished:
        action_1 = self.get_action(env, state_1)

        # Take the selected action in the environment
        s2, reward_1, finished, _ = env.env.step(action_1)

        # When the episode is finished, state_2 does not matter and won't
        # contribute to the optimisation
        # (because state_1 was the last state of the episode)
        state_2 = (0 * state_1) if finished else env.get_state()

        # Add the current transition (s, a, r, s', done) to the replay memory
        self.add_experience_to_replay_memory(
            state_1, action_1, reward_1, state_2, finished)

        # Policy Network optimisation:
        # ----------------------------
        # If there are enough transitions stored inside the replay_memory,
        # we can start training the policy network with them;
        # otherwise we move on to the next step of the episode.
        if len(self.replay_memory) >= self.batch_size:
            # Take a random minibatch from the replay memory
            minibatch = self.sample_from_replay_memory(self.batch_size)

            # Compute the TD loss over the minibatch
            loss = self.learning_step(minibatch)

            # Track the value of the loss (for debugging purposes)
            episode_losses.append(loss.item())

        # Go to the next step of the episode
        state_1 = state_2

        # Accumulate the rewards collected during this episode
        episode_rewards.append(reward_1)

        # One single training iteration has passed
        self.total_steps_so_far += 1

        # If the agent has collected a satisfactory episode reward, stop the episode.
        if sum(episode_rewards) >= env.score_required_to_win:
            finished = True

    if (episode % self.target_network_update) == 0:
        # Update the target network with the latest policy network parameters
        self.target_net.load_state_dict(self.policy_net.state_dict())

    # Return the rewards collected within this single episode run
    return episode_rewards
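# The two replay-memory helpers called above are defined elsewhere in the
# repository. A minimal sketch of what they could look like (assuming
# transitions are stored as plain tuples in the deque created in __init__;
# the actual implementation may differ):

import random

def add_experience_to_replay_memory(self, *transition):
    # Append one (state, action, reward, next_state, done) tuple;
    # the deque's maxlen discards the oldest transition once full.
    self.replay_memory.append(transition)

def sample_from_replay_memory(self, batch_size):
    # Uniformly sample a minibatch of stored transitions without replacement.
    return random.sample(self.replay_memory, batch_size)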
def __init__(self, epsilon_decay=0.005, seed=1364):
    # Epsilon parameters
    self.current_epsilon = 0.99
    self.epsilon_start = 0.99
    self.epsilon_end = 0.05
    self.epsilon_decay = epsilon_decay

    # ----------------------------------------
    # Make the algorithm outputs reproducible
    make_deterministic(seed)
    # ----------------------------------------
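# How current_epsilon is decayed over training is defined elsewhere in the
# explorer. Purely as an illustrative sketch (assuming an exponential
# schedule driven by the episode index, which may differ from the
# repository's actual rule):

import math

def update_epsilon(self, episode):
    # Decay epsilon from epsilon_start towards epsilon_end
    self.current_epsilon = self.epsilon_end + \
        (self.epsilon_start - self.epsilon_end) * \
        math.exp(-self.epsilon_decay * episode)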
def __init__(self, seed=1364):
    # Define the environment
    self.env = gym.make('CartPole-v0').unwrapped

    # ----------------------------------------
    # Make the algorithm outputs reproducible
    make_deterministic(seed, self.env)
    # ----------------------------------------

    self.env.reset()

    # Get the number of actions from the gym action space
    self.num_actions = self.env.action_space.n

    # Get the size of the observation (state) vector
    self.input_dim = self.env.state.size

    self.score_required_to_win = 200
    self.average_score_required_to_win = self.env.spec.reward_threshold
def __init__(self, seed=1364):
    # Define the environment
    self.env = gym.make('CartPole-v0').unwrapped

    # ----------------------------------------
    # Make the algorithm outputs reproducible
    make_deterministic(seed, self.env)
    # ----------------------------------------

    self.env.reset()

    # Get the number of actions from the gym action space
    self.num_actions = self.env.action_space.n

    # Get the screen size so that the Q-network layers can be initialised
    # correctly based on the shape returned by AI gym. Typical dimensions at
    # this point are close to 3x40x90, the result of the cropped and
    # down-scaled render buffer in get_screen(); the output of get_screen()
    # is a torch frame of shape (B, C, H, W).
    _, _, screen_height, screen_width = self.get_screen().shape
    self.input_dim = (screen_height, screen_width)

    self.score_required_to_win = 200
    self.average_score_required_to_win = self.env.spec.reward_threshold
def run_single_episode(self, env, episode,
                       number_of_learning_iterations_in_one_step=1):
    # Make each episode deterministic based on total_steps_so_far
    make_deterministic(self.total_steps_so_far, env.env)

    finished = False
    episode_rewards = []
    episode_losses = []

    # Create the first state of the episode
    state_1 = env.get_state(episode_start=True)
    while not finished:
        env.env.render(mode='rgb_array')
        action_1 = self.get_action(env, state_1)

        # Take the selected action in the environment
        s2, reward_1, finished, _ = env.env.step(action_1)

        # When the episode is finished, state_2 does not matter and won't
        # contribute to the optimisation
        # (because state_1 was the last state of the episode)
        state_2 = (0 * state_1) if finished else env.get_state()

        # Add the current transition (s, a, r, s', done) to the replay memory
        self.add_experience_to_replay_memory(
            state_1, action_1, reward_1, state_2, finished)

        # Actor/Critic network optimisation:
        # ----------------------------------
        # If there are enough transitions stored inside the replay_memory,
        # we can start training the networks with them;
        # otherwise we move on to the next step of the episode.
        if len(self.replay_memory) >= self.batch_size:
            if self.total_steps_so_far % self.steps_between_learning_steps == 0:
                for _ in range(number_of_learning_iterations_in_one_step):
                    # Take a random minibatch from the replay memory
                    minibatch = self.sample_from_replay_memory(self.batch_size)

                    # Compute the TD loss over the minibatch
                    _, _ = self.learning_step(minibatch)

                    # Update the target networks (Polyak averaging)
                    self.soft_update_target_networks()

        # Go to the next step of the episode
        state_1 = state_2

        # Accumulate the rewards collected during this episode
        episode_rewards.append(reward_1)

        # One single training iteration has passed
        self.total_steps_so_far += 1

        # If the agent has collected a satisfactory episode reward, stop the episode.
        if sum(episode_rewards) >= env.score_required_to_win:
            finished = True

        # If the episode takes longer than 'max_episode_length', terminate it.
        if len(episode_rewards) > self.max_episode_length:
            break

    # Return the rewards collected within this single episode run
    return episode_rewards
def __init__(self,
             input_dim,
             action_dimension,
             set_device=None,
             gradient_clipping_norm=None,
             learning_rate_actor=0.01,
             learning_rate_critic=0.01,
             actor_noise_scale=0.1,
             steps_between_learning_steps=1,
             max_episode_length=2000,
             polyac=0.99,
             seed=1364):
    self.seed = seed

    # Training parameters
    self.gamma = 0.99
    self.batch_size = 256
    self.target_network_update = 10
    self.total_steps_so_far = 0
    self.max_episode_length = max_episode_length
    self.save_model_frequency = 100
    self.learning_rate_actor = learning_rate_actor
    self.learning_rate_critic = learning_rate_critic
    self.gradient_clipping_norm = gradient_clipping_norm
    self.polyac = polyac
    self.steps_between_learning_steps = steps_between_learning_steps

    # Explorer
    self.actor_noise_scale = actor_noise_scale
    self.noise = OU_Noise(action_dimension, seed, 0, 0.15, 0.25)
    self.noise.reset()

    # Experience Replay Memory
    self.memory_size = 1000000
    self.replay_memory = deque([], maxlen=self.memory_size)

    # ----------------------------------------
    # Make the algorithm outputs reproducible
    make_deterministic(seed)
    # ----------------------------------------

    # If GPU is to be used
    if set_device is None:
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
    else:
        self.device = torch.device(set_device)

    # Network instantiation
    self.actor_net = ActorDDPG(input_dim, action_dimension).to(self.device)
    self.critic_net = CriticDDPG(input_dim, action_dimension).to(self.device)
    self.actor_net_target = ActorDDPG(input_dim, action_dimension).to(self.device)
    self.critic_net_target = CriticDDPG(input_dim, action_dimension).to(self.device)
    self.actor_net_target.load_state_dict(self.actor_net.state_dict())
    self.actor_net_target.eval()
    self.critic_net_target.load_state_dict(self.critic_net.state_dict())
    self.critic_net_target.eval()

    self.optimizer_actor = optim.Adam(self.actor_net.parameters(),
                                      lr=self.learning_rate_actor,
                                      eps=1e-4)
    self.optimizer_critic = optim.Adam(self.critic_net.parameters(),
                                       lr=self.learning_rate_critic,
                                       eps=1e-4)
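# self.polyac is the coefficient used by the soft (Polyak-averaged) target
# updates triggered in run_single_episode. A minimal sketch of such an update,
# assuming the convention target <- polyac * target + (1 - polyac) * online
# (the repository's actual soft_update_target_networks may differ):

def soft_update_target_networks(self):
    for target_net, online_net in [(self.actor_net_target, self.actor_net),
                                   (self.critic_net_target, self.critic_net)]:
        for target_param, online_param in zip(target_net.parameters(),
                                              online_net.parameters()):
            # Move each target parameter a small step towards its online counterpart
            target_param.data.copy_(self.polyac * target_param.data +
                                    (1.0 - self.polyac) * online_param.data)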
def __init__(self,
             input_dim,
             num_actions,
             network_params=None,
             explorer=None,
             set_device=None,
             gradient_clipping_norm=None,
             learning_rate=0.01,
             double_dqn=False,
             seed=1364):
    self.seed = seed

    # Training parameters
    self.gamma = 0.99
    self.batch_size = 256
    self.target_network_update = 10
    self.total_steps_so_far = 0
    self.save_model_frequency = 100
    self.learning_rate = learning_rate
    self.latest_learning_rate = learning_rate
    self.gradient_clipping_norm = gradient_clipping_norm

    # Explorer
    if explorer is None:
        self.explorer = ActionExplorer(epsilon_decay=0.005, seed=seed)
    else:
        self.explorer = explorer

    # Experience Replay Memory
    self.memory_size = 40000
    self.replay_memory = deque([], maxlen=self.memory_size)

    # ----------------------------------------
    # Make the algorithm outputs reproducible
    make_deterministic(seed)
    # ----------------------------------------

    # If GPU is to be used
    if set_device is None:
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
    else:
        self.device = torch.device(set_device)

    # Alternative DQN training (Double DQN)
    self.double_dqn = double_dqn

    # Q-network instantiation
    # (Explanation of the network_params in networks/network_builder.py)
    if network_params is None:
        network_params = {
            'input_dim': input_dim,
            'conv_layers': [(3, 16, 5, 2), (16, 32, 5, 2), (32, 32, 5, 2)],
            'dense_layers': [num_actions],
            'conv_bn': True,
            'activation': 'relu'
        }
    self.policy_net = CreateNet(network_params).to(self.device)
    self.target_net = CreateNet(network_params).to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.learning_rate)
    self.optimizer = optim.Adam(self.policy_net.parameters(),
                                lr=self.learning_rate,
                                eps=1e-4)
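# The double_dqn flag changes how learning_step builds its TD target. An
# illustrative sketch of the difference (the function name compute_td_target
# and the minibatch tensors are hypothetical, not this repository's API):

def compute_td_target(self, rewards, next_states, dones):
    with torch.no_grad():
        if self.double_dqn:
            # Double DQN: the policy network selects the greedy next action,
            # the target network evaluates it.
            next_actions = self.policy_net(next_states).argmax(dim=1, keepdim=True)
            next_q = self.target_net(next_states).gather(1, next_actions).squeeze(1)
        else:
            # Vanilla DQN: the target network both selects and evaluates.
            next_q = self.target_net(next_states).max(dim=1)[0]
        # Bootstrapped one-step target; terminal transitions do not bootstrap.
        return rewards + self.gamma * next_q * (1 - dones.float())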