def train(self):
    for e in range(self.episode):
        # collect trajectories
        old_probs, states, actions, rewards = pong_utils.collect_trajectories(
            self.envs, self.pong_agent.policy, tmax=self.tmax)
        total_rewards = np.sum(rewards, axis=0)

        self.pong_agent.train(self.epoch, old_probs, states, actions, rewards,
                              epsilon=self.epsilon, beta=self.beta)

        # the clipping parameter reduces as time goes on
        self.epsilon *= .999

        # the regularization term also reduces
        # this reduces exploration in later runs
        self.beta *= .995

        # get the average reward of the parallel environments
        self.mean_rewards.append(np.mean(total_rewards))

        self.time_display.display(e, total_rewards)

    self.time_display.timer.finish()
    torch.save(self.pong_agent.policy, 'PongAgent.policy')
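With those decay factors the schedule is geometric; as a quick sanity check, here is where the clipping parameter and the regularization weight end up after the 500 episodes used in the scripts below (starting values 0.1 and 0.01, as set there):

epsilon, beta = 0.1, 0.01
for _ in range(500):
    epsilon *= .999   # clipping parameter decay
    beta *= .995      # regularization weight decay
print(epsilon, beta)  # roughly 0.0606 and 0.00082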
    surrogates = (R_future * ratio_PPO).mean()

    # include a regularization term
    # this steers new_policy towards 0.5
    # which prevents the policy from becoming exactly 0 or 1
    # this helps with exploration
    # add in 1.e-10 to avoid log(0), which gives nan
    # entropy = -(new_probs*torch.log(old_probs+1.e-10) +
    #             (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))
    # surrogates += torch.mean(beta*entropy)

    return surrogates


envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs, policy, tmax=100)

Lsur = clipped_surrogate(policy, prob, state, action, reward)
print(Lsur)


from parallelEnv import parallelEnv
import numpy as np

# keep track of how long training takes
# WARNING: running through all episodes will take 30-45 minutes

# training loop max iterations
episode = 500

# widget bar to display progress
import progressbar as pb
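The clipping step that produces `ratio_PPO` is not shown in the snippet above. A minimal sketch of the standard PPO clipped objective, assuming `new_probs`, `old_probs`, and the normalized future rewards `R_future` are already tensors of the same shape (the real `clipped_surrogate` also recomputes `new_probs` from the policy), could look like this:

import torch

def clipped_surrogate_sketch(new_probs, old_probs, R_future, epsilon=0.1):
    # importance-sampling ratio between the current and the old policy
    ratio = new_probs / (old_probs + 1.e-10)
    # clip the ratio so a single update cannot move the policy too far
    clipped_ratio = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # take the elementwise minimum of the two weighted objectives
    return torch.min(ratio * R_future, clipped_ratio * R_future).mean()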
model = ActorCritic().to(device)  # forward returns dist, v
if args.load_weight:
    model.load_state_dict(
        torch.load(f'PongDeterministic-v4_{load_weight_n}.pth'))
optimizer = optim.Adam(model.parameters(), lr=lr)

f1 = envs.reset()
f2 = envs.step([0] * num_envs)

if __name__ == "__main__":
    while not early_stop and frame_idx < max_frames:
        frame_idx += 1
        print(frame_idx)
        if frame_idx % 100 == 0:
            num_steps += args.additional_num_step

        log_probs, states, actions, rewards, next_state, masks, values = \
            collect_trajectories(envs, model, num_steps)
        scores = np.asarray(rewards).sum(axis=0)
        scores_list.append(scores.mean())
        print("Mean:", scores.mean(), "\nRaw:", scores)

        # stop if any of the trajectories is done
        # we want all the lists to be rectangular
        for _ in range(n_updates):
            # uncomment to utilize your own clipped function!
            # raise Exception(type(states), states[0].size())
            if args.beta_decay and beta > 0.01:
                beta *= discount
            L = -clipped_surrogate(model, log_probs, states, actions, rewards,
                                   discount, epsilon, beta)
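The `masks` collected above are typically used to stop bootstrapping at episode boundaries when forming returns for the critic. A minimal sketch under that assumption (the helper name `compute_returns` is hypothetical and not part of the script):

def compute_returns(next_value, rewards, masks, gamma=0.99):
    # walk the trajectory backwards; mask == 0 cuts the bootstrap at a done flag
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns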
# envs = parallelEnv('PongDeterministic-v4', n=8, seed=1234)

discount_rate = .99
beta = .01
tmax = 200
SGD_epoch = 4
epsilon = 0.1
max_num_ac = 5
min_num_ac = 5

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)

    if len(envs.ps[0].aircrafts) > max_num_ac:
        max_num_ac = len(envs.ps[0].aircrafts)
    if len(envs.ps[0].aircrafts) < min_num_ac:
        min_num_ac = len(envs.ps[0].aircrafts)

    total_rewards = np.sum(rewards, axis=0)

    # gradient ascent step
    for _ in range(SGD_epoch):

        # uncomment to utilize your own clipped function!
        # L = -clipped_surrogate(policy, old_probs, states, actions,
        #                        rewards, epsilon=epsilon, beta=beta)

        L = -pong_utils.clipped_surrogate(policy, old_probs, states,
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()'''

discount_rate = .99
beta = .01
value_coef = .5
tmax = 200
SGD_epoch = 4
epsilon = 0.1

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)

    total_rewards = np.sum(rewards, axis=0)

    # gradient ascent step
    for _ in range(SGD_epoch):
        L = -pong_utils.clipped_surrogate(policy, old_probs, states,
                                          actions, rewards,
                                          epsilon=epsilon, beta=beta,
                                          value_coef=value_coef)
        optimizer.zero_grad()
        L.backward()
def main(loop):
    beta = .01
    tmax = int(250 / Env.vw)
    SGD_epoch = 4
    epsilon = 0.1
    episode = 500

    envs = Env.envs()

    # check which device is being used.
    # I recommend disabling gpu until you've made sure that the code runs
    device = pong_utils.device
    print("using device: ", device)

    # keep track of progress
    mean_rewards = []

    policy = pong_utils.Policy().to(device)

    # we use the Adam optimizer with learning rate 1e-4
    # optim.SGD is also possible
    optimizer = optim.Adam(policy.parameters(), lr=1e-4)

    for e in range(episode):

        # collect trajectories
        old_probs, states, actions, rewards = \
            pong_utils.collect_trajectories(envs, policy, tmax=tmax)

        total_rewards = np.sum(rewards, axis=0)

        # gradient ascent step
        for _ in range(SGD_epoch):

            # uncomment to utilize your own clipped function!
            # L = -clipped_surrogate(policy, old_probs, states, actions,
            #                        rewards, epsilon=epsilon, beta=beta)

            L = -pong_utils.clipped_surrogate(policy, old_probs, states,
                                              actions, rewards,
                                              epsilon=epsilon, beta=beta)
            optimizer.zero_grad()
            L.backward()
            optimizer.step()
            del L

        # the clipping parameter reduces as time goes on
        epsilon *= .999

        # the regularization term also reduces
        # this reduces exploration in later runs
        beta *= .995

        # get the average reward of the parallel environments
        mean_rewards.append(np.mean(total_rewards))

        # display some progress every 20 iterations
        if (e + 1) % 20 == 0:
            print("Episode: {0:d}, score: {1:f}".format(
                e + 1, np.mean(total_rewards)))
            print(total_rewards)

    env = envs.ps[0]

    mean_rewards = np.array(mean_rewards)
    np.savetxt('data_{}.csv'.format(loop), mean_rewards, newline='\n')
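The reward curve saved by `main` can be inspected afterwards; a minimal sketch of loading and plotting one of the saved files (assuming `main(0)` was run, so the file is `data_0.csv`):

import numpy as np
import matplotlib.pyplot as plt

mean_rewards = np.loadtxt('data_0.csv')   # one value per episode
plt.plot(mean_rewards)
plt.xlabel('episode')
plt.ylabel('mean reward across parallel environments')
plt.show()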
import pong_utils

device = pong_utils.device
print("using device: ", device)

import gym

env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

# The actions 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5 make the game restart if done

import matplotlib.pyplot as plt
from agent import Policy

agent = Policy()
agent = agent.to(device)

pong_utils.play(env, agent, time=100)

envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs, agent, tmax=100)
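In this exercise the policy outputs a single probability of moving right, so turning that output into one of the two fire actions above can look like the sketch below (the helper name `sample_action` is hypothetical; RIGHTFIRE = 4 and LEFTFIRE = 5 as noted in the comment):

import numpy as np

RIGHTFIRE, LEFTFIRE = 4, 5

def sample_action(p_right):
    # p_right: scalar probability the policy assigns to RIGHTFIRE
    return RIGHTFIRE if np.random.rand() < p_right else LEFTFIRE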
        x = self.conv_2(x)
        x = self.relu(x)
        x = self.maxpool(x)

        # MLP
        x = x.view(-1, 9248)  # flatten the tensor
        return self.sig(self.fc(x))  # P(left) = 1-P(right)


policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

#%% Trajectories rollout
envs = pong_utils.parallelEnv('PongDeterministic-v4', n=8, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs, policy, tmax=100)

#%% Function Definitions
def surrogate(policy, old_probs, states, actions, rewards,
              discount=0.995, beta=0.01):

    discount = discount**np.arange(len(rewards))
    rewards = np.asarray(rewards)*discount[:, np.newaxis]

    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]

    # Normalize rewards
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10