def __init__(self, episode=500, discount_rate=0.99, epsilon=0.1, beta=0.01,
             tmax=320, epoch=4, concurrent_agent=8, seed=1231):
    # parallel Pong environments, one per concurrent agent
    self.envs = parallelEnv('PongDeterministic-v4', concurrent_agent, seed)
    self.pong_agent = Agent.PongAgent()
    self.time_display = Timer(episode)
    # training hyperparameters
    self.episode = episode
    self.discount_rate = discount_rate
    self.epsilon = epsilon          # PPO clipping parameter
    self.beta = beta                # entropy regularization weight
    self.tmax = tmax                # max time steps per trajectory
    self.epoch = epoch              # SGD epochs per batch of trajectories
    # keep track of progress
    self.mean_rewards = []
from parallelEnv import parallelEnv
import numpy as np
import pong_utils

# keep track of how long training takes
# WARNING: running through all episodes will take 30-45 minutes

# training loop max iterations
episode = 500

# widget bar to display progress
import progressbar as pb
widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA()]
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

envs = parallelEnv('PongDeterministic-v4', n=8, seed=1234)

discount_rate = .99
epsilon = 0.1      # PPO clipping parameter
beta = .01         # entropy regularization weight
tmax = 200         # max time steps per trajectory
SGD_epoch = 4      # SGD epochs per batch of trajectories

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories (`policy` is the network defined in an earlier cell)
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)
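# The cell above ends right after trajectory collection; the rest of the loop
# maximizes a clipped surrogate objective built from `old_probs`, the current
# policy's probabilities, and the (normalized) future rewards. The code below
# is a self-contained sketch of that objective on dummy tensors, not the
# notebook's own pong_utils implementation; the function name and signature
# here are illustrative only.
import torch

def clipped_surrogate(old_probs, new_probs, advantages, epsilon=0.1):
    """PPO clipped surrogate: mean of min(ratio * A, clip(ratio, 1-eps, 1+eps) * A)."""
    ratio = new_probs / old_probs
    clipped_ratio = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return torch.min(ratio * advantages, clipped_ratio * advantages).mean()

# quick sanity check on dummy data standing in for a real trajectory batch
old_probs = torch.rand(8) * 0.8 + 0.1    # probabilities under the old policy
new_probs = torch.rand(8) * 0.8 + 0.1    # probabilities under the current policy
advantages = torch.randn(8)              # normalized future rewards
print(clipped_surrogate(old_probs, new_probs, advantages))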
from collections import deque

import numpy as np
import torch

# parallelEnv, PPOAgent, collect_trajectories and random_sample are
# project-local helpers imported elsewhere in this repository.


def train(episode, env_name):
    gamma = .99
    gae_lambda = 0.95
    use_gae = False
    beta = .01
    cliprange = 0.1
    best_score = -np.inf
    goal_score = 195.0

    nenvs = 8
    rollout_length = 200
    minibatches = 10 * 8
    # Calculate the batch_size
    nbatch = nenvs * rollout_length
    optimization_epochs = 4

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    #env = gym.make(env_name)
    envs = parallelEnv(env_name, nenvs, seed=1234)

    agent = PPOAgent(state_size=envs.observation_space.shape[0],
                     action_size=envs.action_space.n,
                     seed=0,
                     hidden_layers=[64, 64],
                     lr_policy=1e-4,
                     use_reset=True,
                     device=device)

    print("------------------")
    print(agent.policy)
    print("------------------")

    # keep track of progress
    mean_rewards = []
    scores_window = deque(maxlen=100)
    loss_storage = []

    for i_episode in range(episode + 1):
        log_probs_old, states, actions, rewards, values, dones, vals_last = \
            collect_trajectories(envs, agent.policy, rollout_length)

        returns = np.zeros_like(rewards)
        advantages = np.zeros_like(rewards)

        if not use_gae:
            # plain discounted returns; advantage = return - value baseline
            for t in reversed(range(rollout_length)):
                if t == rollout_length - 1:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * vals_last
                else:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * returns[t + 1]
                advantages[t] = returns[t] - values[t]
        else:
            # Generalized Advantage Estimation (GAE)
            for t in reversed(range(rollout_length)):
                if t == rollout_length - 1:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * vals_last
                    td_error = returns[t] - values[t]
                    advantages[t] = td_error
                else:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * returns[t + 1]
                    td_error = rewards[t] + gamma * (1 - dones[t]) * values[t + 1] - values[t]
                    # recurse on the advantage of the *next* time step
                    advantages[t] = advantages[t + 1] * gae_lambda * gamma * (1 - dones[t]) + td_error

        # convert to pytorch tensors and move to gpu if available
        returns = torch.from_numpy(returns).float().to(device).view(-1)
        advantages = torch.from_numpy(advantages).float().to(device).view(-1)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)

        for _ in range(optimization_epochs):
            sampler = random_sample(nbatch, minibatches)
            envs.render()
            for inds in sampler:
                mb_log_probs_old = log_probs_old[inds]
                mb_states = states[inds]
                mb_actions = actions[inds]
                mb_returns = returns[inds]
                mb_advantages = advantages[inds]
                loss_p, loss_v, loss_ent = agent.update(mb_log_probs_old,
                                                        mb_states,
                                                        mb_actions,
                                                        mb_returns,
                                                        mb_advantages,
                                                        cliprange=cliprange,
                                                        beta=beta)
                loss_storage.append([loss_p, loss_v, loss_ent])

        total_rewards = np.sum(rewards, axis=0)
        scores_window.append(np.mean(total_rewards))  # last 100 scores
        mean_rewards.append(np.mean(total_rewards))   # average reward of the parallel environments

        cliprange *= .999  # the clipping parameter reduces as time goes on
        beta *= .999       # the regularization term reduces as time goes on

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            print(total_rewards)

        if np.mean(scores_window) >= goal_score and np.mean(scores_window) >= best_score:
            torch.save(agent.policy.state_dict(), "policy_cartpole.pth")
            best_score = np.mean(scores_window)

    return mean_rewards, loss_storage
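# random_sample(nbatch, minibatches) is a project-local helper that is not
# shown here. The sketch below captures what it is assumed to do, namely yield
# arrays of shuffled indices that partition the nbatch collected samples;
# whether the second argument is the number of minibatches (as the variable
# name suggests) or a minibatch size depends on the actual helper.
import numpy as np

def random_sample(nbatch, minibatches):
    """Assumed stand-in for the project-local helper used in train().

    Yields `minibatches` arrays of shuffled indices that together cover
    (most of) range(nbatch); the real implementation may differ.
    """
    indices = np.random.permutation(nbatch)
    batch_size = nbatch // minibatches
    for start in range(0, batch_size * minibatches, batch_size):
        yield indices[start:start + batch_size]

# example: 1600 collected samples split into 80 shuffled minibatches of 20
for batch_indices in random_sample(nbatch=1600, minibatches=80):
    pass  # each `batch_indices` would index into states, actions, returns, ...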
# WARNING: running through all 800 episodes will take 30-45 minutes

# training loop max iterations
#episode = 500
episode = 800
nr_workers = 8

## widget bar to display progress
#!pip install progressbar
#import progressbar as pb
#widget = ['training loop: ', pb.Percentage(), ' ',
#          pb.Bar(), ' ', pb.ETA()]
#timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

# initialize environment
envs = parallelEnv('PongDeterministic-v4', n=nr_workers, seed=1234)

discount_rate = .99
beta = .01
tmax = 320
epsilon = 0.1

use_ppo = False
SGD_PPO_epochs = 4

title = "PPO_PG" if use_ppo else "REINFORCE"
title += "_{}_workers".format(nr_workers)
print("Starting {} for {} episodes".format(title, episode))

# keep track of progress
mean_rewards = []
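# Both the REINFORCE and PPO variants configured above weight each action by
# its discounted future reward, normalized across the parallel workers. The
# code below is a self-contained numpy sketch of that step, not the notebook's
# own implementation; the function name is illustrative only.
import numpy as np

def normalized_future_rewards(rewards, gamma=0.99):
    """rewards: array of shape (tmax, n_workers).

    Returns the discounted reward-to-go R_t = sum_{k>=t} gamma^(k-t) * r_k,
    normalized across workers at every time step.
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    future = np.zeros_like(rewards)
    running = np.zeros(rewards.shape[1])
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        future[t] = running
    mean = future.mean(axis=1, keepdims=True)
    std = future.std(axis=1, keepdims=True) + 1.0e-10
    return (future - mean) / std

# example: random rewards for tmax=320 steps and 8 workers
r = np.random.randn(320, 8)
print(normalized_future_rewards(r, gamma=0.99).shape)   # -> (320, 8)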
n=16))

# get first state

# In[150]:

state.shape

# ## Parallelization

# In[140]:

# load multiple parallel agents, in this case 16.
envs = parallelEnv(env_id, n=16, seed=1234)

# In[141]:

train(envs, main_model, optimizer, num_episodes=100, print_every=5)

# In[22]:

plt.plot(overall_cost)
plt.xlabel("Episode #")
plt.ylabel("Overall cost")
plt.show()

# In[23]:

plt.plot(np.array(overall_reward).T[0])  # reward for agent #1
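# The last cell plots the reward for agent #1 only; to gauge overall progress,
# one could also plot the per-episode mean across all 16 workers. This sketch
# assumes, as above, that overall_reward is a list of per-episode reward
# vectors of length 16.

# In[ ]:

rewards = np.array(overall_reward)       # assumed shape: (episodes, n_workers)
plt.plot(rewards.mean(axis=1))           # average reward over the 16 parallel agents
plt.xlabel("Episode #")
plt.ylabel("Mean reward (all agents)")
plt.show()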