Example #1
    def __init__(self,
                 episode=500,
                 discount_rate=0.99,
                 epsilon=0.1,
                 beta=0.01,
                 tmax=320,
                 epoch=4,
                 concurrent_agent=8,
                 seed=1231):
        # one Pong environment per concurrent agent, run in parallel
        self.envs = parallelEnv('PongDeterministic-v4', concurrent_agent, seed)
        self.pong_agent = Agent.PongAgent()
        self.time_display = Timer(episode)
        self.episode = episode              # number of training episodes
        self.discount_rate = discount_rate  # reward discount factor
        self.epsilon = epsilon              # PPO clipping parameter
        self.beta = beta                    # entropy regularization weight
        self.tmax = tmax                    # max timesteps per trajectory
        self.epoch = epoch                  # SGD epochs per episode
        # keep track of progress
        self.mean_rewards = []
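
A companion training method is not shown in this example; the sketch below only illustrates how the stored attributes could drive a PPO loop. `pong_utils.collect_trajectories`, the `clipped_surrogate` helper, the Adam optimizer, the `.policy` attribute on `PongAgent`, and the `Timer.update` call are all assumptions, not part of the original class:

    def train(self):
        # sketch only; assumes `import torch`, `import numpy as np`, `import pong_utils`
        # at module level and that the agent exposes its network as `.policy`
        optimizer = torch.optim.Adam(self.pong_agent.policy.parameters(), lr=1e-4)
        for e in range(self.episode):
            # gather trajectories from all parallel environments
            old_probs, states, actions, rewards = \
                pong_utils.collect_trajectories(self.envs, self.pong_agent.policy,
                                                tmax=self.tmax)
            for _ in range(self.epoch):
                # clipped-surrogate PPO objective (assumed helper); minimize -L to ascend
                L = -clipped_surrogate(self.pong_agent.policy, old_probs, states,
                                       actions, rewards,
                                       discount=self.discount_rate,
                                       epsilon=self.epsilon, beta=self.beta)
                optimizer.zero_grad()
                L.backward()
                optimizer.step()
                del L
            # average episode reward across the parallel environments
            self.mean_rewards.append(np.mean(np.sum(rewards, axis=0)))
            # progress display (Timer interface assumed)
            self.time_display.update(e + 1)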
Example #2
from parallelEnv import parallelEnv
import numpy as np
import pong_utils  # provides collect_trajectories, used in the loop below

# keep track of how long training takes
# WARNING: running the full training loop can take 30-45 minutes

# training loop max iterations
episode = 500

# widget bar to display progress
import progressbar as pb

widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA()]
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

envs = parallelEnv('PongDeterministic-v4', n=8, seed=1234)

discount_rate = .99
epsilon = 0.1
beta = .01
tmax = 200
SGD_epoch = 4

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories with the current policy
    # (`policy` is the torch policy network defined elsewhere in the notebook)
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)
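
The example ends right after trajectory collection. A minimal sketch of how the rest of this loop commonly continues, assuming a `clipped_surrogate` helper and an `optimizer` for `policy` are defined elsewhere (both are assumptions, not shown above):

    # total reward per parallel environment for this episode
    total_rewards = np.sum(rewards, axis=0)

    # gradient ascent on the clipped surrogate objective for a few epochs
    for _ in range(SGD_epoch):
        L = -clipped_surrogate(policy, old_probs, states, actions, rewards,
                               discount=discount_rate, epsilon=epsilon, beta=beta)
        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        del L

    # anneal the clipping and entropy parameters over time
    epsilon *= .999
    beta *= .995

    mean_rewards.append(np.mean(total_rewards))

    # update the progress bar
    timer.update(e + 1)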
Example #3
import numpy as np
import torch
from collections import deque

from parallelEnv import parallelEnv
# PPOAgent, collect_trajectories and random_sample are provided by the
# project's own modules (not shown in this snippet).


def train(episode, env_name):
    gamma = .99
    gae_lambda = 0.95
    use_gae = False
    beta = .01
    cliprange = 0.1
    best_score = -np.inf
    goal_score = 195.0

    nenvs = 8
    rollout_length = 200
    minibatches = 10 * 8
    # total number of samples collected per rollout (all environments combined)
    nbatch = nenvs * rollout_length
    optimization_epochs = 4

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    #env= gym.make(env_name)

    envs = parallelEnv(env_name, nenvs, seed=1234)
    agent = PPOAgent(state_size=envs.observation_space.shape[0],
                     action_size=envs.action_space.n,
                     seed=0,
                     hidden_layers=[64, 64],
                     lr_policy=1e-4,
                     use_reset=True,
                     device=device)

    print("------------------")
    print(agent.policy)
    print("------------------")

    # keep track of progress
    mean_rewards = []
    scores_window = deque(maxlen=100)
    loss_storage = []

    for i_episode in range(episode + 1):
        log_probs_old, states, actions, rewards, values, dones, vals_last = collect_trajectories(
            envs, agent.policy, rollout_length)

        returns = np.zeros_like(rewards)
        advantages = np.zeros_like(rewards)

        if not use_gae:
            # plain discounted returns, bootstrapped with the last value estimate;
            # advantage = return - value baseline
            for t in reversed(range(rollout_length)):
                if t == rollout_length - 1:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * vals_last
                else:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * returns[t + 1]
                advantages[t] = returns[t] - values[t]
        else:
            # Generalized Advantage Estimation:
            # A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
            for t in reversed(range(rollout_length)):
                if t == rollout_length - 1:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * vals_last
                    td_error = returns[t] - values[t]
                    advantages[t] = td_error
                else:
                    returns[t] = rewards[t] + gamma * (1 - dones[t]) * returns[t + 1]
                    td_error = rewards[t] + gamma * (
                        1 - dones[t]) * values[t + 1] - values[t]
                    advantages[t] = td_error + gamma * gae_lambda * (
                        1 - dones[t]) * advantages[t + 1]

        # convert to pytorch tensors and move to gpu if available
        returns = torch.from_numpy(returns).float().to(device).view(-1, )
        advantages = torch.from_numpy(advantages).float().to(device).view(-1, )
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-10)

        for _ in range(optimization_epochs):
            sampler = random_sample(nbatch, minibatches)
            # envs.render()  # rendering during the optimization phase is unnecessary and slows training

            for inds in sampler:
                mb_log_probs_old = log_probs_old[inds]
                mb_states = states[inds]
                mb_actions = actions[inds]
                mb_returns = returns[inds]
                mb_advantages = advantages[inds]
                loss_p, loss_v, loss_ent = agent.update(mb_log_probs_old,
                                                        mb_states,
                                                        mb_actions,
                                                        mb_returns,
                                                        mb_advantages,
                                                        cliprange=cliprange,
                                                        beta=beta)
                loss_storage.append([loss_p, loss_v, loss_ent])

        total_rewards = np.sum(rewards, axis=0)
        scores_window.append(np.mean(total_rewards))  # rolling window of the last 100 scores
        mean_rewards.append(np.mean(total_rewards))  # average reward across the parallel environments
        cliprange *= .999  # anneal the PPO clipping parameter over time
        beta *= .999  # anneal the entropy regularization term

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            print(total_rewards)
        mean_window_score = np.mean(scores_window)
        if mean_window_score >= goal_score and mean_window_score >= best_score:
            torch.save(agent.policy.state_dict(), "policy_cartpole.pth")
            best_score = mean_window_score

    return mean_rewards, loss_storage
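
`random_sample` is used inside `train` but not defined in the snippet. Below is a minimal sketch of a generator with a compatible contract, assuming the second argument is the number of minibatches (in the original project it could equally be the minibatch size):

def random_sample(nbatch, minibatches):
    """Yield `minibatches` disjoint arrays of shuffled indices covering range(nbatch)."""
    indices = np.random.permutation(nbatch)
    batch_size = nbatch // minibatches
    for start in range(0, batch_size * minibatches, batch_size):
        yield indices[start:start + batch_size]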
Example #4
        # WARNING: running through all 800 episodes will take 30-45 minutes

        # training loop max iterations
        #episode = 500
        episode = 800
        nr_workers = 8

        ## widget bar to display progress
        #!pip install progressbar
        #import progressbar as pb
        #widget = ['training loop: ', pb.Percentage(), ' ',
        #          pb.Bar(), ' ', pb.ETA() ]
        #timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

        # initialize environment
        envs = parallelEnv('PongDeterministic-v4', n=nr_workers, seed=1234)

        discount_rate = .99
        beta = .01
        tmax = 320
        epsilon = 0.1
        use_ppo = False
        SGD_PPO_epochs = 4
        title = "PPO_PG" if use_ppo else "REINFORCE"

        title += "_{}_workers".format(nr_workers)

        print("Starting {} for {} episodes".format(title, episode))

        # keep track of progress
        mean_rewards = []
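
        The loop that consumes these settings is not included in the snippet. One way the `use_ppo` switch might be applied is sketched below, assuming `policy`, `optimizer`, `pong_utils.collect_trajectories`, and `surrogate`/`clipped_surrogate` helpers in the style of the Pong exercises (all assumptions, not part of the original code):

        for e in range(episode):
            old_probs, states, actions, rewards = \
                pong_utils.collect_trajectories(envs, policy, tmax=tmax)

            for _ in range(SGD_PPO_epochs):
                if use_ppo:
                    # PPO: clipped surrogate objective
                    L = -clipped_surrogate(policy, old_probs, states, actions, rewards,
                                           discount=discount_rate, epsilon=epsilon,
                                           beta=beta)
                else:
                    # REINFORCE: plain policy-gradient surrogate
                    L = -surrogate(policy, old_probs, states, actions, rewards,
                                   discount=discount_rate, beta=beta)
                optimizer.zero_grad()
                L.backward()
                optimizer.step()

            mean_rewards.append(np.mean(np.sum(rewards, axis=0)))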
Example #5
                                             n=16))  # get first state

# In[150]:

state.shape

# ## Parallelization

# In[140]:

# load multiple parallel environments, in this case 16.
envs = parallelEnv(env_id, n=16, seed=1234)

# In[141]:

train(envs, main_model, optimizer, num_episodes=100, print_every=5)

# In[22]:

plt.plot(overall_cost)
plt.xlabel("Episode #")
plt.ylabel("Overall cost")
plt.show()

# In[23]:

plt.plot(np.array(overall_reward).T[0])  # reward for agent #1
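
The last cell plots only agent #1. A small variation that averages over all 16 parallel agents, assuming `overall_reward` holds one length-16 reward array per episode (an assumption about its shape):

# In[ ]:

rewards_per_agent = np.array(overall_reward)  # assumed shape: (num_episodes, 16)
plt.plot(rewards_per_agent.mean(axis=1))
plt.xlabel("Episode #")
plt.ylabel("Mean reward across 16 agents")
plt.show()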