def ddpg(agent_instance, print_every=100):
    # env, brain_name, num_agents, action_tensor_size and update_count are
    # assumed to be defined earlier in the notebook
    scores_deque = deque(maxlen=print_every)
    scores_deque.append(0)  # seed the deque so np.mean() is defined on the first check
    n_episodes = 0
    start_time = time.time()  # start time for printing
    history = []
    # generate one agent per environment agent from the shared instance kwargs
    agent_obj = [ddpg_agent.Agent(**agent_instance) for _ in range(num_agents)]
    while np.mean(scores_deque) < 0.8:
        n_episodes += 1
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)          # initialize the score (for each agent)
        for agent in agent_obj:
            agent.reset()
        learn_count = 0
        while True:
            learn_count += 1
            # select an action (for each agent)
            actions = np.array([agent_obj[i].act(states[i], add_noise=True)
                                for i in range(num_agents)])
            # flatten the per-agent actions into the single (1, action_tensor_size)
            # row the environment expects, then send them all at once
            env_info = env.step(actions.reshape(1, action_tensor_size))[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            for i in range(num_agents):
                agent_obj[i].step(states[i], actions[i], rewards[i],
                                  next_states[i], dones[i], learn_count, update_count)
            states = next_states  # roll over states to next time step
            scores += rewards     # update the score (for each agent)
            if np.any(dones):     # exit loop if episode finished
                break
        scores_deque.append(np.max(scores))
        history.append(np.mean(scores))
        delta_time = str(timedelta(seconds=time.time() - start_time))  # elapsed time
        for count, agent in enumerate(agent_obj, start=1):
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_%s.pth' % count)
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_%s.pth' % count)
        if n_episodes % print_every == 0:
            print("\rEpisode: {}\tHighest Score {: .2f}\tAverage score for last {} episodes was {: .2f}\tTime: {:.9}".format(
                n_episodes, np.max(scores_deque), print_every, np.mean(scores_deque), delta_time))
    print("\n\rLast Episode: {}\tHighest Score {: .2f}\tAverage score for last {} episodes was {: .2f}\tTime: {:.9}".format(
        n_episodes, np.max(scores_deque), print_every, np.mean(scores_deque), delta_time))
    return history
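# A quick check of the action-flattening trick used above: the per-agent
# actions are stacked and reshaped into the single row that env.step() expects.
# Shapes here are illustrative; action_tensor_size = num_agents * action_size.
import numpy as np

demo_actions = np.array([[0.1, -0.2],
                         [0.3, 0.4]])   # (num_agents, action_size) = (2, 2)
print(demo_actions.reshape(1, -1))      # [[ 0.1 -0.2  0.3  0.4]] -> shape (1, 4)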
def ddpg(agent_instance, print_every=100):
    # generate one agent per environment agent and restore the saved weights
    agent_obj = [ddpg_agent.Agent(**agent_instance) for _ in range(num_agents)]
    for count, agent in enumerate(agent_obj, start=1):
        agent.actor_local.load_state_dict(torch.load('checkpoint_actor_%s.pth' % count))
        agent.critic_local.load_state_dict(torch.load('checkpoint_critic_%s.pth' % count))
    while True:  # replay episodes until interrupted
        env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)          # initialize the score (for each agent)
        for agent in agent_obj:
            agent.reset()
        while True:
            # select an action (for each agent), without exploration noise
            actions = np.array([agent_obj[i].act(states[i], add_noise=False)
                                for i in range(num_agents)])
            # send all actions to the environment in the flattened layout it expects
            env_info = env.step(np.reshape(np.concatenate((actions[0], actions[1]), axis=0),
                                           (1, action_tensor_size)))[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            dones = env_info.local_done                 # see if episode finished
            states = next_states  # roll over states to next time step
            if np.any(dones):     # exit loop if episode finished
                break
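# Minimal invocation sketch for the watcher above. The kwargs are hypothetical --
# use whatever your ddpg_agent.Agent constructor actually takes:
agent_instance = {"state_size": state_size,
                  "action_size": action_size,
                  "random_seed": 0}
ddpg(agent_instance)  # runs until you interrupt the kernel/process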
def ddpg_runner(args):
    env = args['environment']
    brain_name = args['brain_name']
    scores = []
    agent = ddpg_agent.Agent(**args['agent_args'])  # unpack the agent kwargs
    achievement = args['achievement']
    achievement_length = args['achievement_length']
    scores_deque = deque(maxlen=achievement_length)
    for i_episode in range(1, args['episodes'] + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        score = 0
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done  # see if episode finished
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(rewards)
            if np.any(dones):
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if np.mean(scores_deque) > achievement:  # solved: save and stop early
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            return scores
    return scores
def ddpg(args):
    scores_deque = deque(maxlen=args['maxlen'])
    env = args['environment']
    brain_name = args['brain_name']
    scores = []
    agent = ddpg_agent.Agent(**args['agent_args'])  # unpack the agent kwargs
    for i_episode in range(1, args['episodes'] + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        score = 0
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            # reward shaping: replace zero rewards with a small penalty so the
            # agent is nudged toward reward-bearing behaviour while learning
            learning_rewards = [-0.0001 if r == 0 else r for r in rewards]
            dones = env_info.local_done  # see if episode finished
            agent.step(states, actions, learning_rewards, next_states, dones)
            states = next_states
            score += np.mean(rewards)  # the reported score still uses the raw rewards
            if np.any(dones):
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)))
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if np.mean(scores_deque) > 30:  # Reacher is considered solved at +30
            return scores
    return scores
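# Reward-shaping check: zero rewards become a small penalty, non-zero rewards
# pass through unchanged (values here are just an illustration):
rewards = [0.0, 0.04, 0.0]
print([-0.0001 if r == 0 else r for r in rewards])  # [-0.0001, 0.04, -0.0001]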
def main(args):
    env = Tennis(args.env_path)
    config = {
        'state_size': env.state_size,
        'action_size': env.action_size,
        'reward_accum_steps': 1000,
        'random_seed': 1,
        'gamma': 0.99,
        'update_cycle': 400,
        'update_times': 10,
        'buffer_size': int(1e6),
        'batch_size': 1024,
        'warm_start_size': 1024,
        'n_episode': 1000000,
        'max_t': 1000,
        'window_size': 100,
        'ckpt_prefix': 'checkpoint',
        'reset_cycle': 20000,
    }
    agents = [ddpg_agent.Agent(**config) for _ in range(env.num_agents)]
    if args.train:
        scores = maddpg(agents, env, **config)
        plot(scores, args.png_path)
    if args.show:
        for agent_i, agent in enumerate(agents):
            # the map_location lambda loads the tensors onto the CPU regardless
            # of the device the checkpoint was saved from
            agent.actor_local.load_state_dict(
                torch.load('checkpoint_actor_{}.pth'.format(agent_i),
                           map_location=lambda storage, loc: storage))
            agent.critic_local.load_state_dict(
                torch.load('checkpoint_critic_{}.pth'.format(agent_i),
                           map_location=lambda storage, loc: storage))
        show(agents, env)
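# A sketch of the CLI this main() assumes -- only the four attributes it
# actually reads (env_path, png_path, train, show); hypothetical, adjust to taste.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_path', type=str, required=True)         # path to the Tennis build
parser.add_argument('--png_path', type=str, default='scores.png')  # where to save the score plot
parser.add_argument('--train', action='store_true')                # train and plot scores
parser.add_argument('--show', action='store_true')                 # replay saved checkpoints
main(parser.parse_args())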
    return agent


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--max_minutes', type=int, default=60)
    parser.add_argument('--max_episodes', type=int, default=2000)
    parser.add_argument('--name', type=str, default='player')
    parser.add_argument('--memory_size', type=int, default=int(1e5))
    parser.add_argument('--warm_up', type=int, default=int(1e4))
    parser.add_argument('--batch_size', type=int, default=1024)
    parser.add_argument('--discount', type=float, default=0.995)
    parser.add_argument('--tau', type=float, default=1e-3)
    parser.add_argument('--gradient_clip', type=float, default=1)
    parser.add_argument('--random_process', type=str, default='gaussian')
    parser.add_argument('--random_theta', type=float, default=0.1)
    parser.add_argument('--random_std', type=float, default=1)
    parser.add_argument('--random_std_decay', type=float, default=0.999)
    parser.add_argument('--update_every', type=int, default=5)
    parser.add_argument('--update_epochs', type=int, default=5)
    parser.add_argument('--h1_size', type=int, default=256)
    parser.add_argument('--h2_size', type=int, default=256)
    parser.add_argument('--actor_lr', type=float, default=1e-3)
    parser.add_argument('--critic_lr', type=float, default=1e-4)
    args = parser.parse_args()

    params = vars(args)  # argparse namespace as a plain dict
    print(f'Train agent with params: {params}')
    agent = train(ddpg_agent.Agent(gym_env, state_size, action_size, params),
                  args.max_minutes, args.max_episodes)
    agent.save()
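# Example invocation, assuming the script above is saved as train.py
# (a hypothetical file name):
#   python train.py --max_episodes 500 --batch_size 512 --random_std_decay 0.995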
        if np.mean(scores_deque) >= checkpoint_score:
            checkpt = "Episode" + str(i_episode)
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            agent.checkpoint(checkpt)
            break
    return scores


# In[49]:

agent = ddpg_agent.Agent(state_size=state_size,
                         action_size=action_size,
                         random_seed=0,
                         num_envs=num_envs,
                         checkpt_folder="MultiEnvCheckPt")


# In[50]:

rr_scores = train(env=env, agent=agent)  # multiple parallel envs


# In[51]:

plot_scores(rr_scores)  # random replay scores


# When finished, you can close the environment.

# In[6]:
from collections import deque
import matplotlib.pyplot as plt
#%matplotlib inline
import importlib

from ddpg_agent import Agent
import ddpg_agent
import model

av_reward = deque(maxlen=100)
importlib.reload(ddpg_agent)
#importlib.reload(model)

agent_list = []
agent_list.append(ddpg_agent.Agent(state_size=24, action_size=2, random_seed=100))
for a in range(1):  # one more agent, i.e. two in total for Tennis
    agent = ddpg_agent.Agent(state_size=24, action_size=2, random_seed=a)
    #agent.memory = agent_list[0].memory  # optionally share one replay buffer
    agent_list.append(agent)

num_episodes = 10000
max_t = 10000
training_reward_list = []
best_score = 0

for episode in range(num_episodes):
    print(episode)
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)          # initialize the score (for each agent)
    time_step = 0
    for agent in agent_list:
        agent.reset()  # reset exploration noise each episode (the loop body was
                       # lost in the source; this matches the sibling snippets)
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)

# size of each action
action_size = brain.vector_action_space_size

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]

agent = ddpg_agent.Agent(num_agents=num_agents,
                         state_size=state_size,
                         action_size=action_size,
                         random_seed=31337)


def tick_simulation(actions):
    # advance the environment one step and unpack the fields we care about
    env_info = env.step(actions)[brain_name]
    return env_info.vector_observations, env_info.rewards, env_info.local_done


def plot_scores(scores, mean_scores):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.plot(np.arange(1, len(mean_scores) + 1), mean_scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
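# A minimal rollout sketch using tick_simulation() above -- an illustration
# only; it assumes agent.act() accepts the batched states (as in the other
# snippets here) and caps the episode at 1000 steps:
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)
for t in range(1000):
    actions = agent.act(states)
    states, rewards, dones = tick_simulation(actions)
    scores += rewards
    if np.any(dones):
        break
print('Episode score (mean over agents): {:.2f}'.format(np.mean(scores)))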
config.replay_fn = lambda: Replay(config.action_size,
                                  buffer_size=int(1e6),
                                  batch_size=128)
config.noise_fn = lambda: OUNoise(config.action_size,
                                  mu=0., theta=0.15, sigma=0.1,
                                  seed=config.seed)
config.discount = 0.99
config.target_mix = 3e-3  # soft-update interpolation factor (tau)
config.max_episodes = 3000
config.max_steps = int(1e6)
config.goal_score = 1
config.CHECKPOINT_FOLDER = "MultiAgentCheckPt"

maddpg_agent = ddpg_agent.Agent(config=config)


# In[21]:

ddpg_scores, ddpg_avg_scores = train(env=env, agent=maddpg_agent, config=config)  # multiple parallel envs


# In[26]:

plot_scores(ddpg_scores, ddpg_avg_scores)  # random replay scores


# When finished, you can close the environment.

# In[6]:
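# For reference, a minimal Ornstein-Uhlenbeck noise sketch matching the
# OUNoise(size, mu, theta, sigma, seed) signature used above -- an assumption
# about the class, not its actual implementation:
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, mu=0., theta=0.15, sigma=0.1, seed=0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        self.state = self.mu.copy()  # restart the process at the mean each episode

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting, temporally
        # correlated noise, commonly added to DDPG actions for exploration
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * self.rng.standard_normal(len(self.state))
        self.state += dx
        return self.state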
def ddpg(agent_instance, print_every=1000):
    scores_deque0 = deque(maxlen=print_every)
    scores_deque0.append(0)  # seed the deques so np.mean() is defined on the first check
    scores_deque1 = deque(maxlen=print_every)
    scores_deque1.append(0)
    n_episodes = 0
    start_time = time.time()  # start time for printing
    history0 = []
    history1 = []
    # generate one agent per environment agent from the shared instance kwargs
    agent_obj = [ddpg_agent.Agent(**agent_instance) for _ in range(num_agents)]
    while np.mean(scores_deque0) < 0.8:
        n_episodes += 1
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        # get the current state (for each agent); the reshape target was lost in
        # the source -- (num_agents, -1) keeps the per-agent indexing used below
        states = np.reshape(env_info.vector_observations, (num_agents, -1))
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        for agent in agent_obj:
            agent.reset()
        learn_count = 0
        while True:
            learn_count += 1
            # select an action (for each agent)
            actions = np.array([agent_obj[i].act(states[i], i) for i in range(num_agents)])
            # send all actions to the environment in the flattened (1, 4) layout
            env_info = env.step(np.reshape(np.concatenate((actions[0], actions[1]), axis=0),
                                           (1, 4)))[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            # each agent sees the full joint state; note this must be a plain
            # loop -- the original generator expression never executed
            for i in range(num_agents):
                agent_obj[i].step(states, actions[i], rewards[i], next_states,
                                  dones[i], learn_count, update_count, i)
            states = next_states  # roll over states to next time step
            scores += rewards     # update the score (for each agent)
            if np.any(dones):     # exit loop if episode finished
                break
        print('\rEpisode {}\tActor 1s score was {: .2f}\tActor 2s score was {: .2f}'.format(
            n_episodes, scores[0], scores[1]), end=" ")
        scores_deque0.append(np.max(scores))
        scores_deque1.append(scores[1])
        history0.append(scores[0])
        history1.append(scores[1])
        delta_time = str(timedelta(seconds=time.time() - start_time))  # elapsed time
        for i in range(num_agents):  # again plain loops, not generator expressions
            torch.save(agent_obj[i].actor_local.state_dict(), 'checkpoint_actor%s.pth' % i)
            torch.save(agent_obj[i].critic_local.state_dict(), 'checkpoint_critic%s.pth' % i)
        if n_episodes % print_every == 0:
            print("\n\rEpisode: {}\tActor 1s avg score was {: .2f}\tActor 2s avg score was {: .2f}\tA1 Max {}\tTime: {:.9}".format(
                n_episodes, np.mean(scores_deque0), np.mean(scores_deque1), np.max(scores_deque0), delta_time))
    print("\n\rEpisode: {}\tActor 1s avg score was {: .2f}\tActor 2s avg score was {: .2f}\tA1 Max {}\tTime: {:.9}".format(
        n_episodes, np.mean(scores_deque0), np.mean(scores_deque1), np.max(scores_deque0), delta_time))
    return history0, history1


agent_instance = {"state_size": state_size,
                  "action_size": action_size,
                  "random_seed": random_seed,
                  "clip_constant": clip_constant}
results0, results1 = ddpg(agent_instance)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(results0) + 1), results0)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]

# create agent from checkpoint
agent = ddpga.Agent(state_size, action_size, random_seed=15)
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

rounds = 5
scores_all = []
for r in range(rounds):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    while True:
        actions = agent.act(states, add_noise=False)  # select an action (for each agent)
        # the excerpt stopped at the line above; the rest of the loop follows
        # the usual pattern from the other snippets
        env_info = env.step(actions)[brain_name]
        states = env_info.vector_observations
        scores += env_info.rewards
        if np.any(env_info.local_done):
            break
    scores_all.append(np.mean(scores))
        if np.mean(scores_deque) >= breakpoint_score:
            fname += str(i_episode)
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_deque)))
            torch.save(agent.qnetwork_local.state_dict(), fname + 'checkpoint.pth')
            break
    return scores

#%%
agent = ddpg_agent.Agent(state_size=state_size,
                         action_size=action_size,
                         random_seed=0)

#%%
rr_scores = train(env=env, agent=agent)  # random replay training

#%%
import matplotlib.pyplot as plt


def plot_scores(scores):
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
                              batch_size=args.batch_size,
                              seed=args.seed)
else:
    memory = DeterministicReplayBuffer(action_size=action_size,
                                       state_size=state_size,
                                       buffer_size=args.buffer_size)

# agent
if args.algorithm == "ddpg":
    agent = ddpg.Agent(state_size=state_size,
                       action_size=action_size,
                       seed=args.seed,
                       batch_size=args.batch_size,
                       memory=memory,
                       lr_actor=args.lr_actor,
                       lr_critic=args.lr_critic,
                       clip_critic=args.clip_critic,
                       gamma=args.gamma,
                       tau=args.tau,
                       weight_decay=args.weight_decay,
                       update_network_steps=args.update_network_steps,
                       sgd_epoch=args.sgd_epoch,
                       checkpoint_prefix=args.checkpoint_prefix)
else:
    agent = ppo.Agent(state_size=state_size,
                      action_size=action_size,
                      seed=args.seed,
                      batch_size=args.batch_size,
                      memory=memory,
                      lr_actor=args.lr_actor,
                      gamma=args.gamma,
                      eps=args.eps,
import matplotlib.pyplot as plt
#%matplotlib inline
import importlib

from ddpg_agent import Agent
import ddpg_agent
import model

av_reward = deque(maxlen=100)
importlib.reload(ddpg_agent)
#importlib.reload(model)

agent_list = []
agent_list.append(ddpg_agent.Agent(state_size=33, action_size=action_size, random_seed=100))
for a in range(19):  # nineteen more agents, i.e. twenty in total for Reacher
    agent = ddpg_agent.Agent(state_size=33, action_size=action_size, random_seed=a)
    agent.memory = agent_list[0].memory  # all agents share one replay buffer
    agent_list.append(agent)

num_episodes = 150
max_t = 10000
training_reward_list = []
best_score = 0

for episode in range(num_episodes):
    print(episode)
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)          # initialize the score (for each agent)
            break
    return scores


import sys

Reacher_path = sys.argv[1]
env_20 = Reacher(Reacher_path)

import ddpg_agent

reward_accum_steps = 20
agent_20 = ddpg_agent.Agent(state_size=33,
                            action_size=4,
                            random_seed=1,
                            gamma=0.99,
                            update_cycle=reward_accum_steps * 20,
                            update_times=reward_accum_steps * 20 // 40,
                            buffer_size=int(1e6),
                            batch_size=1024,
                            warm_start_size=1024)
scores = ddpg(agent_20,
              env_20,
              1000,
              is_20=True,
              ckpt_prefix='checkpoint_20',
              reward_accum_steps=reward_accum_steps)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores) + 1), scores)