# Common imports assumed by the training snippets below
# (each snippet was excerpted from a larger file):
import sys
from collections import deque

import numpy as np
import torch


def train(episodes=700, max_t=1000):
    try:
        agent = Agent(state_size=33, action_size=4, seed=0)
        scores = []
        scores_window = deque(maxlen=100)
        config = Config()
        eps = config.EPS_START
        for i_episode in range(1, episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations
            agent.reset()
            score = np.zeros(1)
            for _ in range(max_t + 1):
                action = agent.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations  # get the next state
                reward = env_info.rewards                  # get the reward
                done = env_info.local_done                 # see if episode has finished
                score += env_info.rewards
                agent.step(state, action, reward, next_state, done)
                state = next_state
                # linearly decay the exploration scale, with a floor at EPS_END
                eps = eps - config.LIN_EPS_DECAY
                eps = np.maximum(eps, config.EPS_END)
                if np.any(done):
                    break
            scores_window.append(score)
            scores.append(score)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)), end="")
            if i_episode % 2 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            mean = np.mean(scores_window)
            if 30.0 < mean <= 31.5:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                      .format(i_episode - 100, np.mean(scores_window)))
                torch.save(agent.actor_local.state_dict(),
                           'solved_actor_trained_model.pth')
                torch.save(agent.critic_local.state_dict(),
                           'solved_critic_trained_model.pth')
        torch.save(agent.actor_local.state_dict(), 'actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(), 'critic_trained_model.pth')
        return scores
    except KeyboardInterrupt:
        # save what we have before bailing out
        torch.save(agent.actor_local.state_dict(),
                   'interrupt_actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(),
                   'interrupt_critic_trained_model.pth')
        plot_score_chart(scores)
        sys.exit(0)
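# The snippet above assumes a `Config` object carrying the epsilon schedule
# and a `plot_score_chart` helper; neither appears in the source. A minimal
# sketch of both follows -- the constant values are placeholders, not the
# original settings.
import matplotlib.pyplot as plt


class Config:
    EPS_START = 1.0       # initial exploration noise scale (assumed)
    EPS_END = 0.01        # floor for the noise scale (assumed)
    LIN_EPS_DECAY = 1e-6  # linear per-step decay (assumed)


def plot_score_chart(scores):
    """Plot per-episode scores; a minimal stand-in for the original helper."""
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel='Episode #', ylabel='Score')
    plt.show()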
def ddpg_dual(n_episodes=5000, max_t=2000, solved_at=0.5):
    sharedActor = Agent(state_size=state_size, action_size=action_size,
                        random_seed=2)
    avg_score = []
    scores_deque = deque(maxlen=100)
    best_score = 0.0
    env_solved = False
    for i_episode in range(1, n_episodes + 1):
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations  # get the current state (for each agent)
        sharedActor.reset()
        for t in range(max_t):
            actions = sharedActor.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                sharedActor.step(state, action, reward, next_state, done, t)
            states = next_states
            scores += rewards  # update the score (for each agent)
            if np.any(dones):  # exit loop if episode finished
                break
        score = np.max(scores)
        avg_score.append(score)
        scores_deque.append(score)
        print('\rEpisode:{} \tScore:{:.3f} \tAverage Score: {:.3f} solved:{}'
              .format(i_episode, score, np.mean(scores_deque), env_solved),
              end="")
        if i_episode % 10 == 0:
            print("\n")
        if score > best_score and np.mean(scores_deque) >= solved_at:
            if not env_solved:
                env_solved = True
                print('\nEnv solved in {:d} episodes!\tAverage Score ={:.3f} over last {} Episodes'
                      .format(i_episode - 100, np.mean(scores_deque), 100))
            torch.save(sharedActor.actor_local.state_dict(), "actor.pth")
            torch.save(sharedActor.critic_local.state_dict(), "critic.pth")
            best_score = score
            break
    return avg_score
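# A possible driver for `ddpg_dual` (not in the source): run training and
# plot the per-episode max-over-agents scores it returns. Assumes the
# module-level `env`, `brain_name`, `state_size`, `action_size`, and
# `num_agents` have already been set up.
import matplotlib.pyplot as plt

scores = ddpg_dual(n_episodes=5000, max_t=2000, solved_at=0.5)
plt.plot(scores)
plt.xlabel('Episode #')
plt.ylabel('Max score over agents')
plt.show()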
class DdpgDeepleng():
    def __init__(self):
        # Create the Gym environment
        self.env = gym.make('DeeplengDocking-v1')
        rospy.loginfo("Gym environment done")
        self.agent = Agent(state_size=13, action_size=3, random_seed=2)

        # Set the logging system
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/training_results'
        # env = wrappers.Monitor(env, outdir, force=True)
        # rospy.loginfo("Monitor Wrapper started")

        self.max_episodes = 200
        self.max_timesteps = 1000

    def __call__(self, *args, **kwargs):
        scores = []
        for episode in range(1, self.max_episodes + 1):
            state = self.env.reset()
            self.agent.reset()
            score = 0
            print("==========================================================================")
            print("Episode no. {}".format(episode))
            print("==========================================================================")
            for stp in range(1, self.max_timesteps + 1):
                print("Step no. {}".format(stp))
                # print("Current state: {}".format([round(elem, 2) for elem in state]))
                print("Current state: {}".format(state))
                action = self.agent.act(np.array(state))
                print("Action taken: {}".format(action))
                next_state, reward, done, _ = self.env.step(action)
                print("Reward for action: {}".format(reward))
                print("Next state: {}".format(next_state))
                self.agent.step(state, action, reward, next_state, done)
                state = np.array(next_state)
                score += reward
                if done:
                    break
            print("___________________________________________________________________________")
            scores.append(score)

        torch.save(self.agent.actor_local.state_dict(),
                   '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_actor.pth')
        torch.save(self.agent.critic_local.state_dict(),
                   '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_critic.pth')
        self.env.close()
        return scores
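# A typical entry point for the ROS node above -- a hedged sketch, since the
# excerpt does not show one; the node name 'ddpg_deepleng_docking' is an
# assumption.
if __name__ == '__main__':
    rospy.init_node('ddpg_deepleng_docking', anonymous=True)
    trainer = DdpgDeepleng()
    episode_scores = trainer()
    print('Training finished; scores per episode:', episode_scores)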
def ddpg(n_episodes=2000, store_every=10):
    scores_deque = deque(maxlen=store_every)
    scores = []
    agents = Agent(state_size=state_size, action_size=action_size,
                   num_agents=num_agents, random_seed=0)
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
        state = env_info.vector_observations
        agents.reset()
        score = np.zeros(num_agents)
        while True:
            action = agents.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agents.step(state, action, rewards, next_state, dones)
            state = next_state
            score += rewards
            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        avg_score = np.mean(scores_deque)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}\t {}'.format(
            i_episode, avg_score, np.mean(score),
            strftime("%H:%M:%S", gmtime())), end="")
        if i_episode % store_every == 0 or avg_score >= TARGET_SCORE:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, avg_score))
        if avg_score >= TARGET_SCORE:
            torch.save(agents.actor_local.state_dict(),
                       "ckpt/{}".format(ACTOR_CHECKPOINT_NAME))
            torch.save(agents.critic_local.state_dict(),
                       "ckpt/{}".format(CRITIC_CHECKPOINT_NAME))
            break
    return scores
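# `ddpg` above relies on module-level constants that the excerpt does not
# define. A minimal sketch with placeholder values (assumptions, not the
# original settings):
GRAPHICS_OFF = True                             # run Unity without rendering
TARGET_SCORE = 30.0                             # Reacher "solved" threshold (assumed)
ACTOR_CHECKPOINT_NAME = 'checkpoint_actor.pth'
CRITIC_CHECKPOINT_NAME = 'checkpoint_critic.pth'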
def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau,
               lr_actor, lr_critic, weight_decay):
    scores = []
    scores_deque = deque(maxlen=100)
    agent = Agent(n_agents, state_size, action_size, seed, buffer_size,
                  batch_size, gamma, tau, lr_actor, lr_critic, weight_decay)
    load(agent)
    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        agent.reset()  # reset the agent noise
        score = np.zeros(n_agents)
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]  # send the actions to the environment
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards                  # get the reward
            dones = env_info.local_done                 # see if episode has finished
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards        # update the score
            states = next_states    # roll over the state to next time step
            if np.any(dones):       # exit loop if episode finished
                break
        scores.append(np.mean(score))
        scores_deque.append(np.mean(score))
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'
              .format(i_episode, np.mean(score), np.mean(scores_deque)),
              end="")
        if i_episode % 10 == 0:  # checkpoint every 10 episodes
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if np.mean(scores_deque) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))
            break

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")
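# `ddpg_train` calls a `load` helper that the excerpt does not include. A
# hedged sketch that warm-starts the agent from checkpoints when they exist;
# the file names mirror the ones saved above but are an assumption.
import os

def load(agent):
    if os.path.isfile('checkpoint_actor.pth'):
        agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
    if os.path.isfile('checkpoint_critic.pth'):
        agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))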
class Worker(mp.Process):
    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        super(Worker, self).__init__()
        self.name = 'w{}'.format(name)
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        # local agent sharing the global networks and optimizers
        self.agent = Agent(state_size, action_size,
                           gnet['actor'], gnet['critic'],
                           opt['actor_optimizer'], opt['critic_optimizer'],
                           random_seed)
        self.env = gym.make('LunarLanderContinuous-v2')

    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            state = self.env.reset()
            ep_r = 0.
            self.agent.reset()
            for t in range(MAX_EP_STEP):
                # if self.name == 'w1':
                #     self.env.render()
                action = self.agent.act(state)
                action = np.clip(action, -1, 1)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done, t)
                if t == MAX_EP_STEP - 1:
                    done = True
                ep_r += reward
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # time to sync
                    if done:  # done: record and print information
                        record(self.g_ep, self.g_ep_r, ep_r,
                               self.res_queue, self.name)
                        break
                state = next_state
                total_step += 1
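# The `record` helper used by `Worker.run` is not in the excerpt. A sketch in
# the style of common multiprocessing A3C examples: bump the shared episode
# counter and a running reward under each Value's lock, then publish to the
# result queue. The 0.99/0.01 smoothing factor is an assumption.
def record(global_ep, global_ep_r, ep_r, res_queue, name):
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        if global_ep_r.value == 0.:
            global_ep_r.value = ep_r
        else:
            # exponential moving average of the episode reward
            global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01
    res_queue.put(global_ep_r.value)
    print(name, 'Ep:', global_ep.value, '| Ep_r: %.0f' % global_ep_r.value)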
def ddpg(n_episodes=1000, max_t=500, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []

    # Create the env and the agent
    terminating_angle = 15
    env = CubeEnv(np.deg2rad(terminating_angle))
    agent = Agent(state_size=3, action_size=1, random_seed=2)
    plotter = LivePlotter(env, max_t, terminating_angle, n_episodes)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0
        done = False
        plotter.reset()
        while not done:
            # Select the next action and update the system
            action = agent.act(state) * 10
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)

            # Update plots and metrics
            state = next_state
            score += reward
            plotter.add_data_from_env(env)

        scores_deque.append(score)
        scores.append(score)
        plotter.add_score(score)
        print('\rEpisode {}\tScore: {}'.format(i_episode, score), end="")

        # Display the plots
        plotter.display()

    # Save model
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
    return scores
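# `LivePlotter` is referenced above but not defined in the excerpt. A minimal
# skeleton matching the call sites; the internals are assumptions (the
# original presumably drew live matplotlib charts of the cube's state).
import matplotlib.pyplot as plt

class LivePlotter:
    def __init__(self, env, max_t, terminating_angle, n_episodes):
        self.env = env
        self.max_t = max_t
        self.terminating_angle = terminating_angle
        self.n_episodes = n_episodes
        self.scores = []
        self.episode_states = []

    def reset(self):
        # clear the per-episode trace
        self.episode_states = []

    def add_data_from_env(self, env):
        # record whatever state the env exposes; the attribute name is assumed
        self.episode_states.append(getattr(env, 'angle', None))

    def add_score(self, score):
        self.scores.append(score)

    def display(self):
        plt.plot(self.scores)
        plt.xlabel('Episode #')
        plt.ylabel('Score')
        plt.pause(0.001)  # refresh without blocking training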
# return scores
# scores = ddpg()

n_episodes = 15
max_t = 300
print_every = 100
scores_deque = deque(maxlen=print_every)
scores = []
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    agent.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_deque.append(score)
    scores.append(score)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_deque)), end="")
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
    if i_episode % print_every == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)))
def ddpg(n_episodes=500, max_t=1000, start_steps=10, learn_frequency=20,
         learn_count=10, random_seed=1):
    """Deep Deterministic Policy Gradient (DDPG).

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        start_steps (int): number of starting episodes during which actions are chosen randomly
        learn_frequency (int): how often (in timesteps) to trigger learning
        learn_count (int): number of learning steps to do at each learning timestep
        random_seed (int): random seed for the agent's weights
    """
    agent = Agent(state_size=state_size, action_size=action_size,
                  random_seed=random_seed)  # initialize the agent
    avg_scores_episode = []  # list containing scores from each episode
    avg_scores_moving = []   # list containing window-averaged scores at each episode
    scores_window = deque(maxlen=100)  # last 100 scores

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset environment
        states = env_info.vector_observations  # get current state for each agent
        scores = np.zeros(num_agents)          # initialize score for each agent
        agent.reset()                          # reset noise of the agent
        for t in range(max_t):
            # randomly sample actions during the starting episodes
            if i_episode <= start_steps:
                actions = np.random.randn(num_agents, action_size)  # random action
                actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
            else:
                # select an action according to the policy (for each agent)
                actions = agent.act(states, add_noise=True)
            env_info = env.step(actions)[brain_name]    # send actions to environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode has finished (for each agent)
            # save each agent's experience; learn on the chosen schedule
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                if t % learn_frequency == 0:  # learn with frequency
                    agent.step(state, action, reward, next_state, done,
                               learn=True, learn_count=learn_count)
                else:
                    agent.step(state, action, reward, next_state, done,
                               learn=False)  # just store, don't learn
            states = next_states
            scores += rewards  # add this timestep's rewards to the scores
            if np.any(dones):  # finish episode if any agent reached a terminal state
                break

        scores_window.append(np.mean(scores))        # save the most recent score to the window
        avg_scores_episode.append(np.mean(scores))   # save the most recent score
        avg_scores_moving.append(np.mean(scores_window))  # save the window average
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 1 == 0:  # print every episode
            print('\rEpisode {}\tAverage Score: {:.2f} \t Current Score: {:.2f}'
                  .format(i_episode, np.mean(scores_window), np.mean(scores)))
        # environment is solved
        if np.mean(scores_window) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(),
                       "checkpoint_actor.pth")   # save the actor's weights
            torch.save(agent.critic_local.state_dict(),
                       "checkpoint_critic.pth")  # save the critic's weights
            break
    # return the average score of each episode and the moving average over time
    return avg_scores_episode, avg_scores_moving
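# A possible follow-up (not in the source): plot the per-episode average and
# the 100-episode moving average that `ddpg` returns.
import matplotlib.pyplot as plt

avg_scores_episode, avg_scores_moving = ddpg()
plt.plot(avg_scores_episode, label='episode average over agents')
plt.plot(avg_scores_moving, label='100-episode moving average')
plt.xlabel('Episode #')
plt.ylabel('Score')
plt.legend()
plt.show()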
def train(
    n_episodes,
    max_t,
    env_fp,
    no_graphics,
    seed,
    save_every_nth,
    buffer_size,
    batch_size,
    gamma,
    tau,
    lr_actor,
    lr_critic,
    weight_decay,
    log,
):
    log.info("#### Initializing environment...")
    # init environment
    env = UnityEnvironment(file_name=env_fp, no_graphics=no_graphics)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    log.info(f"Number of agents: {num_agents}")
    # size of each action
    action_size = brain.vector_action_space_size
    log.info(f"Size of each action: {action_size}")
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    log.info(
        f"There are {states.shape[0]} agents. Each observes a state with length: {state_size}"
    )
    log.info(f"The state for the first agent looks like: {states[0]}")

    agent = Agent(
        num_agents=len(env_info.agents),
        state_size=state_size,
        action_size=action_size,
        buffer_size=buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        tau=tau,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        weight_decay=weight_decay,
        random_seed=seed,
    )

    log.info("#### Training...")
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros((len(env_info.agents), 1))
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = np.array(env_info.rewards).reshape((next_states.shape[0], 1))
            dones = np.array(env_info.local_done).reshape((next_states.shape[0], 1))
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        print(
            "Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(
                i_episode, np.mean(scores_deque), scores[-1]),
            end="\r",
        )
        if i_episode % 100 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}".format(
                i_episode, np.mean(scores_deque)))
        if i_episode % save_every_nth == 0:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint.pth",
            )
            plot_scores(
                scores=scores,
                title=f"Avg score over {len(env_info.agents)} agents",
                fname="avg_scores.png",
                savefig=True,
            )
        if np.mean(scores_deque) >= 30:
            torch.save(agent.actor_local.state_dict(), "checkpoint_actor.pth")
            torch.save(agent.critic_local.state_dict(), "checkpoint_critic.pth")
            print("\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}"
                  .format(i_episode - 100, np.mean(scores_deque)))
            break
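# `train` uses `save_checkpoint` and `plot_scores` helpers that the excerpt
# does not include. Hedged sketches consistent with the call sites above:
import matplotlib.pyplot as plt

def save_checkpoint(state, filename="checkpoint.pth"):
    """Serialize a training-state dict (weights, scores, episode counter)."""
    torch.save(state, filename)

def plot_scores(scores, title="", fname="scores.png", savefig=False):
    """Plot per-episode scores and optionally write the figure to disk."""
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title=title)
    if savefig:
        fig.savefig(fname)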
for episode in range(episodes):
    # Reset the environment
    cur_state = env.reset(seed=episode)
    score = 0
    for i in range(iterations + 1):
        # Predict the best action for the current state.
        action = agent.act(cur_state, add_noise=True)
        # The action is performed; the new state, reward, and info are received.
        new_state, reward, done, info = env.step(action)
        print("episode: ", episode, " sample: ", i, " reward: ", reward)
        # Current state, action, reward, and new state are stored in the experience replay.
        agent.step(cur_state, action, reward, new_state, done)
        # roll over to the new state
        cur_state = new_state
        score += reward
        if done:
            break
    scores_deque.append(score)
    scores.append(score)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        episode, np.mean(scores_deque)), end="")
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
# Train until the environment ends the episode
while True:
    for env_agent_idx in range(num_agents):
        # Let the deep learning agent act based on each agent's state
        actions[env_agent_idx] = agent.act(states[env_agent_idx])
    env_info = env.step(actions)[brain_name]
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    for env_agent_idx in range(num_agents):
        # Save to replay buffer
        agent.memorize(states[env_agent_idx], actions[env_agent_idx],
                       rewards[env_agent_idx], next_states[env_agent_idx],
                       dones[env_agent_idx])
    # Learn
    agent.step()
    states = next_states
    score += np.sum(rewards) / len(rewards)
    if np.any(dones):
        break

# Check and track scores
scores_deque.append(score)
scores.append(score)
average_score = np.mean(scores_deque)
print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
    i_episode, average_score, score), end="")
if i_episode % print_every == 0:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, average_score))

# Save coefficients to file if the environment is solved with the current network coefficients
env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state (for each agent)
scores = np.zeros(num_agents)          # initialize the score (for each agent)
agent_1.reset()
agent_2.reset()
for t in range(t_max):
    actions_1 = agent_1.act(np.expand_dims(states[0], 0), True)
    actions_2 = agent_2.act(np.expand_dims(states[1], 0), True)
    # actions_1 = np.clip(actions_1, -1, 1)  # all actions between -1 and 1
    actions = np.concatenate((actions_1, actions_2))
    env_info = env.step(actions)[brain_name]  # send all actions to the environment
    next_states, rewards, dones = (env_info.vector_observations,
                                   env_info.rewards, env_info.local_done)
    # for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
    agent_1.step(np.expand_dims(states[0], 0), actions_1, rewards[0],
                 np.expand_dims(next_states[0], 0), dones[0], t)
    agent_2.step(np.expand_dims(states[1], 0), actions_2, rewards[1],
                 np.expand_dims(next_states[1], 0), dones[1], t)
    scores += rewards     # update the score (for each agent)
    states = next_states  # roll over states to next time step
    if np.any(dones):     # exit loop if episode finished
        break
score.append(np.max(scores))
ev_score.append(np.mean(scores_deque))
scores_deque.append(np.max(scores))
print('Score (max over agents) from episode {}: {:.5f}'.format(
    i_episode, np.max(scores)), end='\r')
if i_episode % print_every == 0 or np.mean(scores_deque) > 0.5:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_deque)))
# Predict the best action for the current state.
cur_state1 = np.delete(cur_state, 8)
cur_state2 = np.delete(cur_state, 7)
# print(cur_state[5:])
action1 = agent1.act(cur_state1, add_noise=True)
action2 = agent2.act(cur_state2, add_noise=True)
# print(action1, action2)

# The actions are performed; the new state, rewards, and info are received.
new_state, reward1, reward2, done1, done2, info = env.step(action1, action2)

# Current state, actions, rewards, and new state are stored in the experience replay.
new_state1 = np.delete(new_state, 8)
new_state2 = np.delete(new_state, 7)
agent1.step(cur_state1, action1, reward1, new_state1, done1)
agent2.step(cur_state2, action2, reward2, new_state2, done2)

# roll over to the new state
cur_state = new_state

if info.done1 and info.done2:
    shortfall_hist1 = np.append(shortfall_hist1, info.implementation_shortfall1)
    shortfall_deque1.append(info.implementation_shortfall1)
    shortfall_hist2 = np.append(shortfall_hist2, info.implementation_shortfall2)
    shortfall_deque2.append(info.implementation_shortfall2)
    break

if (episode + 1) % 100 == 0:
    print('\rEpisode [{}/{}]\tAverage Shortfall 1: {:.2f}\tAverage Shortfall 2: {:.2f}'
          .format(episode + 1, episodes,
                  np.mean(shortfall_deque1), np.mean(shortfall_deque2)))
def ddpg(model_number, UPD, BUFFER_SIZE, BATCH_SIZE, LR_ACTOR, LR_CRITIC,
         fc1_units, fc2_units, a_gradient_clipping, a_leaky, a_dropout,
         c_gradient_clipping, c_batch_norm, c_leaky, c_dropout,
         n_episodes=400, max_t=2000, print_every=100):
    """Deep Deterministic Policy Gradient (DDPG).

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        ...
    """
    agent = Agent(state_size, action_size, UPD, BUFFER_SIZE, BATCH_SIZE,
                  LR_ACTOR, LR_CRITIC, fc1_units, fc2_units,
                  a_gradient_clipping, a_leaky, a_dropout,
                  c_gradient_clipping, c_batch_norm, c_leaky, c_dropout,
                  0, 12345)
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=print_every)  # last `print_every` scores
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, a_dropout, a_leaky)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)         # save most recent score
        with open('results.txt', 'a') as output:
            output.writelines(
                '{}, {}, {:.2f}, {:.2f}, {}, {}, {}, {:.4f}, {:.4f}, {}, {}, {}, {}, {}, {}, {}, {}, {} \n'
                .format(model_number, i_episode, np.mean(scores_window), score,
                        UPD, BUFFER_SIZE, BATCH_SIZE, LR_ACTOR, LR_CRITIC,
                        fc1_units, fc2_units, a_gradient_clipping, a_leaky,
                        a_dropout, c_gradient_clipping, c_batch_norm, c_leaky,
                        c_dropout))
            output.flush()
        print('\rModel nr: {}, Episode {}, avg. score: {:.2f}, score: {:.2f}'
              .format(model_number, i_episode, np.mean(scores_window), score),
              end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_window), score))
        if np.mean(scores_window) >= 30.0:
            with open('./models/models_solved.txt', 'a') as solved:
                solved.writelines('{}, {} \n'.format(model_number, i_episode))
                solved.flush()
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(),
                       './models/checkpoint_actor_' + str(model_number) + '.pth')
            torch.save(agent.critic_local.state_dict(),
                       './models/checkpoint_critic_' + str(model_number) + '.pth')
            break
    return scores
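# A possible invocation (not in the source): one hyperparameter configuration
# for the sweep that `ddpg` appears to support. The values below are
# placeholder assumptions, not tuned settings from the original experiments.
scores = ddpg(model_number=1, UPD=20, BUFFER_SIZE=int(1e6), BATCH_SIZE=128,
              LR_ACTOR=1e-4, LR_CRITIC=1e-3, fc1_units=400, fc2_units=300,
              a_gradient_clipping=True, a_leaky=False, a_dropout=0.0,
              c_gradient_clipping=True, c_batch_norm=True, c_leaky=False,
              c_dropout=0.0)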
scores_deque = deque(maxlen=print_every)
scores_final = []
agent = Agent(state_size, action_size, num_agents, random_seed=2)

# ----------------------- training the agents ----------------------- #
for i_episode in range(n_episodes):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)          # initialize the score (for each agent)
    while True:
        actions = agent.act(states)        # select an action (for each agent)
        env_info = env.step(actions)[brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent), shape: (2, 24)
        rewards = env_info.rewards         # get reward (for each agent)
        dones = env_info.local_done        # see if episode finished
        agent.step(states, actions, rewards, next_states, dones)
        scores += env_info.rewards         # update the score (for each agent)
        states = next_states               # roll over states to next time step
        if np.any(dones):                  # exit loop if episode finished
            break
    scores_deque.append(max(scores))
    scores_final.append(scores)
    print('\rEpisode {}\tScore this episode: {:.2f}'.format(
        i_episode, np.mean(scores)), end="")
    if i_episode % 100 == 0:
        print('Average score (max over agents, last 100 episodes): {}'.format(
            np.mean(scores_deque)))
    if np.mean(scores_deque) > 0.5:
        torch.save(agent.actor_local.state_dict(), './checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), './checkpoint_critic.pth')
agents = Agent(state_size=state_size, action_size=action_size,
               num_agents=num_agents, random_seed=0)
# load the trained weights for evaluation
agents.actor_local.load_state_dict(
    torch.load("ckpt/{}".format(ACTOR_CHECKPOINT_NAME)))
agents.critic_local.load_state_dict(
    torch.load("ckpt/{}".format(CRITIC_CHECKPOINT_NAME)))
for i_episode in range(1, n_episodes + 1):
    print('Starting episode {}'.format(i_episode))
    env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
    state = env_info.vector_observations
    agents.reset()
    score = np.zeros(num_agents)
    while True:
        action = agents.act(state)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agents.step(state, action, rewards, next_state, dones)
        state = next_state
        score += rewards
        if np.any(dones):
            print('Score: {}'.format(np.mean(score)))
            break
def ddpg(n_episodes=500, max_t=200, train_mode=True):
    env = UnityEnvironment(file_name='./1_agent/Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]
    states = env_info.vector_observations
    agent = Agent(state_size=states.shape[1], action_size=action_size,
                  random_seed=2)

    scores = []
    scores_deque = deque(maxlen=100)
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        num_agents = len(env_info.agents)
        # agent.reset()
        score = 0
        states = env_info.vector_observations
        # while True:
        for t in range(max_t):
            agent.reset()  # reset the exploration noise every step
            actions = agent.act(states)
            # actions = np.clip(actions, -1, 1)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            # rewards = [1.0 if x > 0.0 else 0.0 for x in rewards]
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(env_info.rewards)
            if np.any(dones):
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
            i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    env.close()
    return scores
total_episodes = []
for i_episode in tqdm(range(1, n_episodes + 1)):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    t = 1
    states = env_info.vector_observations  # get the current state
    agent.reset()  # reset noise with different inertia
    scores = np.zeros(num_agents)
    last_non_zeros_in_batch = 0
    while True:
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]  # send all actions to the environment
        rewards = env_info.rewards                  # get reward (for each agent)
        next_states = env_info.vector_observations  # get next state (for each agent)
        dones = env_info.local_done                 # see if episode finished
        agent.step(states, actions, rewards, next_states, dones, t, i_episode)
        scores += rewards
        states = next_states
        if any(dones):
            break
        t += 1
    scores_deque.append(np.max(scores))
    scores_by_episode.append(np.max(scores))
    total_episodes.append(i_episode)
    if i_episode % print_every == 0:
        print('\rEpisode {}\tRolling Average: {:.4f}\tScore: {:.2f}\tsteps: {}\t'
              .format(i_episode, np.mean(scores_deque), np.max(scores), t))
def train_ddpg(dev, weights_file_actor, weights_file_critic, n_episodes=1000,
               max_t=1000):
    """DDPG learning.

    Params
    ======
        dev (string): cpu or gpu
        weights_file_actor (string): name of the file to save the weights of the actor
        weights_file_critic (string): name of the file to save the weights of the critic
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []    # scores from each episode (average over all the agents)
    averages = []  # position i (1-indexed) holds the average of the last min(i, 100) episodes
    scores_window = deque(maxlen=100)  # last 100 averaged scores for all the agents

    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, random_seed=0, device=dev)
    print('Number of agents: {:d}'.format(num_agents))

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()  # reset noise for the actions
        states = env_info.vector_observations
        current_scores = np.zeros(num_agents)  # initialize the score for all the agents
        for t in range(max_t):
            actions = agent.act(states)  # process the states of all the agents
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            current_scores += rewards
            if np.any(dones):
                break
        max_score = np.max(current_scores)  # current maximum score of all the agents
        scores.append(max_score)
        scores_window.append(max_score)
        averages.append(np.mean(scores_window))
        if i_episode % 100 != 0:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]), end="")
        else:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]))
        if averages[i_episode - 1] >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                  .format(i_episode - 100, averages[i_episode - 1]))
            torch.save(agent.actor_local.state_dict(), weights_file_actor)
            torch.save(agent.critic_local.state_dict(), weights_file_critic)
            break
    env.close()
    return scores, averages
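# A possible driver for `train_ddpg` (not in the source): train on the CPU
# and plot both curves it returns; the weight file names are placeholders.
import matplotlib.pyplot as plt

scores, averages = train_ddpg('cpu', 'actor.pth', 'critic.pth')
plt.plot(scores, label='max score over agents')
plt.plot(averages, label='100-episode average')
plt.xlabel('Episode #')
plt.legend()
plt.show()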
def main():
    env = UnityEnvironment(file_name='Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=3)
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, 1000):
        begin = time.time()
        curr_scores = np.zeros(num_agents)  # initialize the score (for each agent)
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        agent.reset()
        for t in range(1000):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]    # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            agent.step(states, actions, rewards, next_states, dones, t)
            states = next_states
            curr_scores += rewards
            if np.any(dones):
                break
        curr_score = np.mean(curr_scores)
        scores_deque.append(curr_score)
        average_score = np.mean(scores_deque)
        scores.append(curr_score)
        print('\rEpisode {}\tTime: {:.2f}\tAvg: {:.2f}\tScore: {:.2f}\tMin {:.2f}\tMax {:.2f}'
              .format(i_episode, time.time() - begin, average_score,
                      curr_score, min(curr_scores), max(curr_scores)))
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if average_score >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, average_score))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    env.close()
    return
def train(config, n_episodes=1000, base_port=5005, save_path=None, name=None):
    """Train a DDPG agent on the Reacher environment.

    Params
    ======
        config: agent configuration (num_agents, state_size, ...)
        n_episodes (int): maximum number of training episodes
        base_port (int): base port for the Unity environment
        save_path (string): directory for checkpoints and results, if given
        name (string): run name for TensorBoard and the results series
    """
    writer = SummaryWriter(comment=name)
    env = UnityEnvironment(file_name="Reacher_Linux_NoVis/Reacher.x86_64",
                           no_graphics=True, base_port=base_port)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    dummy_input = (torch.zeros(1, config.num_agents, config.state_size),)
    agent = Agent(config)
    writer.add_graph(agent.actor_local, dummy_input, True)
    # writer.add_graph(agent.critic_local, dummy_input, True)
    num_agents = config.num_agents

    # reset
    env_info = env.reset(train_mode=True)[brain_name]
    episode_scores = []  # list containing scores from each episode
    episode_scores_window = deque(maxlen=100)  # last 100 scores

    with trange(n_episodes, desc='episode') as episode_bar:
        for episode in episode_bar:
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations  # get the current state (for each agent)
            scores = np.zeros(num_agents)          # initialize the score (for each agent)
            while True:
                actions = agent.act(states)        # select an action (for each agent)
                env_info = env.step(actions)[brain_name]     # send all actions to the environment
                next_states = env_info.vector_observations   # get next state (for each agent)
                rewards = env_info.rewards         # get reward (for each agent)
                dones = env_info.local_done        # see if episode finished
                agent.step(states, actions, rewards, next_states, dones,
                           writer=writer)          # learn
                scores += env_info.rewards         # update the score (for each agent)
                states = next_states               # roll over states to next time step
                if np.any(dones):                  # exit loop if episode finished
                    break
            episode_scores_window.append(np.mean(scores))  # save most recent score
            episode_scores.append(np.mean(scores))         # save most recent score
            episode_bar.set_postfix(avg_score=np.mean(episode_scores_window))
            writer.add_scalar('data/score', np.mean(scores), episode)

    results = pd.Series(episode_scores, name=name)
    if save_path:
        torch.save(agent.actor_local.state_dict(),
                   os.path.join(save_path, 'checkpoint_actor.pth'))
        torch.save(agent.critic_local.state_dict(),
                   os.path.join(save_path, 'checkpoint_critic.pth'))
        results.to_csv(os.path.join(save_path, 'results.csv'))
    env.close()
    writer.close()
    return results, agent
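# `train` above takes a `config` object; the body uses at least the fields
# sketched here. The values are assumptions for the 20-agent Reacher build,
# and the real `Agent(config)` likely expects additional fields
# (learning rates, buffer size, etc.) not shown in the excerpt.
from types import SimpleNamespace

config = SimpleNamespace(
    num_agents=20,   # agents in the Reacher multi-agent build (assumed)
    state_size=33,   # observation vector length (assumed)
    action_size=4,   # torque dimensions (assumed)
)
results, agent = train(config, n_episodes=300, name='ddpg_reacher',
                       save_path='./runs')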
def training():
    # config parameters
    number_of_episodes = 4000
    episode_length = 1000
    random_seed = 4  # np.random.randint(10000)

    # create env, get essential env info
    env, brain_name, num_agents, action_size, state_size = create_env()

    agent_reward = [[] for _ in range(num_agents)]
    agent_reward_deque = [deque(maxlen=100) for _ in range(num_agents)]
    score_full = []
    score_deque = deque(maxlen=100)

    # create a ddpg agent for self-play
    agents = Agent(state_size, action_size, random_seed, num_agents)

    for i_episode in range(1, number_of_episodes + 1):
        # reset the environment and get the initial observation
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        # reshape states, assume each agent can see the global condition
        # states = np.reshape(states, (1, -1))

        # reset ddpg agents
        # for agent in agents:
        #     agent.reset()
        agents.reset()

        episode_scores = np.zeros(num_agents)
        for t in range(episode_length):
            # for ii in range(num_agents):
            #     actions.append(agents.act(states[ii]))
            actions = agents.act(states)
            env_actions = actions  # np.reshape(np.array(actions), (1, -1))
            # play one step
            env_info = env.step(env_actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_scores += rewards
            # store transition, learn if necessary
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                agents.step(state, action, reward, next_state, done, t)
            states = next_states
            if np.any(dones):
                break

        for i in range(num_agents):
            agent_reward[i].append(episode_scores[i])
            agent_reward_deque[i].append(episode_scores[i])
        score_full.append(max(episode_scores))
        score_deque.append(max(episode_scores))

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(score_deque)))
        if np.mean(score_deque) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(score_deque)))
            # for i in range(num_agents):
            #     torch.save(agents[i].actor_local.state_dict(), 'checkpoint_actor' + str(i) + '.pth')
            #     torch.save(agents[i].critic_local.state_dict(), 'checkpoint_critic' + str(i) + '.pth')
            torch.save(agents.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agents.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    env.close()
    return agents, agent_reward, score_full, random_seed
# determine actions for the unity agents from the current state, using noise for exploration
actions_1 = agent_1.act(states, add_noise=True)
actions_2 = agent_2.act(states, add_noise=True)

# send the actions to the unity agents in the environment and receive the resultant environment information
actions = np.concatenate((actions_1, actions_2), axis=0)
actions = np.reshape(actions, (1, 4))
env_info = env.step(actions)[brain_name]

next_states = env_info.vector_observations  # get the next states for each unity agent in the environment
next_states = np.reshape(next_states, (1, 48))
rewards = env_info.rewards   # get the rewards for each unity agent in the environment
dones = env_info.local_done  # see if the episode has finished for each unity agent in the environment

# send (S, A, R, S') info to the training agent for the replay buffer (memory) and network updates
agent_1.step(states, actions_1, rewards[0], next_states, dones[0])
agent_2.step(states, actions_2, rewards[1], next_states, dones[1])

# set new states to current states for determining next actions
states = next_states
# print(states)

# update the episode score for each unity agent
agent_scores += rewards

# if any unity agent indicates that the episode is done,
# then exit the episode loop to begin a new episode
if np.any(dones):
    break

# Add the episode score to Scores and...
# calculate the mean score over the last 100 episodes
for ep in range(0, episodes):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    # state = env.reset()
    agent.reset()  # resets the noise in the agent
    scores = np.zeros(num_agents)

    # step through time steps and train the actor and critic
    for t in range(max_time):
        actions = agent.act(states)                 # get actions from the policy (for each agent)
        env_info = env.step(actions)[brain_name]    # perform actions in the environment
        next_states = env_info.vector_observations  # get next state (for each agent)
        rewards = env_info.rewards                  # get reward (for each agent)
        dones = env_info.local_done                 # get dones (for each agent)
        agent.step(states, actions, rewards, next_states, dones)  # add experience to the buffer
        states = next_states                        # roll over states
        scores += env_info.rewards                  # accumulate rewards
        if np.any(dones):
            break

    scores_deque.append(np.mean(scores))
    all_scores.append(np.mean(scores))  # add to the total list of scores

    # print results as they are computed
    mn_score = np.mean(scores_deque)
    print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
        ep + 1, mn_score, np.mean(scores)), end="")
    if (ep + 1) % 100 == 0 or mn_score > max_score:
        torch.save({'local': agent.actor_local.state_dict(),
                    'target': agent.actor_target.state_dict(),
                    'opt': agent.actor_optimizer.state_dict()}, 'cc_actor.pth')
def multi_agent_ddpg(env, brain_name, title, n_episodes, action_size,
                     state_size, num_agents, print_every, n_updates,
                     update_intervals, device):
    # create a save dir for this experiment
    if title is None:
        title = "experiment"
    current_time = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
    title = title + "_" + current_time

    # write a new file
    os.makedirs("experiments/{}".format(title), exist_ok=True)
    f = open("experiments/{}/scores.txt".format(title), "w")
    f.close()

    all_agents_statesize = state_size * num_agents
    agent1 = Agent(state_size=all_agents_statesize, action_size=action_size,
                   num_agents=1, random_seed=123, device=device)
    agent2 = Agent(state_size=all_agents_statesize, action_size=action_size,
                   num_agents=1, random_seed=123, device=device)

    scores_deque = deque(maxlen=100)
    mean_scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        # reshape so we can feed both agents' states to each agent
        states = np.reshape(states, (1, all_agents_statesize))

        # reset
        agent1.reset()
        agent2.reset()

        # place to store scores
        agent_scores = np.zeros(num_agents)
        t = 0
        while True:
            # the two agents' actions
            actions_1 = agent1.act(states, add_noise=True)
            actions_2 = agent2.act(states, add_noise=True)

            # step the environment for the two agents and get the next states
            actions = np.concatenate((actions_1, actions_2), axis=0)
            actions = np.reshape(actions, (1, 4))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            next_states = np.reshape(next_states, (1, all_agents_statesize))
            rewards = env_info.rewards
            dones = env_info.local_done

            # update the agents accordingly (ddpg)
            agent1.step(states, actions_1, rewards[0], next_states, dones[0],
                        n_updates, update_intervals, t)
            agent2.step(states, actions_2, rewards[1], next_states, dones[1],
                        n_updates, update_intervals, t)
            states = next_states
            agent_scores += rewards
            if np.any(dones):
                break
            t += 1

        scores_deque.append(np.max(agent_scores))
        print('\rEpisode {}\tLast 100 average Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)), end="")

        # save score and model every print_every episodes
        if i_episode % print_every == 0:
            f = open("experiments/{}/scores.txt".format(title), "a")
            f.write("{},{}\n".format(i_episode, np.mean(scores_deque)))
            f.close()
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            mean_scores.append(np.mean(scores_deque))
            # save if best model so far
            if np.mean(scores_deque) == max(mean_scores):
                torch.save(agent1.actor_local.state_dict(),
                           'experiments/{}/checkpoint_actor1.pth'.format(title))
                torch.save(agent1.critic_local.state_dict(),
                           'experiments/{}/checkpoint_critic1.pth'.format(title))
                torch.save(agent2.actor_local.state_dict(),
                           'experiments/{}/checkpoint_actor2.pth'.format(title))
                torch.save(agent2.critic_local.state_dict(),
                           'experiments/{}/checkpoint_critic2.pth'.format(title))
        if np.mean(scores_deque) >= 1.0 and i_episode > 100:
            print("\rEnvironment solved with an average score of at least 1.0")
            break
def ddpg(n_episodes=500, max_t=200, train_mode=True):
    env = UnityEnvironment(file_name='./env/Tennis.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]
    states = env_info.vector_observations
    agent = Agent(state_size=states.shape[1], action_size=action_size,
                  random_seed=2)

    scores = []
    scores_deque = deque(maxlen=100)
    scores_mean = []
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        num_agents = len(env_info.agents)
        # agent.reset()
        score = np.zeros((2,))
        states = env_info.vector_observations
        for t in range(max_t):
            agent.reset()  # reset the exploration noise every step
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += env_info.rewards
            if np.any(dones):
                break
        scores_deque.append(np.max(score))
        scores_mean.append(np.mean(scores_deque))
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {}'.format(
            i_episode, np.mean(scores_deque), score), end="")
        if np.max(score) > max_score:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_best.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_best.pth')
            print('\rSaving weights for max score, old: {} -> new: {}'.format(
                max_score, np.max(score)))
            max_score = np.max(score)
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        # if np.mean(scores_deque) >= 0.5:
        #     print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        #     torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        #     torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        #     break
    env.close()
    return scores, scores_mean