import gym
import rospy
import rospkg
import torch

from ddpg_agent import Agent  # assumed: local module providing the DDPG Agent


class DdpgInfer():

    def __init__(self):
        self.env = gym.make('DeeplengDocking-v1')
        rospy.loginfo("Gym environment done")
        self.agent = Agent(state_size=13, action_size=3, random_seed=2)
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        self.outdir = pkg_path + '/training_results'
        self.agent.actor_local.load_state_dict(
            torch.load(self.outdir + '/checkpoint_actor.pth'))
        self.agent.critic_local.load_state_dict(
            torch.load(self.outdir + '/checkpoint_critic.pth'))

    def __call__(self, *args, **kwargs):
        state = self.env.reset()
        for t in range(500):
            action = self.agent.act(state, add_noise=False)
            # env.render()
            state, reward, done, _ = self.env.step(action)
            print("state:", state)
            print("Reward: ", reward)
            if done:
                break
        self.env.close()
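# Usage sketch (not part of the original file): the class above targets a ROS
# workspace, so a hypothetical launcher would initialise a node before
# constructing and calling DdpgInfer. The node name below is illustrative.
if __name__ == '__main__':
    rospy.init_node('ddpg_infer_node')  # assumed node name
    infer = DdpgInfer()
    infer()  # runs one 500-step evaluation episode without exploration noise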
def train(episodes=700, max_t=1000):
    try:
        agent = Agent(state_size=33, action_size=4, seed=0)
        scores = []
        scores_window = deque(maxlen=100)
        config = Config()
        eps = config.EPS_START
        for i_episode in range(1, episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations
            agent.reset()
            score = np.zeros(1)
            for _ in range(max_t + 1):
                action = agent.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations  # get the next state
                reward = env_info.rewards  # get the reward
                done = env_info.local_done  # see if episode has finished
                score += env_info.rewards
                agent.step(state, action, reward, next_state, done)
                state = next_state
                eps = eps - config.LIN_EPS_DECAY
                eps = np.maximum(eps, config.EPS_END)
                if np.any(done):
                    break
            scores_window.append(score)
            scores.append(score)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)), end="")
            if i_episode % 2 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            mean = np.mean(scores_window)
            if mean > 30.0 and mean <= 31.5:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                      .format(i_episode - 100, np.mean(scores_window)))
                torch.save(agent.actor_local.state_dict(),
                           'solved_actor_trained_model.pth')
                torch.save(agent.critic_local.state_dict(),
                           'solved_critic_trained_model.pth')
        torch.save(agent.actor_local.state_dict(), 'actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(), 'critic_trained_model.pth')
        return scores
    except KeyboardInterrupt:
        torch.save(agent.actor_local.state_dict(),
                   'interrupt_actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(),
                   'interrupt_critic_trained_model.pth')
        plot_score_chart(scores)
        sys.exit(0)
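# plot_score_chart is called above but not defined in this excerpt; a minimal
# sketch, assuming a plain matplotlib line plot of the per-episode scores:
import matplotlib.pyplot as plt


def plot_score_chart(scores):
    fig, ax = plt.subplots()
    ax.plot(range(len(scores)), scores)  # one point per training episode
    ax.set(xlabel='Episode #', ylabel='Score')
    ax.grid()
    plt.show()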
def play(env):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    agent = Agent(state_size=states.shape[1],
                  action_size=action_size,
                  random_seed=2)
    agent.reset()
    agent.actor_local.load_state_dict(torch.load(ACTOR_WEIGHTS))
    agent.critic_local.load_state_dict(torch.load(CRITIC_WEIGHTS))
    scores = []
    score = np.zeros((2, ))
    while True:
        agent.reset()
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]
        states = env_info.vector_observations
        score += np.array(env_info.rewards)
        dones = env_info.local_done
        if np.sum(dones) > 0:
            break
    print('Scores: {}'.format(score))
def ddpg_dual(n_episodes=5000, max_t=2000, solved_at=0.5):
    sharedActor = Agent(state_size=state_size,
                        action_size=action_size,
                        random_seed=2)
    avg_score = []
    scores_deque = deque(maxlen=100)
    best_score = 0.0
    env_solved = False
    for i_episode in range(1, n_episodes + 1):
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations  # get the current state (for each agent)
        sharedActor.reset()
        for t in range(max_t):
            actions = sharedActor.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                sharedActor.step(state, action, reward, next_state, done, t)
            states = next_states
            scores += rewards  # update the score (for each agent)
            if np.any(dones):  # exit loop if episode finished
                break
        score = np.max(scores)
        avg_score.append(score)
        scores_deque.append(score)
        print('\rEpisode:{} \tScore:{:.3f} \tAverage Score: {:.3f} solved:{}'.format(
            i_episode, score, np.mean(scores_deque), env_solved), end="")
        if i_episode % 10 == 0:
            print("\n")
        if score > best_score and np.mean(scores_deque) >= solved_at:
            if not env_solved:
                env_solved = True
                print('\nEnv solved in {:d} episodes!\tAverage Score ={:.3f} over last {} Episodes'
                      .format(i_episode - 100, np.mean(scores_deque), 100))
            torch.save(sharedActor.actor_local.state_dict(), "actor.pth")
            torch.save(sharedActor.critic_local.state_dict(), "critic.pth")
            best_score = score
            break
    return avg_score
import gym
import numpy as np
import rospy
import rospkg
import torch

from ddpg_agent import Agent  # assumed: local module providing the DDPG Agent


class DdpgDeepleng():

    def __init__(self):
        # Create the Gym environment
        self.env = gym.make('DeeplengDocking-v1')
        rospy.loginfo("Gym environment done")
        self.agent = Agent(state_size=13, action_size=3, random_seed=2)
        # Set the logging system
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/training_results'
        # env = wrappers.Monitor(env, outdir, force=True)
        # rospy.loginfo("Monitor Wrapper started")
        self.max_episodes = 200
        self.max_timesteps = 1000

    def __call__(self, *args, **kwargs):
        scores = []
        for episode in range(1, self.max_episodes + 1):
            state = self.env.reset()
            self.agent.reset()
            score = 0
            print("==========================================================================")
            print("Episode no. {}".format(episode))
            print("==========================================================================")
            for stp in range(1, self.max_timesteps + 1):
                # print("___________________________________________________________________________")
                print("Step no. {}".format(stp))
                # print("Current state: {}".format([round(elem, 2) for elem in state]))
                print("Current state: {}".format(state))
                action = self.agent.act(np.array(state))
                print("Action taken: {}".format(action))
                next_state, reward, done, _ = self.env.step(action)
                print("Reward for action: {}".format(reward))
                print("Next state: {}".format(next_state))
                self.agent.step(state, action, reward, next_state, done)
                state = np.array(next_state)
                score += reward
                if done:
                    # print("Done")
                    break
            print("___________________________________________________________________________")
            scores.append(score)
            torch.save(
                self.agent.actor_local.state_dict(),
                '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_actor.pth')
            torch.save(
                self.agent.critic_local.state_dict(),
                '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_critic.pth')
        self.env.close()
        return scores
def ddpg(n_episodes=2000, store_every=10):
    scores_deque = deque(maxlen=store_every)
    scores = []
    agents = Agent(state_size=state_size,
                   action_size=action_size,
                   num_agents=num_agents,
                   random_seed=0)
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
        state = env_info.vector_observations
        agents.reset()
        score = np.zeros(num_agents)
        while True:
            action = agents.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agents.step(state, action, rewards, next_state, dones)
            state = next_state
            score += rewards
            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        avg_score = np.mean(scores_deque)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}\t {}'.format(
            i_episode, np.mean(scores_deque), np.mean(score),
            strftime("%H:%M:%S", gmtime())), end="")
        if i_episode % store_every == 0 or avg_score >= TARGET_SCORE:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))
        if avg_score >= TARGET_SCORE:
            torch.save(agents.actor_local.state_dict(),
                       "ckpt/{}".format(ACTOR_CHECKPOINT_NAME))
            torch.save(agents.critic_local.state_dict(),
                       "ckpt/{}".format(CRITIC_CHECKPOINT_NAME))
            break
    return scores
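# The function above relies on module-level globals defined elsewhere in its
# repo. A hypothetical configuration block with illustrative values only:
GRAPHICS_OFF = True   # passed as train_mode to env.reset()
TARGET_SCORE = 30.0   # Reacher is considered solved at +30 over 100 episodes
ACTOR_CHECKPOINT_NAME = 'checkpoint_actor.pth'
CRITIC_CHECKPOINT_NAME = 'checkpoint_critic.pth'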
def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau, lr_actor,
               lr_critic, weight_decay):
    scores = []
    scores_deque = deque(maxlen=100)
    agent = Agent(n_agents, state_size, action_size, seed, buffer_size,
                  batch_size, gamma, tau, lr_actor, lr_critic, weight_decay)
    load(agent)
    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        agent.reset()  # reset the agent noise
        score = np.zeros(n_agents)
        while True:
            actions = agent.act(states)
            # send the actions to the environment
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done  # see if episode has finished
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards  # update the score
            states = next_states  # roll over the state to next time step
            if np.any(dones):  # exit loop if episode finished
                break
        scores.append(np.mean(score))
        scores_deque.append(np.mean(score))
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'.format(
            i_episode, np.mean(score), np.mean(scores_deque)), end="")
        if i_episode % 10 == 0:  # checkpoint every 10 episodes
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if np.mean(scores_deque) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))
            break
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")
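# load() is called above (and in trained_agent further below) but not defined
# in this excerpt; a minimal sketch, assuming the usual checkpoint file names:
def load(agent):
    try:
        agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
        agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
    except FileNotFoundError:
        pass  # no checkpoint yet; start from randomly initialised weights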
def main():
    env = UnityEnvironment(file_name='data/Reacher_Linux/Reacher.x86_64')
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    # size of each action
    action_size = brain.vector_action_space_size
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    n_agent = 20
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=2,
                  n_agent=n_agent)
    # load trained model
    agent.actor_local.load_state_dict(torch.load('model/checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load('model/checkpoint_critic.pth'))
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    for t in range(1000):
        action = [
            agent.act(state[agent_x], agent_x, add_noise=False)
            for agent_x in range(n_agent)
        ]
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done
        state = next_state
        if all(done):
            break
    env.close()
def trained_agent():
    agent = Agent(n_agents, state_size, action_size, 0, 0, 0, 0, 0, 0, 0, 0)
    load(agent)
    for episode in range(3):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        score = np.zeros(n_agents)
        while True:
            actions = agent.act(states, add_noise=False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            score += rewards
            states = next_states
            if np.any(dones):
                break
        print('Episode: \t{} \tScore: \t{:.2f}'.format(episode, np.mean(score)))
    env.close()
class Worker(mp.Process):

    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        super(Worker, self).__init__()
        self.name = 'w{}'.format(name)
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        # local agent sharing the global networks and optimizers
        self.agent = Agent(state_size, action_size, gnet['actor'],
                           gnet['critic'], opt['actor_optimizer'],
                           opt['critic_optimizer'], random_seed)
        self.env = gym.make('LunarLanderContinuous-v2')

    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            state = self.env.reset()
            ep_r = 0.
            self.agent.reset()
            for t in range(MAX_EP_STEP):
                # if self.name == 'w1':
                #     self.env.render()
                action = self.agent.act(state)
                action = np.clip(action, -1, 1)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done, t)
                if t == MAX_EP_STEP - 1:
                    done = True
                ep_r += reward
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # time to sync
                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break
                state = next_state
                total_step += 1
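# A sketch of how such workers are typically launched. The shared networks
# and optimizers (gnet, opt) and the constants MAX_EP, MAX_EP_STEP, and
# UPDATE_GLOBAL_ITER live elsewhere in this repo; this block is illustrative.
if __name__ == '__main__':
    global_ep = mp.Value('i', 0)      # shared episode counter
    global_ep_r = mp.Value('d', 0.)   # shared running episode reward
    res_queue = mp.Queue()            # workers push results here
    workers = [
        Worker(gnet, opt, global_ep, global_ep_r, res_queue, i)
        for i in range(mp.cpu_count())
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()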
def ddpg(n_episodes=1000, max_t=500, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []
    # Create the env and the agent
    terminating_angle = 15
    env = CubeEnv(np.deg2rad(terminating_angle))
    agent = Agent(state_size=3, action_size=1, random_seed=2)
    plotter = LivePlotter(env, max_t, terminating_angle, n_episodes)
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0
        done = False
        plotter.reset()
        while not done:
            # Select the next action and update the system
            action = agent.act(state) * 10
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            # Update plots and metrics
            state = next_state
            score += reward
            plotter.add_data_from_env(env)
        scores_deque.append(score)
        scores.append(score)
        plotter.add_score(score)
        print('\rEpisode {}\tScore: {}'.format(i_episode, score), end="")
        # Display the plots
        plotter.display()
        # Save the model
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
    return scores
def test():
    agent = Agent(state_size=33, action_size=4, seed=0)
    load_model(agent.critic_local, 'solved_critic_trained_model.pth')
    load_model(agent.actor_local, 'solved_actor_trained_model.pth')
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    score = np.zeros(1)
    while True:
        action = agent.act(state, 0, False)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations  # get the next state
        reward = env_info.rewards  # get the reward
        done = env_info.local_done  # see if episode has finished
        state = next_state
        score += reward
        if np.any(done):
            print('\r\tTest Score: {:.2f}'.format(score[0]), end="")
            break
print_every = 100
iterations = 20
scores_deque = deque(maxlen=print_every)
scores = []
for episode in range(episodes):
    # Reset the environment
    cur_state = env.reset(seed=episode)
    score = 0
    for i in range(iterations + 1):
        # Predict the best action for the current state.
        action = agent.act(cur_state, add_noise=True)
        # The action is performed and the new state, reward, and info are received.
        new_state, reward, done, info = env.step(action)
        print("episode: ", episode, " sample: ", i, " reward: ", reward)
        # The current state, action, reward, and new state are stored in the experience replay buffer.
        agent.step(cur_state, action, reward, new_state, done)
        # roll over to the new state
        cur_state = new_state
        score += reward
        if done:
            break
    scores_deque.append(score)
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# training parameters
n_episodes = 4000
print_every = 100
scores_deque = deque(maxlen=print_every)
scores_final = []
agent = Agent(state_size, action_size, num_agents, random_seed=2)

# ----------------------- training the agents ----------------------- #
for i_episode in range(n_episodes):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    while True:
        actions = agent.act(states)  # select an action (for each agent)
        env_info = env.step(actions)[brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent), shape (2, 24)
        rewards = env_info.rewards  # get reward (for each agent)
        dones = env_info.local_done  # see if episode finished
        agent.step(states, actions, rewards, next_states, dones)
        scores += env_info.rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    scores_deque.append(max(scores))
    scores_final.append(scores)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores)),
          end="")
def main():
    # load the Tennis environment
    env_name = "Tennis_Windows_x86_64\\Tennis.exe"  # Unity environment binary
    no_graphics = False
    env = UnityEnvironment(file_name=env_name, no_graphics=no_graphics)

    # Environments contain brains, which are responsible for deciding the
    # actions of their associated agents. Here we check for the first brain
    # available and set it as the default brain we control from Python.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print("Number of agents : ", num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print("Size of each action : ", action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print("There are {} agents. Each observes a state with length: {}".format(
        states.shape[0], state_size))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    random_seed = 12345  # 10
    agent = Agent(state_size, action_size, num_agents, random_seed, device=device)

    actor_state_dict1 = torch.load("checkpoint_actor1.pth")
    agent.actor_local1.load_state_dict(actor_state_dict1)
    critic_state_dict1 = torch.load("checkpoint_critic1.pth")
    agent.critic_local1.load_state_dict(critic_state_dict1)
    actor_state_dict2 = torch.load("checkpoint_actor2.pth")
    agent.actor_local2.load_state_dict(actor_state_dict2)
    critic_state_dict2 = torch.load("checkpoint_critic2.pth")
    agent.critic_local2.load_state_dict(critic_state_dict2)

    # run the trained agents in the environment
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    while True:
        actions = agent.act(states, add_noise=False)  # select an action (for each agent)
        env_info = env.step(actions)[brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent)
        rewards = env_info.rewards  # get reward (for each agent)
        dones = env_info.local_done  # see if episode finished
        scores += env_info.rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    print('Total score (averaged over agents) this episode: {}'.format(
        np.mean(scores)))

    # when finished, close the environment
    env.close()
def main():
    env = UnityEnvironment(file_name='Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=3)
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, 1000):
        begin = time.time()
        curr_scores = np.zeros(num_agents)  # initialize the score (for each agent)
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        agent.reset()
        for t in range(1000):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            agent.step(states, actions, rewards, next_states, dones, t)
            states = next_states
            curr_scores += rewards
            if np.any(dones):
                break
        curr_score = np.mean(curr_scores)
        scores_deque.append(curr_score)
        average_score = np.mean(scores_deque)
        scores.append(curr_score)
        print('\rEpisode {}\tTime: {:.2f}\tAvg: {:.2f}\tScore: {:.2f}\tMin {:.2f}\tMax {:.2f}'
              .format(i_episode, time.time() - begin, average_score, curr_score,
                      min(curr_scores), max(curr_scores)))
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if average_score >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, average_score))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    env.close()
    return
# load partially matching weights into the actor
pretrained_dict = torch.load('checkpoint_actor.pth')
model_dict = agent.actor_local.state_dict()
# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 3. load the new state dict
agent.actor_local.load_state_dict(model_dict)

pretrained_dict = torch.load('checkpoint_critic.pth')
model_dict = agent.critic_local.state_dict()
# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 3. load the new state dict
agent.critic_local.load_state_dict(model_dict)

state = env.reset()
agent.reset()
while True:
    action = agent.act(state)[0]
    env.render()
    # time.sleep(0.1)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    if done:
        break
env.env.close()
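# The same filter/update/load pattern appears twice above; a small helper,
# offered as a sketch (the name load_partial_state_dict is illustrative):
def load_partial_state_dict(model, checkpoint_path):
    pretrained_dict = torch.load(checkpoint_path)
    model_dict = model.state_dict()
    # keep only the keys the current model actually has
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)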
def train_ddpg(dev, weights_file_actor, weights_file_critic, n_episodes=1000,
               max_t=1000):
    """DDPG learning.

    Params
    ======
        dev (string): cpu or gpu
        weights_file_actor (string): name of the file to save the weights of the actor
        weights_file_critic (string): name of the file to save the weights of the critic
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []  # scores from each episode (average over all the agents)
    averages = []  # averages of the scores; position i (1-indexed) holds the average of the last min(i, 100) episodes
    scores_window = deque(maxlen=100)  # last 100 averaged scores for all the agents
    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, random_seed=0, device=dev)
    print('Number of agents: {:d}'.format(num_agents))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()  # reset noise for the actions
        states = env_info.vector_observations
        current_scores = np.zeros(num_agents)  # initialize the score for all the agents
        for t in range(max_t):
            actions = agent.act(states)  # process the states of all the agents
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            current_scores += rewards
            if np.any(dones):
                break
        max_score = np.max(current_scores)  # current maximum score of all the agents
        scores.append(max_score)
        scores_window.append(max_score)
        averages.append(np.mean(scores_window))
        if i_episode % 100 != 0:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]), end="")
        else:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]))
        if averages[i_episode - 1] >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                  .format(i_episode - 100, averages[i_episode - 1]))
            torch.save(agent.actor_local.state_dict(), weights_file_actor)
            torch.save(agent.critic_local.state_dict(), weights_file_critic)
            break
    env.close()
    return scores, averages
def test(dev, weights_file_actor, weights_file_critic, n_episodes=100,
         max_t=1000):
    """Test the environment with the parameters stored in checkpoint.pth

    Params
    ======
        dev (string): cpu or gpu
        weights_file_actor (string): name of the file to load the weights of the actor
        weights_file_critic (string): name of the file to load the weights of the critic
        n_episodes (int): number of test episodes that will be performed
    """
    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, random_seed=0, device=dev)
    scores = []
    # load the weights from file
    print('Number of agents: {:d}'.format(num_agents))
    print('Loading weights')
    try:
        checkpoint_actor = torch.load(weights_file_actor)
    except FileNotFoundError:
        print('Error: File \'{}\' not found'.format(weights_file_actor))
        sys.exit(1)
    try:
        checkpoint_critic = torch.load(weights_file_critic)
    except FileNotFoundError:
        print('Error: File \'{}\' not found'.format(weights_file_critic))
        sys.exit(1)
    agent.actor_local.load_state_dict(checkpoint_actor)
    agent.critic_local.load_state_dict(checkpoint_critic)
    print('Running {} episodes'.format(n_episodes))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        current_scores = np.zeros(num_agents)  # initialize the score for all the agents
        states = env_info.vector_observations
        for t in range(max_t):
            actions = agent.act(states, add_noise=False)
            env_info = env.step(actions)[brain_name]
            states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            current_scores += rewards
            if np.any(dones):
                break
        max_score = np.max(current_scores)  # current maximum score of all the agents
        scores.append(max_score)
        if i_episode % 100 != 0:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, np.mean(scores)), end="")
        else:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, np.mean(scores)))
    env.close()
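# A hypothetical command-line entry point tying train_ddpg and test together;
# the flag names and defaults below are illustrative, not part of the script:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--actor', default='checkpoint_actor.pth')
    parser.add_argument('--critic', default='checkpoint_critic.pth')
    args = parser.parse_args()
    if args.mode == 'train':
        train_ddpg(args.device, args.actor, args.critic)
    else:
        test(args.device, args.actor, args.critic)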
# print(action_size, state_size)
import torch

if condition[0] == "random":
    pass
else:
    agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
    agent.actor_target.load_state_dict(torch.load('checkpoint_actor_target.pth'))
    agent.critic_target.load_state_dict(torch.load('checkpoint_critic_target.pth'))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state (for each agent)
scores = np.zeros(num_agents)  # initialize the score (for each agent)
while True:
    actions = [agent.act(states[no_agent, :]) for no_agent in range(20)]
    actions = np.array(actions).reshape(20, 4)
    actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]  # send all actions to the environment
    next_states = env_info.vector_observations  # get next state (for each agent)
    rewards = env_info.rewards  # get reward (for each agent)
    dones = env_info.local_done  # see if episode finished
    scores += env_info.rewards  # update the score (for each agent)
    states = next_states  # roll over states to next time step
    if np.any(dones):  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(
    np.mean(scores)))
def ddpg(model_number, UPD, BUFFER_SIZE, BATCH_SIZE, LR_ACTOR, LR_CRITIC,
         fc1_units, fc2_units, a_gradient_clipping, a_leaky, a_dropout,
         c_gradient_clipping, c_batch_norm, c_leaky, c_dropout,
         n_episodes=400, max_t=2000, print_every=100):
    """Deep Deterministic Policy Gradient (DDPG).

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        ...
    """
    agent = Agent(state_size, action_size, UPD, BUFFER_SIZE, BATCH_SIZE,
                  LR_ACTOR, LR_CRITIC, fc1_units, fc2_units,
                  a_gradient_clipping, a_leaky, a_dropout, c_gradient_clipping,
                  c_batch_norm, c_leaky, c_dropout, 0, 12345)
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=print_every)  # last 100 scores
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, a_dropout, a_leaky)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        with open('results.txt', 'a') as output:
            output.writelines(
                '{}, {}, {:.2f}, {:.2f}, {}, {}, {}, {:.4f}, {:.4f}, {}, {}, {}, {}, {}, {}, {}, {}, {} \n'
                .format(model_number, i_episode, np.mean(scores_window), score,
                        UPD, BUFFER_SIZE, BATCH_SIZE, LR_ACTOR, LR_CRITIC,
                        fc1_units, fc2_units, a_gradient_clipping, a_leaky,
                        a_dropout, c_gradient_clipping, c_batch_norm, c_leaky,
                        c_dropout))
            output.flush()
        print('\rModel nr: {}, Episode {}, avg. score: {:.2f}, score: {:.2f}'.format(
            model_number, i_episode, np.mean(scores_window), score), end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_window), score))
        if np.mean(scores_window) >= 30.0:
            with open('./models/models_solved.txt', 'a') as solved:
                solved.writelines('{}, {} \n'.format(model_number, i_episode))
                solved.flush()
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(),
                       './models/checkpoint_actor_' + str(model_number) + '.pth')
            torch.save(agent.critic_local.state_dict(),
                       './models/checkpoint_critic_' + str(model_number) + '.pth')
            break
    return scores
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

##################################
########### LOAD AGENT ###########
##################################
agent = Agent(state_size=state_size,
              action_size=action_size,
              num_agents=num_agents,
              seed=1)
agent.load_model(path_to_actor, path_to_critic)

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state
score = np.zeros(num_agents)  # initialize the score
while True:
    actions = agent.act(states)  # select an action
    env_info = env.step(actions)[brain_name]  # send the action to the environment
    next_states = env_info.vector_observations  # get the next state
    rewards = env_info.rewards  # get the reward
    dones = env_info.local_done  # see if the episode has finished
    score += rewards  # update the score
    states = next_states  # roll over the state to next time step
    if np.any(dones):
        break
print("Score: {}".format(score))
env.close()
def train_or_play(cfg):
    # initialize the environment and obtain state/action sizes and other parameters
    env = init_environment(cfg.app_path)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    state_size = len(env_info.vector_observations[0])
    agent = Agent(state_size, action_size, cfg)
    if cfg.train_model:
        scores = ddpg_learning(env,
                               agent,
                               brain_name,
                               n_episodes=cfg.n_episodes,
                               max_t=cfg.max_t,
                               avg_score_cutoff=cfg.avg_score_cutoff,
                               save_path_actor=cfg.save_path_actor,
                               save_path_critic=cfg.save_path_critic)
        if cfg.save_scores:
            print("Saving scores to file {:s}".format(cfg.save_scores))
            scores.to_hdf(cfg.save_scores, "scores")
        plot_scores(scores, cfg)
    else:
        # visualize the trained model and scores
        assert os.path.exists(cfg.save_path_actor), \
            "Saved model weights need to exist before you can watch a trained agent!"
        assert os.path.exists(cfg.save_path_critic), \
            "Saved model weights need to exist before you can watch a trained agent!"
        print("Visualizing the trained agent!")
        env_info = env.reset(train_mode=False)[brain_name]
        agent.actor_local.load_state_dict(torch.load(cfg.save_path_actor))
        agent.critic_local.load_state_dict(torch.load(cfg.save_path_critic))
        score = 0  # initialize the score
        state = env_info.vector_observations[0]
        while True:
            action = agent.act(state, add_noise=False)  # take a step without noise
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            if done:
                break
        if os.path.exists(cfg.save_scores):
            plot_scores(pd.read_hdf(cfg.save_scores, "scores"), cfg)
    env.close()
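# train_or_play expects a configuration object; a minimal sketch of the fields
# it reads, with illustrative values (the types and defaults are assumptions):
from types import SimpleNamespace

cfg = SimpleNamespace(
    app_path='Reacher.app',          # path to the Unity environment binary
    train_model=True,                # False to watch a trained agent instead
    n_episodes=500,
    max_t=1000,
    avg_score_cutoff=30.0,
    save_path_actor='checkpoint_actor.pth',
    save_path_critic='checkpoint_critic.pth',
    save_scores='scores.h5',
)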
agent_2.critic_target = agent_1.critic_target
t_max = 1000
print_every = 100
maxlen = 100
score = []
ev_score = []
scores_deque = deque(maxlen=maxlen)
for i_episode in range(1, env.n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    agent_1.reset()
    agent_2.reset()
    for t in range(t_max):
        actions_1 = agent_1.act(np.expand_dims(states[0], 0), True)
        actions_2 = agent_2.act(np.expand_dims(states[1], 0), True)
        # actions_1 = np.clip(actions_1, -1, 1)  # all actions between -1 and 1
        actions = np.concatenate((actions_1, actions_2))
        env_info = env.step(actions)[brain_name]  # send all actions to the environment
        next_states, rewards, dones = (env_info.vector_observations,
                                       env_info.rewards, env_info.local_done)
        # for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        agent_1.step(np.expand_dims(states[0], 0), actions_1, rewards[0],
                     np.expand_dims(next_states[0], 0), dones[0], t)
        agent_2.step(np.expand_dims(states[1], 0), actions_2, rewards[1],
                     np.expand_dims(next_states[1], 0), dones[1], t)
        scores += rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agent = Agent(num_agents=num_agents,
              state_size=state_size,
              action_size=action_size,
              random_seed=0)

# load the weights from file
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state (for each agent)
scores = np.zeros(num_agents)
for t in range(1000):
    actions = agent.act(states, add_noise=False)
    env_info = env.step(actions)[brain_name]  # send the actions to the environment
    rewards = env_info.rewards  # get the reward
    dones = env_info.local_done  # see if the episode has finished
    states = env_info.vector_observations  # get the next state
    scores += rewards
    if np.any(dones):
        break
print("max score for the episode is", np.max(scores))
env.close()
def train(
    n_episodes,
    max_t,
    env_fp,
    no_graphics,
    seed,
    save_every_nth,
    buffer_size,
    batch_size,
    gamma,
    tau,
    lr_actor,
    lr_critic,
    weight_decay,
    log,
):
    log.info("#### Initializing environment...")
    # init environment
    env = UnityEnvironment(file_name=env_fp, no_graphics=no_graphics)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    log.info(f"Number of agents: {num_agents}")
    # size of each action
    action_size = brain.vector_action_space_size
    log.info(f"Size of each action: {action_size}")
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    log.info(
        f"There are {states.shape[0]} agents. Each observes a state with length: {state_size}"
    )
    log.info(f"The state for the first agent looks like: {states[0]}")
    agent = Agent(
        num_agents=len(env_info.agents),
        state_size=state_size,
        action_size=action_size,
        buffer_size=buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        tau=tau,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        weight_decay=weight_decay,
        random_seed=seed,
    )
    log.info("#### Training...")
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros((len(env_info.agents), 1))
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = np.array(env_info.rewards).reshape((next_states.shape[0], 1))
            dones = np.array(env_info.local_done).reshape((next_states.shape[0], 1))
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        print(
            "Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(
                i_episode, np.mean(scores_deque), scores[-1]),
            end="\r",
        )
        if i_episode % 100 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}".format(
                i_episode, np.mean(scores_deque)))
        if i_episode % save_every_nth == 0:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint.pth",
            )
            plot_scores(
                scores=scores,
                title=f"Avg score over {len(env_info.agents)} agents",
                fname="avg_scores.png",
                savefig=True,
            )
        if np.mean(scores_deque) >= 30:
            torch.save(agent.actor_local.state_dict(), "checkpoint_actor.pth")
            torch.save(agent.critic_local.state_dict(), "checkpoint_critic.pth")
            print("\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}"
                  .format(i_episode - 100, np.mean(scores_deque)))
            break
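# save_checkpoint is not shown in this excerpt; a minimal sketch, assuming it
# simply serialises the given dict with torch.save:
def save_checkpoint(state, filename="checkpoint.pth"):
    torch.save(state, filename)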
action = 0
state_size = len(state)
# b_agent = Agent(args.model_name, state_size, action_size)
try:
    b_agent.load()  # try to load saved weights to continue training
except Exception:
    pass
for epx in range(1, args.episodes + 1):
    at_step = 0
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    b_agent.reset_episode()
    while True:
        action = b_agent.act(state)
        env_info = env.step(action)[brain_name]
        at_step += 1
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        if at_step % 100 == 0:
            log.info("ep:{} step:{} r:{} l:{}".format(
                epx, at_step, b_agent.cum_rewards(), b_agent.ave_loss()))
        if done:
            break
        b_agent.sense(state, action, reward, next_state, done)
        state = next_state
    print("{},{}".format(epx, b_agent.cum_rewards()))
    b_agent.save()
def ddpg(n_episodes=500, max_t=1000, start_steps=10, learn_frequency=20,
         learn_count=10, random_seed=1):
    """Deep Deterministic Policy Gradient (DDPG).

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        start_steps (int): number of starting episodes during which actions are chosen randomly
        learn_frequency (int): how often (in timesteps) learning is triggered
        learn_count (int): number of learning steps to do at a learning timestep
        random_seed (int): random seed for the agent's weights
    """
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=random_seed)  # initialize the agent
    avg_scores_episode = []  # list containing scores from each episode
    avg_scores_moving = []  # list containing window-averaged scores at each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state for each agent
        scores = np.zeros(num_agents)  # initialize the score for each agent
        agent.reset()  # reset the noise of the agent
        for t in range(max_t):
            # randomly sample actions during the starting episodes
            if i_episode <= start_steps:
                actions = np.random.randn(num_agents, action_size)  # select an action randomly
                actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
            else:
                actions = agent.act(states, add_noise=True)  # select an action according to the policy (for each agent)
            env_info = env.step(actions)[brain_name]  # send actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if the episode has finished (for each agent)
            # for each agent's experience, save it and learn
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                if t % learn_frequency == 0:  # learn at the chosen frequency
                    agent.step(state, action, reward, next_state, done,
                               learn=True, learn_count=learn_count)
                else:
                    agent.step(state, action, reward, next_state, done,
                               learn=False)  # just add the experience, don't learn
            states = next_states
            scores += rewards  # add the rewards from the timestep to the scores
            if np.any(dones):  # finish the episode if any agent has reached a terminal state
                break
        scores_window.append(np.mean(scores))  # save the most recent score to the scores window
        avg_scores_episode.append(np.mean(scores))  # save the most recent score
        avg_scores_moving.append(np.mean(scores_window))  # save the most recent window average
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 1 == 0:  # print every episode
            print('\rEpisode {}\tAverage Score: {:.2f} \t Current Score: {:.2f}'.format(
                i_episode, np.mean(scores_window), np.mean(scores)))
        # the environment is solved
        if np.mean(scores_window) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(),
                       "checkpoint_actor.pth")  # save the actor's weights
            torch.save(agent.critic_local.state_dict(),
                       "checkpoint_critic.pth")  # save the critic's weights
            break
    # return the average score of each episode and the moving average at that time
    return avg_scores_episode, avg_scores_moving
def multi_agent_ddpg(env, brain_name, title, n_episodes, action_size,
                     state_size, num_agents, print_every, n_updates,
                     update_intervals, device):
    # create a save dir for this experiment
    if title is None:
        title = "experiment"
    current_time = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
    title = title + "_" + current_time
    # write a new file
    os.makedirs("experiments/{}".format(title), exist_ok=True)
    f = open("experiments/{}/scores.txt".format(title), "w")
    f.close()
    all_agents_statesize = state_size * num_agents
    agent1 = Agent(state_size=all_agents_statesize,
                   action_size=action_size,
                   num_agents=1,
                   random_seed=123,
                   device=device)
    agent2 = Agent(state_size=all_agents_statesize,
                   action_size=action_size,
                   num_agents=1,
                   random_seed=123,
                   device=device)
    scores_deque = deque(maxlen=100)
    mean_scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        # reshape so we can feed both agents' states to each agent
        states = np.reshape(states, (1, all_agents_statesize))
        # reset
        agent1.reset()
        agent2.reset()
        # place to store scores
        agent_scores = np.zeros(num_agents)
        t = 0
        while True:
            # the two agents' actions
            actions_1 = agent1.act(states, add_noise=True)
            actions_2 = agent2.act(states, add_noise=True)
            # step the environment for both agents and get the next states
            actions = np.concatenate((actions_1, actions_2), axis=0)
            actions = np.reshape(actions, (1, 4))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            next_states = np.reshape(next_states, (1, all_agents_statesize))
            rewards = env_info.rewards
            dones = env_info.local_done
            # update the agents accordingly (ddpg)
            agent1.step(states, actions_1, rewards[0], next_states, dones[0],
                        n_updates, update_intervals, t)
            agent2.step(states, actions_2, rewards[1], next_states, dones[1],
                        n_updates, update_intervals, t)
            states = next_states
            agent_scores += rewards
            if np.any(dones):
                break
            t += 1
        scores_deque.append(np.max(agent_scores))
        print('\rEpisode {}\tLast 100 average Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)), end="")
        # save the score and model every print_every episodes
        if i_episode % print_every == 0:
            f = open("experiments/{}/scores.txt".format(title), "a")
            f.write("{},{}\n".format(i_episode, np.mean(scores_deque)))
            f.close()
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            mean_scores.append(np.mean(scores_deque))
            # save if this is the best model so far
            if np.mean(scores_deque) == max(mean_scores):
                torch.save(agent1.actor_local.state_dict(),
                           'experiments/{}/checkpoint_actor1.pth'.format(title))
                torch.save(agent1.critic_local.state_dict(),
                           'experiments/{}/checkpoint_critic1.pth'.format(title))
                torch.save(agent2.actor_local.state_dict(),
                           'experiments/{}/checkpoint_actor2.pth'.format(title))
                torch.save(agent2.critic_local.state_dict(),
                           'experiments/{}/checkpoint_critic2.pth'.format(title))
        if np.mean(scores_deque) >= 1.0 and i_episode > 100:
            print("\rEnvironment solved with an average score of at least 1.0 "
                  "over the last 100 episodes")
            break
n_episodes = 15
max_t = 300
print_every = 100
scores_deque = deque(maxlen=print_every)
scores = []
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    agent.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_deque.append(score)
    scores.append(score)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_deque)), end="")
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')