def run():
    # Create training and evaluation environments
    env, state_dim, action_dim, max_steps = make_env(
        env_params=Namespace(**env_params))
    env_eval, state_dim, action_dim, max_steps = make_env(
        env_params=Namespace(**env_params))

    # Create agent trainers
    obs_shape_n = state_dim
    act_shape_n = action_dim
    maa2c = MAA2C(env, env_params['n_agents'], obs_shape_n, act_shape_n,
                  max_steps=max_steps)

    episodes = []
    eval_rewards = []
    while maa2c.n_episodes < MAX_EPISODES:
        maa2c.interact()
        maa2c.train()
        if maa2c.episode_done and (maa2c.n_episodes % EVAL_INTERVAL == 0):
            rewards, _ = maa2c.evaluation(env_eval, EVAL_EPISODES)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f, STD %.2f" %
                  (maa2c.n_episodes, rewards_mu, rewards_std))
            episodes.append(maa2c.n_episodes)
            eval_rewards.append(rewards_mu)
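# The runners in this file aggregate evaluation rewards with agg_double_list,
# which is not defined here. A minimal sketch of the assumed helper: it sums the
# per-step rewards of each evaluation episode and returns the mean and standard
# deviation across episodes. The exact behavior is an assumption.

import numpy as np


def agg_double_list(l):
    # l is a list of episodes; each episode is a list of per-step rewards
    # (or per-step reward vectors in the multi-agent case).
    s = [np.sum(np.array(l_i), 0) for l_i in l]
    s_mu = np.mean(np.array(s), 0)
    s_std = np.std(np.array(s), 0)
    return s_mu, s_std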
def run(env_id="AttFC_GyroErr-MotorVel_M4_Con-v0"): env = gym.make(env_id) env = RewScale(env, 0.1) env.seed(RANDOM_SEED) env_eval = gym.make(env_id) env_eval = RewScale(env_eval, 0.1) env_eval.seed(RANDOM_SEED) state_dim = env.observation_space.shape[0] if len(env.action_space.shape) > 1: action_dim = env.action_space.shape[0] else: action_dim = env.action_space.shape[0] ppo = PPO(env=env, memory_capacity=MEMORY_CAPACITY, state_dim=state_dim, action_dim=action_dim, batch_size=BATCH_SIZE, entropy_reg=ENTROPY_REG, done_penalty=DONE_PENALTY, roll_out_n_steps=ROLL_OUT_N_STEPS, target_update_steps=TARGET_UPDATE_STEPS, target_tau=TARGET_TAU, reward_gamma=REWARD_DISCOUNTED_GAMMA, epsilon_start=EPSILON_START, epsilon_end=EPSILON_END, epsilon_decay=EPSILON_DECAY, max_grad_norm=MAX_GRAD_NORM, episodes_before_train=EPISODES_BEFORE_TRAIN, critic_loss=CRITIC_LOSS) episodes = [] eval_rewards = [] while ppo.n_episodes < MAX_EPISODES: ppo.interact() if ppo.n_episodes >= EPISODES_BEFORE_TRAIN: ppo.train() if ppo.episode_done and ((ppo.n_episodes + 1) % EVAL_INTERVAL == 0): rewards, _ = ppo.evaluation(env_eval, EVAL_EPISODES) rewards_mu, rewards_std = agg_double_list(rewards) print("Episode %d, Average Reward %.2f" % (ppo.n_episodes + 1, rewards_mu)) episodes.append(ppo.n_episodes + 1) eval_rewards.append(rewards_mu) episodes = np.array(episodes) eval_rewards = np.array(eval_rewards) np.savetxt("./output/%s_ppo_episodes.txt" % env_id, episodes) np.savetxt("./output/%s_ppo_eval_rewards.txt" % env_id, eval_rewards) plt.figure() plt.plot(episodes, eval_rewards) plt.title("%s" % env_id) plt.xlabel("Episode") plt.ylabel("Average Reward") plt.legend(["PPO"]) plt.savefig("./output/%s_ppo.png" % env_id)
def test_final(self, actor_weight_file, critic_weight_file):
    # Evaluate the agent by computing cumulative rewards per episode.
    # Interaction with the environment happens here regardless of whether a
    # replay memory is used.

    # Load the trained weights
    self.a2c.load_weights(actor_weight_file, critic_weight_file)

    # Set the environment configuration for the transfer task
    if self.environment_name == 'cp-v0':
        self.env.env.my_init(self.G * 10.5, self.MC * 0.9, self.MP * 2.5,
                             self.L * 1.5, self.F * 2.5)
    elif self.environment_name == 'Bipedal-v0':
        self.env.env.my_init(self.F * 2.5)

    # Short fine-tuning phase on the new configuration
    num_minitrain_episodes = 10
    while self.a2c.n_episodes < num_minitrain_episodes:
        self.a2c.interact()
        self.a2c.train()

    episodes = []
    eval_rewards = []
    num_episodes = 40 + num_minitrain_episodes
    while self.a2c.n_episodes < num_episodes:
        self.a2c.interact()
        rewards, _ = self.a2c.evaluation(self.env_eval, 1)
        rewards_mu, rewards_std = agg_double_list(rewards)
        print("Episode %d, Average Reward %.2f" %
              (self.a2c.n_episodes + 1, rewards_mu))
        episodes.append(self.a2c.n_episodes + 1)
        eval_rewards.append(rewards_mu)

    episodes = np.array(episodes)
    eval_rewards = np.array(eval_rewards)

    # Report mean and standard deviation of the evaluation rewards
    mean_reward = np.mean(eval_rewards)
    stddev_reward = np.std(eval_rewards)
    print("Mean Reward: {}\nStd. dev: {}".format(mean_reward, stddev_reward))

    # Save the plot (save before showing so the figure is not cleared)
    base_path = os.path.join(self.environment_name, 'a2c_plot_test')
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    file_name = os.path.join(base_path, 'Average_reward.png')
    plt.figure()
    plt.plot(episodes, eval_rewards)
    plt.title("%s" % self.environment_name)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.legend(["A2C"])
    plt.savefig(file_name)
    plt.show()
def run(env_id="CartPole-v0"): env = gym.make(env_id) env.seed(RANDOM_SEED) env_eval = gym.make(env_id) env_eval.seed(RANDOM_SEED) state_dim = env.observation_space.shape[0] if len(env.action_space.shape) > 1: action_dim = env.action_space.shape[0] else: action_dim = env.action_space.n dqn = DQN(env=env, memory_capacity=MEMORY_CAPACITY, state_dim=state_dim, action_dim=action_dim, batch_size=BATCH_SIZE, max_steps=MAX_STEPS, done_penalty=DONE_PENALTY, critic_loss=CRITIC_LOSS, reward_gamma=REWARD_DISCOUNTED_GAMMA, epsilon_start=EPSILON_START, epsilon_end=EPSILON_END, epsilon_decay=EPSILON_DECAY, max_grad_norm=MAX_GRAD_NORM, episodes_before_train=EPISODES_BEFORE_TRAIN) episodes = [] eval_rewards = [] while dqn.n_episodes < MAX_EPISODES: dqn.interact() if dqn.n_episodes >= EPISODES_BEFORE_TRAIN: dqn.train() if dqn.episode_done and ((dqn.n_episodes + 1) % EVAL_INTERVAL == 0): rewards, _ = dqn.evaluation(env_eval, EVAL_EPISODES) rewards_mu, rewards_std = agg_double_list(rewards) print("Episode %d, Average Reward %.2f" % (dqn.n_episodes + 1, rewards_mu)) episodes.append(dqn.n_episodes + 1) eval_rewards.append(rewards_mu) episodes = np.array(episodes) eval_rewards = np.array(eval_rewards) np.savetxt("./output/%s_dqn_episodes.txt" % env_id, episodes) np.savetxt("./output/%s_dqn_eval_rewards.txt" % env_id, eval_rewards) plt.figure() plt.plot(episodes, eval_rewards) plt.title("%s" % env_id) plt.xlabel("Episode") plt.ylabel("Average Reward") plt.legend(["DQN"]) plt.savefig("./output/%s_dqn.png" % env_id)
def run(env_id="Pendulum-v0"): env = gym.make(env_id) env.seed(RANDOM_SEED) env_eval = gym.make(env_id) env_eval.seed(RANDOM_SEED) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] ddpg = DDPG(env=env, memory_capacity=MEMORY_CAPACITY, state_dim=state_dim, action_dim=action_dim, batch_size=BATCH_SIZE, max_steps=MAX_STEPS, done_penalty=DONE_PENALTY, target_update_steps=TARGET_UPDATE_STEPS, target_tau=TARGET_TAU, reward_gamma=REWARD_DISCOUNTED_GAMMA, critic_loss=CRITIC_LOSS, epsilon_start=EPSILON_START, epsilon_end=EPSILON_END, epsilon_decay=EPSILON_DECAY, max_grad_norm=MAX_GRAD_NORM, episodes_before_train=EPISODES_BEFORE_TRAIN) episodes = [] eval_rewards = [] while ddpg.n_episodes < MAX_EPISODES: ddpg.interact() if ddpg.n_episodes >= EPISODES_BEFORE_TRAIN: ddpg.train() if ddpg.episode_done and ((ddpg.n_episodes + 1) % EVAL_INTERVAL == 0): rewards, _ = ddpg.evaluation(env_eval, EVAL_EPISODES) rewards_mu, rewards_std = agg_double_list(rewards) print("Episode: %d, Average Reward: %.5f" % (ddpg.n_episodes + 1, rewards_mu)) episodes.append(ddpg.n_episodes + 1) eval_rewards.append(rewards_mu) episodes = np.array(episodes) eval_rewards = np.array(eval_rewards) np.savetxt("./output/%s_ddpg_episodes.txt" % env_id, episodes) np.savetxt("./output/%s_ddpg_eval_rewards.txt" % env_id, eval_rewards) plt.figure() plt.plot(episodes, eval_rewards) plt.xlabel("Episode") plt.ylabel("Average Reward") plt.legend(["DDPG"]) plt.savefig("./output/%s_ddpg.png" % env_id)
def train(self, render=1):
    # Train the network. Without an experience replay memory, the agent
    # interacts with the environment here while updating its parameters.
    # With a replay memory, transitions are stored here and the model is
    # updated from sampled batches.
    episodes = []
    eval_rewards = []
    while self.a2c.n_episodes < self.max_episodes:
        self.a2c.interact()
        if self.a2c.n_episodes >= self.episodes_before_train:
            self.a2c.train()
        if self.a2c.episode_done and ((self.a2c.n_episodes + 1) % self.eval_iterval == 0):
            rewards, _ = self.a2c.evaluation(self.env_eval, self.eval_episodes)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f" %
                  (self.a2c.n_episodes + 1, rewards_mu))
            episodes.append(self.a2c.n_episodes + 1)
            eval_rewards.append(rewards_mu)

            # Save the weights at every evaluation point
            print("=> Saving weights after {} episodes".format(
                self.a2c.n_episodes + 1))
            self.a2c.save_weights(self.environment_name, self.a2c.n_episodes + 1)

    episodes = np.array(episodes)
    eval_rewards = np.array(eval_rewards)

    # Save the evaluation-reward plot
    base_path = os.path.join(self.environment_name, 'a2c_plot_eval')
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    file_name = os.path.join(base_path, 'Average_reward.png')
    plt.figure()
    plt.plot(episodes, eval_rewards)
    plt.title("%s" % self.environment_name)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.legend(["A2C"])
    plt.savefig(file_name)
def run(arglist):
    # Create training and evaluation environments
    env = make_env(arglist.scenario, arglist, arglist.benchmark)
    env_eval = make_env(arglist.scenario, arglist, arglist.benchmark)

    # Create agent trainers
    num_adversaries = min(env.n, arglist.num_adversaries)
    obs_shape_n = [env.observation_space[i].shape[0] for i in range(env.n)]
    act_shape_n = [env.action_space[i].n for i in range(env.n)]
    maa2c = MAA2C(env, env.n, obs_shape_n, act_shape_n)

    episodes = []
    eval_rewards = []
    while maa2c.n_episodes < MAX_EPISODES:
        maa2c.interact()
        maa2c.train()
        if maa2c.episode_done and (maa2c.n_episodes % EVAL_INTERVAL == 0):
            rewards, _ = maa2c.evaluation(env_eval, EVAL_EPISODES)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f, STD %.2f" %
                  (maa2c.n_episodes, rewards_mu, rewards_std))
            episodes.append(maa2c.n_episodes)
            eval_rewards.append(rewards_mu)
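# The make_env above takes a scenario name and an arglist, which matches the
# multi-agent particle environment (MPE) setup used by the MADDPG reference code.
# A hedged sketch of such a helper, assuming the multiagent package is installed;
# the exact constructor arguments may differ from the version used here:

def make_env(scenario_name, arglist, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios
    # Load the scenario module and build the world it defines
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    return env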
def train_maml(self, render=1):
    sample_size = 10
    theta_list = []
    K = 1
    num_iterations = 50000
    task_list = []
    plt.figure()

    # Sample a pool of tasks by perturbing the environment parameters
    for i in range(num_iterations * sample_size):
        if self.environment_name == 'cp-v0':
            task = {
                'G': np.random.uniform(self.range[0] * self.G, self.range[1] * self.G, 1)[0],
                'MC': np.random.uniform(self.range[0] * self.MC, self.range[1] * self.MC, 1)[0],
                'MP': np.random.uniform(self.range[0] * self.MP, self.range[1] * self.MP, 1)[0],
                'L': np.random.uniform(self.range[0] * self.L, self.range[1] * self.L, 1)[0],
                'F': np.random.uniform(self.range[0] * self.F, self.range[1] * self.F, 1)[0]
            }
        elif self.environment_name == 'Bipedal-v0':
            task = {
                'F': np.random.uniform(self.range[0] * self.F, self.range[1] * self.F, 1)[0]
            }
        task_list.append(task)
    num_tasks = len(task_list)

    # Outer loop
    for i in range(num_iterations):
        sample_indexes = np.random.randint(0, num_tasks, size=sample_size)

        # Get the current meta-parameters theta
        if i == 0:
            theta_actor_critic = self.a2c.get_weights()

        # Inner loop: first (task-adaptation) gradient step
        for j, sample_index in enumerate(sample_indexes):
            task = task_list[sample_index]
            # Set the task configuration
            if self.environment_name == 'cp-v0':
                self.env.env.my_init(task['G'], task['MC'], task['MP'],
                                     task['L'], task['F'])
            elif self.environment_name == 'Bipedal-v0':
                self.env.env.my_init(task['F'])
            # Reset the model weights to theta before adapting
            self.a2c.set_weights(theta_actor_critic)
            # Train the A2C network on this task for K episodes
            while self.a2c.n_episodes < K:
                self.a2c.interact()
                self.a2c.train()
            if i == 0:
                theta_list.append(self.a2c.get_weights())
            else:
                theta_list[j] = self.a2c.get_weights()

        # Second (meta) gradient step
        for j, sample_index in enumerate(sample_indexes):
            task = task_list[sample_index]
            # Set the task configuration
            if self.environment_name == 'cp-v0':
                self.env.env.my_init(task['G'], task['MC'], task['MP'],
                                     task['L'], task['F'])
            elif self.environment_name == 'Bipedal-v0':
                self.env.env.my_init(task['F'])
            # Load the adapted weights for this task
            self.a2c.set_weights(theta_list[j])
            # Get the network loss for this task for one episode
            # TODO: there should be no while loop here
            # while self.a2c.n_episodes < 1:
            self.a2c.interact()
            combined_loss = self.a2c.get_loss()
            # Set the model weights back to theta
            self.a2c.set_weights(theta_actor_critic)
            # Update theta with the task loss
            self.a2c.update_net(combined_loss)
            theta_actor_critic = self.a2c.get_weights()

        # Evaluate the meta-trained network
        self.a2c.interact()
        rewards, _ = self.a2c.evaluation(self.env_eval, 1)
        rewards_mu, rewards_std = agg_double_list(rewards)
        print("Episode %d, Average Reward %.2f" % (i + 1, rewards_mu))

        # Plot iteration vs. reward
        plt.scatter(i, rewards_mu)
        # plt.pause(0.0001)

        # Save the weights periodically
        if i % self.save_weight_interval == 0 and i != 0:
            self.a2c.save_weights(self.environment_name, i)

    base_path = os.path.join(self.environment_name, 'a2c_plot_train')
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    file_name = os.path.join(base_path, 'Average_reward_train.png')
    plt.title("%s" % self.environment_name)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.legend(["A2C"])
    plt.savefig(file_name)
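# The MAML loop above calls get_weights / set_weights / update_net on the A2C
# agent, none of which are shown in this file. A hypothetical sketch of what the
# weight accessors could look like for a PyTorch actor-critic pair; the method
# names follow the calls above, but the implementation is an assumption.

import copy


class WeightAccessMixin:
    # Assumes self.actor and self.critic are torch.nn.Module instances.

    def get_weights(self):
        # Return detached copies of the actor and critic parameters.
        return (copy.deepcopy(self.actor.state_dict()),
                copy.deepcopy(self.critic.state_dict()))

    def set_weights(self, weights):
        # Restore parameters from an (actor_state, critic_state) pair.
        actor_state, critic_state = weights
        self.actor.load_state_dict(actor_state)
        self.critic.load_state_dict(critic_state)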
def test_final(self, actor_weight_file, critic_weight_file):
    # Evaluate the agent over a sweep of environment configurations by
    # computing cumulative rewards per episode. Interaction with the
    # environment happens here regardless of whether a memory is used.
    episodes = []
    eval_rewards = []
    self.env_fin = gym.make('cp-v0')
    num_episodes = 50
    self.a2c.load_weights(actor_weight_file, critic_weight_file)

    # Parameter grid: gravity, cart mass, pole mass, pole length, force magnitude
    base = np.array([1, 0.5, 2])
    G = np.array([1]) * 9.8
    MC = base * 0.5
    MP = base * 0.1
    L = base * 0.5
    F = base * 10

    fl = open('Experiments.csv', 'w')
    fl.write('List of parameters: Gravity, Mass of Cart, Mass of Pole, '
             'Length, Force Magnitude\n')
    fl.write('Output Reward: Mean, Standard Deviation\n')

    for g in G:
        for mc in MC:
            for mp in MP:
                for l in L:
                    for f in F:
                        self.env_fin.env.my_init(G=g, MC=mc, MP=mp, L=l, F=f)
                        for i in range(num_episodes):
                            self.a2c.interact()
                            rewards, _ = self.a2c.evaluation(self.env_fin, 1)
                            rewards_mu, rewards_std = agg_double_list(rewards)
                            episodes.append(i + 1)
                            eval_rewards.append(rewards_mu)
                        print(g, mc, mp, l, f)
                        # Statistics over this configuration's episodes only
                        config_rewards = eval_rewards[-num_episodes:]
                        rm = float("{0:.2f}".format(np.mean(config_rewards)))
                        rs = float("{0:.2f}".format(np.std(config_rewards)))
                        str_cp = (str(mc) + '& ' + str(mp) + '& ' + str(l) +
                                  '& ' + str(f) + '& ' + str(rm) + ' &' +
                                  str(rs) + '\n')
                        fl.write(str_cp)
                        print("Rewards: Mean: %.2f, Std: %.2f" %
                              (np.mean(config_rewards), np.std(config_rewards)))
    fl.close()

    episodes = np.array(episodes)
    eval_rewards = np.array(eval_rewards)

    # Save the plot
    base_path = os.path.join(self.environment_name, 'a2c_plot_test')
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    file_name = os.path.join(base_path, 'Average_reward.png')
    plt.figure()
    plt.plot(episodes, eval_rewards)
    plt.title("%s" % self.environment_name)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.legend(["A2C"])
    plt.savefig(file_name)