def step(self, current_return):
    """
    Perturb the policy weights by generating neighboring policies and comparing
    the return of each with the current best. If the return of any neighboring
    policy is greater than or equal to the current best, that policy becomes
    the current one.

    Parameters
    ----------
    current_return (float): Return of the current rollout
    """
    super().step(current_return)

    # Check the return from all neighbors
    candidate_returns = [current_return]
    candidate_weights = [self.w]
    for _ in range(self.neighbors):
        policy = deepcopy(self)
        policy.w = self.best_weights + self.noise * np.random.rand(*self.best_weights.shape)
        rewards = run_episode(policy, self.env, self.max_steps)
        policy_return = calculate_returns(self.gamma, rewards)
        candidate_returns.append(policy_return)
        candidate_weights.append(policy.w)

    # Find the max return among the candidates and compare it with our best return
    best_idx = np.argmax(np.array(candidate_returns))
    if candidate_returns[best_idx] >= self.best_return:
        self.best_return = candidate_returns[best_idx]
        self.best_weights = candidate_weights[best_idx]
        self.w = candidate_weights[best_idx]
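# The method above relies on a calculate_returns helper that is not shown in
# this snippet. As an illustration only (not the repository's implementation),
# a discounted-return helper compatible with calculate_returns(self.gamma, rewards)
# could look like this:
def calculate_returns_sketch(gamma, rewards):
    """Hypothetical helper: discounted return G = sum_t gamma^t * r_t."""
    total, discount = 0.0, 1.0
    for r in rewards:
        total += discount * r
        discount *= gamma
    return total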
def train(args, stop_on_solve=True, print_every=100):
    """
    Create a gym environment and train a variant of the hill climbing policy.
    """
    env = gym.make(args.env)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # load the requested hill-climbing variant for this environment
    policy = load_policy(args.policy, env, args.steps, args.gamma)

    avg_return = deque(maxlen=100)
    all_rewards = []
    for episode in range(1, args.episodes + 1):
        # rollout
        rewards = run_episode(policy, env, args.steps)
        # calculate discounted return
        current_return = calculate_returns(args.gamma, rewards)
        # update policy
        policy.step(current_return)
        # record returns
        avg_return.append(current_return)
        all_rewards.append(current_return)

        if episode % print_every == 0:
            logger.info('{}/{} average return of last 100 episodes {}'.format(
                episode, args.episodes, np.mean(avg_return)))
        if stop_on_solve:
            if np.mean(avg_return) >= args.goal:
                print('Env solved in {} episodes'.format(episode))
                break
    return policy, all_rewards, np.mean(avg_return)
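# Hypothetical usage sketch (not from the original repository): train() expects
# an args object exposing env, policy, steps, gamma, episodes and goal, which
# could be built with argparse or a plain namespace. The concrete values below
# are assumptions for illustration only.
from types import SimpleNamespace

example_args = SimpleNamespace(env='CartPole-v0', policy='steepest_ascent',
                               steps=1000, gamma=1.0, episodes=2000, goal=195.0)
# policy, all_returns, final_avg = train(example_args, stop_on_solve=True)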
def run_episodes_with_learned_baseline(policy_model, value_model, env,
                                       num_episodes, discount_factor,
                                       learn_rate_policy, learn_rate_value,
                                       init_temp, stochasticity):
    policy_optimizer = optim.Adam(policy_model.parameters(), learn_rate_policy)
    value_optimizer = optim.Adam(value_model.parameters(), learn_rate_value)
    episode_durations = []
    value_losses = []
    reinforce_losses = []

    for i in range(num_episodes):
        policy_optimizer.zero_grad()
        value_optimizer.zero_grad()

        # collect one episode and compute the two losses
        episode = run_episode(env, policy_model, i, init_temp, stochasticity)
        reinforce_loss = compute_reinforce_loss_with_learned_baseline(
            value_model, episode, discount_factor, env)
        value_loss = compute_value_loss(value_model, episode, discount_factor, env)

        # update the policy and the value (baseline) network separately
        reinforce_loss.backward()
        policy_optimizer.step()
        value_loss.backward()
        value_optimizer.step()

        episode_durations.append(len(episode))
        reinforce_losses.append(reinforce_loss.detach().numpy())
        value_losses.append(value_loss.detach().numpy())
        del episode

    return (np.asanyarray(episode_durations), np.asanyarray(reinforce_losses),
            np.asanyarray(value_losses))
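# compute_reinforce_loss_with_learned_baseline is defined elsewhere. As a sketch
# of the idea only -- subtract a learned state-value baseline from the discounted
# return before weighting the log-probabilities -- and assuming (hypothetically)
# that an episode is a list of (state, log_prob, reward) tuples:
def reinforce_loss_with_baseline_sketch(value_model, episode, discount_factor):
    returns, G = [], 0.0
    for _, _, reward in reversed(episode):
        G = reward + discount_factor * G
        returns.insert(0, G)
    loss = 0.0
    for (state, log_prob, _), G in zip(episode, returns):
        baseline = value_model(state).detach()  # baseline excluded from the policy gradient
        loss = loss - log_prob * (G - baseline)
    return loss / len(episode)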
def train(shared_policy_net, shared_value_net, process_i, args):
    """
    Seed each training process based on its rank, build local copies of the
    policy and value networks, run the policy in the environment, and update
    both networks using the Monte Carlo return. The parameters of the local
    policy and value networks are synchronized with the shared networks after
    every update.

    :param process_i: the rank of this process
    :param args: an object which holds all hyperparameter settings
    """
    # Create env
    env = gym.make(args.env_name)

    # Each training process is initialized with a different seed
    random.seed(process_i)
    np.random.seed(process_i)
    torch.manual_seed(process_i)
    env.seed(process_i)

    # Create local policy and value nets and sync their params with the shared nets
    policy_net = build_policy_net(args)
    policy_net.load_state_dict(shared_policy_net.state_dict())
    pol_optim = RMSprop(policy_net.parameters(), lr=args.lr)

    value_net = build_value_net(args)
    value_net.load_state_dict(shared_value_net.state_dict())
    val_optim = RMSprop(value_net.parameters(), lr=args.lr)

    for episode_i in count():
        episode = run_episode(policy_net, env, args, process_i=process_i)
        if process_i == 0:
            print(f'process: {process_i}, episode: {episode_i}, '
                  f'episode length: {len(episode)}, G: {episode[0].G}')
            sys.stdout.flush()
        train_value_net(value_net, shared_value_net, val_optim, episode)
        value_net.load_state_dict(shared_value_net.state_dict())
        train_policy_net(policy_net, shared_policy_net, pol_optim, episode,
                         value_net, args, process_i=process_i)
        policy_net.load_state_dict(shared_policy_net.state_dict())
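# train_value_net and train_policy_net are not shown in this snippet. A common
# way to propagate a locally computed update into a shared network in this kind
# of multi-process setup -- not necessarily what this repository does -- is to
# copy the local gradients onto the shared parameters before stepping an
# optimizer bound to the shared network:
def push_local_gradients_to_shared_sketch(local_net, shared_net, shared_optim, loss):
    local_net.zero_grad()
    loss.backward()
    for local_p, shared_p in zip(local_net.parameters(), shared_net.parameters()):
        shared_p.grad = local_p.grad.clone()
    shared_optim.step()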
def run_episodes_no_baseline(model, env, num_episodes, discount_factor,
                             learn_rate, init_temp, stochasticity):
    optimizer = optim.Adam(model.parameters(), learn_rate)
    episode_durations = []
    losses = []

    for i in range(num_episodes):
        optimizer.zero_grad()

        episode = run_episode(env, model, i, init_temp, stochasticity)
        loss = compute_reinforce_loss_no_baseline(episode, discount_factor)
        loss.backward()
        optimizer.step()

        losses.append(loss.detach().numpy())
        episode_durations.append(len(episode))
        del episode

    return np.asanyarray(episode_durations), np.asanyarray(losses)
r = pagerank_method.extract_localized_rewards(env)
print("Pagerank GA fitness:" + str(pagerank_method.fitness(GA_policy, P, r)))

# Random
# scores_random = np.zeros(n_tests_random)
# for i in range(n_tests_random):
#     steps = run_episode(env, agent_init_fn, policy=neutral_policy)
#     scores_random[i] = steps
# print("Score Random:" + str(np.mean(scores_random)))

# GA Policy:
scores_GA_pagerank = np.zeros(n_tests_pagerank)
for i in range(n_tests_pagerank):
    steps = run_episode(env, ConsensusAgent, policy=GA_policy)
    scores_GA_pagerank[i] = steps
print("Score GA:" + str(np.mean(scores_GA_pagerank)))

policy = pagerank_method.pagerank_optimize_for_env(env)
pagerank_policy = pagerank_method.optimize_value_iteration(P, env)
neutral_policy = pagerank_policy * 0 + 1. / n_opinions

# PageRank DP + 10% Random
policy = pagerank_policy * (1 - randomness) + neutral_policy * randomness
fitness = pagerank_method.fitness(pagerank_policy, P, r)

scores_pagerank = np.zeros(n_tests_pagerank)
for i in range(n_tests_pagerank):
    steps = run_episode(env, ConsensusAgent, policy=policy)
}
mean_steps_list = []
n_gens = 9999
n_episodes_per_gen = 100
# plot_env = ConsensusEnvironment(n_agents=n_agents, n_opinions=n_opinions, draw=True)

episodes_done = []
for i in range(n_gens):
    episodes_done.append(i * n_episodes_per_gen)
    steps_list = []
    # Generate Data
    for j in range(n_episodes_per_gen):
        agent_storage.new_episode()
        # print("Running Episode: " + str(j))
        steps, observations, actions = run_episode(
            env, agent_storage.get_next_agent, return_details=True)
        steps_list.append(steps)
    mean_steps_list.append(np.mean(steps_list))
    print(np.mean(steps_list))
    # if np.mean(steps_list) < 500 and i % 5 == 0:
    #     run_episode(plot_env, agent_storage.get_next_agent, return_details=True)

plt.figure()
plt.plot(episodes_done, mean_steps_list)
plt.xlabel("Total episodes performed")
plt.ylabel("Mean number of steps")
plt.grid()
plt.show()
observation_list = env.observation_list
observation_dict = {
    observation: idx for idx, observation in enumerate(observation_list)
}

scores_GA_pagerank_online_centralized = np.zeros(n_tests_pagerank * n_gens)
for i in range(n_gens):
    policy = GA_policy
    policy = policy / np.sum(policy, 1)[:, np.newaxis]
    steps_list = []
    # Generate Data
    for j in range(n_tests_pagerank):
        steps, observations, actions = run_episode(env, ConsensusAgent,
                                                   policy=policy,
                                                   return_details=True)
        for id, agent_observations in observations.items():
            for step, o_t in enumerate(agent_observations[:-1]):
                o_t1 = agent_observations[step + 1]
                a = actions[id][step]
                experiences[observation_dict[o_t], observation_dict[o_t1], a] += 1.0
        scores_GA_pagerank_online_centralized[i * n_tests_pagerank + j] = steps
    print("Gen:" + str(i))
    P = experiences / np.sum(experiences, axis=1)[:, np.newaxis, :]
    GA_policy = pagerank_method.optimize_GA(env, P)

print("Done pagerank, centralized")
policy = pagerank_method.pagerank_optimize_for_env(env)
pagerank_policy = pagerank_method.optimize_value_iteration(P, env)
optimizer = torch.optim.Adam(policy_PPO.parameters(), lr=0.01)

# 2. Iteration
for iter in range(NUM_ITER):
    # 2.1 Using theta k to interact with the env
    #     to collect {s_t, a_t} and compute advantage
    #     advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})
    all_obs = []
    all_action = []
    all_advantage = []
    for episode in range(NUM_EPISODE):
        obs_list, action_list, reward_list = run_episode(env, policy_PPO)
        # batch_obs = torch.from_numpy(np.array(obs_list))
        # batch_action = torch.from_numpy(np.array(action_list))
        # batch_reward = torch.from_numpy(calc_advantage(reward_list, gamma=0.9))
        advantage_list = calc_advantage(reward_list, gamma=0.9)
        all_obs.extend(obs_list)
        all_action.extend(action_list)
        all_advantage.extend(advantage_list)

    dataset = PPODataset(obs_list=all_obs,
                         action_list=all_action,
                         advantage_list=all_advantage)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    # optimize theta
    for epoch in range(NUM_EPOCH):
        for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
            # print(batch_obs.shape)
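# calc_advantage is not shown in this snippet. The comment above defines the
# advantage as the reward-to-go from step t; a minimal discounted version that
# matches the call calc_advantage(reward_list, gamma=0.9) might look like this
# (sketch only, not necessarily the original implementation):
import numpy as np

def calc_advantage_sketch(reward_list, gamma=0.9):
    advantages = np.zeros(len(reward_list), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running  # discounted reward-to-go
        advantages[t] = running
    return advantages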
from agents import ConsensusAgent
from utils import run_episode
import pagerank_method
import matplotlib.pyplot as plt
import numpy as np

n_agents = 20
n_opinions = 3
env = ConsensusEnvironment(n_agents=n_agents, n_actions=n_opinions, draw=False)
agent_init_fn = ConsensusAgent

pagerank_policy = pagerank_method.pagerank_optimize_for_env(env)
neutral_policy = pagerank_policy * 0 + 1. / n_opinions

randomness_list = [0.01, 0.02, 0.05, 0.1, 0.2]
mean_steps_list = []
for randomness in randomness_list:
    policy = pagerank_policy * (1 - randomness) + neutral_policy * randomness
    steps_list = []
    for i in range(100):
        steps = run_episode(env, agent_init_fn, policy=policy)
        steps_list.append(steps)
        print(steps)
    mean_steps_list.append(np.mean(steps_list))
    print("Test complete")
    print(np.mean(steps_list))

plt.semilogx(randomness_list, mean_steps_list)
plt.ylabel("Mean number of steps (lower is better)")
plt.xlabel("uniform policy / optimal policy ratio")
plt.show()
# Setup
task = Task(
    init_pose=np.array([0., 0., 10., 0., 0., 0.]),
    target_pos=np.array([0., 0., 10.]),
)
stat = StatCollector()
agent = Agent(task, sess, stat)
saver = tf.train.Saver()

# Run Training
for i_episode in range(num_episodes):
    stat.tick()

    # Train policy and Q-Network
    score, steps = run_episode(sess, agent, task, train=True)
    stat.scalar('episode_steps_train', steps)
    stat.scalar('episode_reward_train', score)
    print('Episode = {:4d}, score train = {:7.3f}, steps = {}'.format(i_episode, score, steps))

    # Evaluate policy
    if i_episode % evaluate_every == 0:
        score, steps = run_episode(sess, agent, task, train=False)
        stat.scalar('episode_steps_eval', steps)
        stat.scalar('episode_reward_eval', score)
        print('Episode = {:4d}, score eval = {:7.3f}, steps = {}'.format(i_episode, score, steps))

        saver.save(sess, model_file)
        plot_training_graphs(stat)
        plt.pause(0.05)
        plt.savefig("./graphs.png")
# # Dyna-Q-learning
# scores_DynaQ_online_decentralized = np.zeros(n_tests)
# for i in range(n_tests):
#     agent_storage = PermanentAgentStorage(env, DynaAgent, k=10, randomness=randomness)
#     # Generate Data
#     steps, observations, actions = run_episode(env, agent_storage.get_next_agent, return_details=True)
#     scores_DynaQ_online_decentralized[i] = steps
#     print(i)
# print("Mean Dyna-Q: " + str(np.mean(scores_DynaQ_online_decentralized)) + " --- S.E.: " + str(np.std(scores_DynaQ_online_decentralized)/np.sqrt(n_tests)))

# DP behaviour - Nonlearning
scores_DP = np.zeros(n_tests)
for i in range(n_tests):
    steps = run_episode(env, ConsensusAgent,
                        policy=pagerank_policy * (1 - randomness) + randomness * neutral_policy,
                        use_joint_actions=False)
    scores_DP[i] = steps
print("Mean DP: " + str(np.mean(scores_DP)) + " --- S.E.: " + str(np.std(scores_DP) / np.sqrt(n_tests)))

scores_Q_online_decentralized = np.zeros(n_tests)
for i in range(n_tests):
    agent_storage = PermanentAgentStorage(env, DynaAgent, k=0, randomness=randomness,
                                          lr=0.0000, Q=Q_init)
target_PPO.load_state_dict(policy_PPO.state_dict())
optimizer = torch.optim.Adam(policy_PPO.parameters(), lr=0.01)
BETA = 1

# 2. Iteration
for iter in range(num_iter):
    # 2.1 Using theta k to interact with the env
    #     to collect {s_t, a_t} and compute advantage
    #     advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})
    all_obs = []
    all_action = []
    all_advantage = []
    for episode in range(num_episode):
        obs_list, action_list, reward_list = run_episode(env, target_PPO)
        # batch_obs = torch.from_numpy(np.array(obs_list))
        # batch_action = torch.from_numpy(np.array(action_list))
        # batch_reward = torch.from_numpy(calc_advantage(reward_list, gamma=0.9))
        advantage_list = calc_advantage(reward_list, gamma=0.9)
        all_obs.extend(obs_list)
        all_action.extend(action_list)
        all_advantage.extend(advantage_list)

    dataset = PPODataset(obs_list=all_obs,
                         action_list=all_action,
                         advantage_list=all_advantage)
    dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

    # optimize theta
    for epoch in range(num_epoch):
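# The inner optimization loop is truncated above. Given BETA and the frozen
# target_PPO acting as the old/behavior policy, the update is presumably a
# KL-penalty PPO surrogate. The sketch below is an assumption for illustration
# (the actual loss, network outputs, and variable names are not shown in this
# snippet); it assumes both networks output action probabilities of shape
# (batch, n_actions) and that batch_action holds integer action indices.
import torch

def ppo_kl_penalty_loss_sketch(policy_PPO, target_PPO, batch_obs, batch_action,
                               batch_adv, beta=1.0):
    new_probs = policy_PPO(batch_obs)
    with torch.no_grad():
        old_probs = target_PPO(batch_obs)
    idx = batch_action.long().unsqueeze(1)
    ratio = new_probs.gather(1, idx).squeeze(1) / (old_probs.gather(1, idx).squeeze(1) + 1e-8)
    kl = (old_probs * (torch.log(old_probs + 1e-8) - torch.log(new_probs + 1e-8))).sum(dim=1)
    # maximize ratio * advantage while penalizing divergence from the old policy
    return -(ratio * batch_adv).mean() + beta * kl.mean()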
n_episodes_per_gen = 100
# plot_env = ConsensusEnvironment(n_agents=n_agents, n_opinions=n_opinions, draw=True)
randomness = 0.1

episodes_done = []
for i in range(n_gens):
    episodes_done.append(i * n_episodes_per_gen)
    # randomness = randomness * 0.9
    policy = (1 - randomness) * pagerank_policy + randomness * neutral_policy
    policy = policy / np.sum(policy, 1)[:, np.newaxis]
    steps_list = []
    # Generate Data
    for j in range(n_episodes_per_gen):
        steps, observations, actions = run_episode(env, agent_init_fn,
                                                   policy=policy,
                                                   return_details=True)
        steps_list.append(steps)
        for id, agent_observations in observations.items():
            for step, o_t in enumerate(agent_observations[:-1]):
                o_t1 = agent_observations[step + 1]
                a = actions[id][step]
                experiences[observation_dict[o_t], observation_dict[o_t1], a] += 1.0
    mean_steps_list.append(np.mean(steps_list))
    P = experiences / np.sum(experiences, axis=1)[:, np.newaxis, :]
    pagerank_policy = pagerank_method.optimize_value_iteration(P, env)
    print(np.mean(steps_list))
    # if np.mean(steps_list) < 500 and i % 5 == 0:
    #     run_episode(plot_env, agent_init_fn, policy=policy, return_details=True)
from agents import ConsensusAgent
from agents import DQNAgent
from agents import PermanentAgentStorage
# imports used by this snippet but missing from the excerpt
# (module paths taken from the companion scripts above)
from utils import run_episode
import pagerank_method
import numpy as np

# Environment setup:
n_agents = 10
n_opinions = 2
randomness = 0.2
n_tests = 50
env = ConsensusEnvironment(n_agents=n_agents, n_opinions=n_opinions, draw=False)
P = pagerank_method.pagerank_find_P(env)

agent_storage = PermanentAgentStorage(env, DQNAgent, k=0, randomness=randomness)

scores_DQN_online_decentralized = np.zeros(n_tests)
for i in range(n_tests):
    # Generate Data
    steps = run_episode(env,
                        agent_storage.get_next_agent,
                        use_extended_observation=True,
                        return_details=False)
    scores_DQN_online_decentralized[i] = steps
    print(steps)

print(np.mean(scores_DQN_online_decentralized))