Example #1
    def step(self, current_return):
        """
        Perturb policy weights by generating neighboring policies and
        comparing the return from each policy with current best.
        if the return from any of neighboring policy is greater than 
        or equal to current best then we use that policy as our current.

        Parameters
        ----------
        current_return (int): Return of current rollout
        """        
        super().step(current_return)
        # Check the return from all neighbors
        candidate_returns = [current_return]
        candidate_weights = [self.w]        
        for _ in range(self.neighbors):
            policy = deepcopy(self)
            policy.w = self.best_weights + self.noise * np.random.rand(*self.best_weights.shape)
            rewards = run_episode(policy, self.env, self.max_steps)
            policy_return = calculate_returns(self.gamma, rewards)
            candidate_returns.append(policy_return)
            candidate_weights.append(policy.w)

        # Find the max return from candidate returns and 
        # compare it with our best return
        best_idx = np.argmax(np.array(candidate_returns))
        if candidate_returns[best_idx] >= self.best_return:
            self.best_return = candidate_returns[best_idx]
            self.best_weights = candidate_weights[best_idx]
            self.w = candidate_weights[best_idx]
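# The step() method above also relies on run_episode() and calculate_returns(),
# which are not shown in this listing. A minimal sketch of the latter, assuming
# it simply computes the discounted sum of an episode's rewards (hypothetical):
def calculate_returns(gamma, rewards):
    """Hypothetical helper: discounted return G = sum_t gamma**t * r_t."""
    return sum((gamma ** t) * r for t, r in enumerate(rewards))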
Example #2
def train(args, stop_on_solve=True, print_every=100):
    """
    Create the gym environment and train a variant of a hill-climbing policy.
    """
    env = gym.make(args.env)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    # args provides: env, policy, episodes, steps, gamma, goal
    policy = load_policy(args.policy, env, args.steps, args.gamma)

    avg_return = deque(maxlen=100)
    all_rewards = []
    for episode in range(1, args.episodes + 1):
        # rollout
        rewards = run_episode(policy, env, args.steps)
        # calculate discounted return
        current_return = calculate_returns(args.gamma, rewards)
        # update policy
        policy.step(current_return)
        # record returns
        avg_return.append(current_return)
        all_rewards.append(current_return)

        if episode % print_every == 0:
            logger.info('{}/{} average return of last 100 episodes {}'.format(
                episode, args.episodes, np.mean(avg_return)))

        if stop_on_solve:
            if np.mean(avg_return) >= args.goal:
                logger.info('Environment solved in {} episodes'.format(episode))
                break

    return policy, all_rewards, np.mean(avg_return)
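# A hedged usage sketch for train() above. The attribute names mirror the
# args.* accesses in the function body; the concrete values (environment id,
# policy name, goal score) are illustrative assumptions, not taken from the
# original project:
from argparse import Namespace

args = Namespace(env='CartPole-v0', policy='hill_climbing', episodes=2000,
                 steps=1000, gamma=1.0, goal=195.0)
trained_policy, all_rewards, avg100 = train(args, stop_on_solve=True)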
Example #3
def run_episodes_with_learned_baseline(policy_model, value_model, env,
                                       num_episodes, discount_factor,
                                       learn_rate_policy, learn_rate_value,
                                       init_temp, stochasticity):
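    """
    Train policy_model with REINFORCE and value_model as a learned baseline,
    using separate Adam optimizers. Returns arrays of episode durations,
    REINFORCE losses, and value losses.
    """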
    policy_optimizer = optim.Adam(policy_model.parameters(), learn_rate_policy)
    value_optimizer = optim.Adam(value_model.parameters(), learn_rate_value)

    episode_durations = []
    value_losses = []
    reinforce_losses = []

    for i in range(num_episodes):
        policy_optimizer.zero_grad()
        value_optimizer.zero_grad()

        episode = run_episode(env, policy_model, i, init_temp, stochasticity)
        reinforce_loss = compute_reinforce_loss_with_learned_baseline(
            value_model, episode, discount_factor, env)
        value_loss = compute_value_loss(value_model, episode, discount_factor,
                                        env)

        reinforce_loss.backward()
        policy_optimizer.step()

        value_loss.backward()
        value_optimizer.step()

        episode_durations.append(len(episode))
        reinforce_losses.append(reinforce_loss.detach().numpy())
        value_losses.append(value_loss.detach().numpy())

        del episode

    return np.asanyarray(episode_durations), np.asanyarray(
        reinforce_losses), np.asanyarray(value_losses)
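# compute_reinforce_loss_with_learned_baseline() and compute_value_loss() are
# not shown above. A generic sketch of the quantities such helpers usually
# compute (an assumption, not the original implementation): given per-step
# log-probabilities, discounted returns G_t, and baseline values V(s_t) as
# tensors of equal length,
import torch
import torch.nn.functional as F

def sketch_losses(log_probs, returns, values):
    # detach the baseline so the policy gradient does not flow into the critic
    advantages = returns - values.detach()
    reinforce_loss = -(advantages * log_probs).sum()
    value_loss = F.mse_loss(values, returns)
    return reinforce_loss, value_loss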
Example #4
def train(shared_policy_net, shared_value_net, process_i, args):
    """
    Seeds each training process based on its rank.
    Build local version of policy and value network.
    Run the policy in the environment and update the policy and value network using Monte Carlo return.
    Synchronize the params of both policy and value network with the shared policy and value network after every update.

    :param process_i: the rank of this process
    :param args: an object which holds all hyperparam setting
    """

    # Create env
    env = gym.make(args.env_name)

    # Each training process is initialized with a different seed
    random.seed(process_i)
    np.random.seed(process_i)
    torch.manual_seed(process_i)
    env.seed(process_i)

    # create local policy and value net and sync params
    policy_net = build_policy_net(args)
    policy_net.load_state_dict(shared_policy_net.state_dict())

    pol_optim = RMSprop(policy_net.parameters(), lr=args.lr)

    value_net = build_value_net(args)
    value_net.load_state_dict(shared_value_net.state_dict())

    val_optim = RMSprop(value_net.parameters(), lr=args.lr)

    for episode_i in count():
        episode = run_episode(policy_net, env, args, process_i=process_i)

        if process_i == 0:
            print(
                f'process: {process_i}, episode: {episode_i}, episode length: {len(episode)}, G: {episode[0].G}'
            )
            sys.stdout.flush()

        train_value_net(value_net, shared_value_net, val_optim, episode)
        value_net.load_state_dict(shared_value_net.state_dict())

        train_policy_net(policy_net,
                         shared_policy_net,
                         pol_optim,
                         episode,
                         value_net,
                         args,
                         process_i=process_i)
        policy_net.load_state_dict(shared_policy_net.state_dict())
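# How train() above is launched is not part of this excerpt. A minimal sketch,
# assuming torch.multiprocessing and that build_policy_net()/build_value_net()
# accept the same args object (hypothetical driver code, not the original):
import torch.multiprocessing as mp

def launch(args, num_processes=4):
    shared_policy_net = build_policy_net(args)
    shared_value_net = build_value_net(args)
    # place parameters in shared memory so every worker updates the same tensors
    shared_policy_net.share_memory()
    shared_value_net.share_memory()
    workers = []
    for rank in range(num_processes):
        p = mp.Process(target=train,
                       args=(shared_policy_net, shared_value_net, rank, args))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()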
Example #5
def run_episodes_no_baseline(model, env, num_episodes, discount_factor,
                             learn_rate, init_temp, stochasticity):
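    """
    Train `model` with plain REINFORCE (no baseline) for num_episodes episodes.
    Returns arrays of episode durations and losses.
    """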
    optimizer = optim.Adam(model.parameters(), learn_rate)
    episode_durations = []
    losses = []

    for i in range(num_episodes):
        optimizer.zero_grad()

        episode = run_episode(env, model, i, init_temp, stochasticity)
        loss = compute_reinforce_loss_no_baseline(episode, discount_factor)

        loss.backward()
        optimizer.step()

        losses.append(loss.detach().numpy())
        episode_durations.append(len(episode))

        del episode

    return np.asanyarray(episode_durations), np.asanyarray(losses)
Example #6
r = pagerank_method.extract_localized_rewards(env)
print("Pagerank GA fitness:" + str(pagerank_method.fitness(GA_policy, P, r)))

# Random
# scores_random = np.zeros(n_tests_random)
# for i in range(n_tests_random):
#     steps = run_episode(env, agent_init_fn, policy=neutral_policy)
#     scores_random[i] = steps
#
# print("Score Random:" + str(np.mean(scores_random)))

# GA Policy:
scores_GA_pagerank = np.zeros(n_tests_pagerank)
for i in range(n_tests_pagerank):
    steps = run_episode(env, ConsensusAgent, policy=GA_policy)
    scores_GA_pagerank[i] = steps

print("Score GA:" + str(np.mean(scores_GA_pagerank)))

policy = pagerank_method.pagerank_optimize_for_env(env)
pagerank_policy = pagerank_method.optimize_value_iteration(P, env)
neutral_policy = pagerank_policy * 0 + 1. / n_opinions

# PageRank DP + 10% Random
policy = pagerank_policy * (1 - randomness) + neutral_policy * randomness
fitness = pagerank_method.fitness(pagerank_policy, P, r)

scores_pagerank = np.zeros(n_tests_pagerank)
for i in range(n_tests_pagerank):
    steps = run_episode(env, ConsensusAgent, policy=policy)
    scores_pagerank[i] = steps
Example #7

mean_steps_list = []
n_gens = 9999
n_episodes_per_gen = 100
# plot_env = ConsensusEnvironment(n_agents=n_agents, n_opinions=n_opinions, draw=True)
episodes_done = []
for i in range(n_gens):
    episodes_done.append(i * n_episodes_per_gen)

    steps_list = []

    # Generate Data
    for j in range(n_episodes_per_gen):
        agent_storage.new_episode()
        # print("Running Episode: " + str(j))
        steps, observations, actions = run_episode(
            env, agent_storage.get_next_agent, return_details=True)
        steps_list.append(steps)
    mean_steps_list.append(np.mean(steps_list))

    print(np.mean(steps_list))
    # if np.mean(steps_list)<500 and i%5 == 0:
    #    run_episode(plot_env, agent_storage.get_next_agent, return_details=True)

plt.figure()
plt.plot(episodes_done, mean_steps_list)
plt.xlabel("Total episodes performed")
plt.ylabel("Mean number of steps")
plt.grid()
plt.show()
Example #8
observation_list = env.observation_list
observation_dict = {
    observation: idx
    for idx, observation in enumerate(observation_list)
}

scores_GA_pagerank_online_centralized = np.zeros(n_tests_pagerank * n_gens)
for i in range(n_gens):
    policy = GA_policy
    policy = policy / np.sum(policy, 1)[:, np.newaxis]

    steps_list = []
    # Generate Data
    for j in range(n_tests_pagerank):
        steps, observations, actions = run_episode(env,
                                                   ConsensusAgent,
                                                   policy=policy,
                                                   return_details=True)
        for id, agent_observations in observations.items():
            for step, o_t in enumerate(agent_observations[:-1]):
                o_t1 = agent_observations[step + 1]
                a = actions[id][step]
                experiences[observation_dict[o_t], observation_dict[o_t1],
                            a] += 1.0
        scores_GA_pagerank_online_centralized[i * n_tests_pagerank + j] = steps
    print("Gen:" + str(i))
    P = experiences / np.sum(experiences, (1))[:, np.newaxis, :]
    GA_policy = pagerank_method.optimize_GA(env, P)
print("Done pagerank, centralized")

policy = pagerank_method.pagerank_optimize_for_env(env)
pagerank_policy = pagerank_method.optimize_value_iteration(P, env)
Example #9
optimizer = torch.optim.Adam(policy_PPO.parameters(), lr=0.01)

# 2. Iteration
for iter in range(NUM_ITER):
    # 2.1 Use theta_k to interact with the env to collect {s_t, a_t}
    # and compute the reward-to-go "advantage":
    #     advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^n
    
    all_obs = []
    all_action = []
    all_advantage = []
    for episode in range(NUM_EPISODE):
        obs_list, action_list, reward_list = run_episode(env, policy_PPO)
        # batch_obs = torch.from_numpy(np.array(obs_list))
        # batch_action = torch.from_numpy(np.array(action_list))
        # batch_reward = torch.from_numpy(calc_advantage(reward_list, gamma=0.9))
        advantage_list = calc_advantage(reward_list, gamma=0.9)
        all_obs.extend(obs_list)
        all_action.extend(action_list)
        all_advantage.extend(advantage_list)
    dataset = PPODataset(obs_list=all_obs, action_list=all_action, advantage_list=all_advantage)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    # optimize theta
    
    for epoch in range(NUM_EPOCH):
        for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
            # print(batch_obs.shape)
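            # The listing stops here. A hedged sketch of how this inner update
            # could continue, assuming policy_PPO(batch_obs) returns action
            # logits; since no old-policy network appears in this excerpt, this
            # is a plain policy-gradient step (illustrative only):
            logits = policy_PPO(batch_obs.float())
            log_prob = torch.distributions.Categorical(logits=logits).log_prob(batch_action)
            loss = -(log_prob * batch_adv.float()).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()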
Example #10
from agents import ConsensusAgent
from utils import run_episode
import pagerank_method
import matplotlib.pyplot as plt
import numpy as np

n_agents = 20
n_opinions = 3
env = ConsensusEnvironment(n_agents=n_agents, n_opinions=n_opinions, draw=False)
agent_init_fn = ConsensusAgent

pagerank_policy = pagerank_method.pagerank_optimize_for_env(env)
neutral_policy = pagerank_policy * 0 + 1. / n_opinions
randomness_list = [0.01, 0.02, 0.05, 0.1, 0.2]

mean_steps_list = []
for randomness in randomness_list:
    policy = pagerank_policy * (1 - randomness) + neutral_policy * randomness
    steps_list = []
    for i in range(100):
        steps = run_episode(env, agent_init_fn, policy=policy)
        steps_list.append(steps)
        print(steps)
    mean_steps_list.append(np.mean(steps_list))
    print("Test complete")
    print(np.mean(steps_list))

plt.semilogx(randomness_list, mean_steps_list)
plt.ylabel("Mean number of steps (lower is better)")
plt.xlabel("uniform policy / optimal policy ratio")
plt.show()
Example #11
    # Setup
    task = Task(
        init_pose=np.array([0., 0., 10., 0., 0., 0.]),
        target_pos=np.array([0., 0., 10.]),
    )
    stat = StatCollector()
    agent = Agent(task, sess, stat)
    saver = tf.train.Saver()

    # Run Training
    for i_episode in range(num_episodes):
        stat.tick()

        # Train policy and Q-Network
        score, steps = run_episode(sess, agent, task, train=True)
        stat.scalar('episode_steps_train', steps)
        stat.scalar('episode_reward_train', score)
        print('Episode = {:4d}, score train = {:7.3f}, steps = {}'.format(i_episode, score, steps))

        # Evaluate policy
        if i_episode % evaluate_every == 0:
            score, steps = run_episode(sess, agent, task, train=False)
            stat.scalar('episode_steps_eval', steps)
            stat.scalar('episode_reward_eval', score)
            print('Episode = {:4d},  score eval = {:7.3f}, steps = {}'.format(i_episode, score, steps))
            saver.save(sess, model_file)

        plot_training_graphs(stat)
        plt.pause(0.05)
        plt.savefig("./graphs.png")
Example #12
# # Dyna-Q-learning
# scores_DynaQ_online_decentralized = np.zeros(n_tests)
# for i in range(n_tests):
#     agent_storage = PermanentAgentStorage(env, DynaAgent, k=10, randomness=randomness)
#     # Generate Data
#     steps, observations, actions = run_episode(env, agent_storage.get_next_agent, return_details=True)
#     scores_DynaQ_online_decentralized[i] = steps
#     print(i)
# print("Mean Dyna-Q: " + str(np.mean(scores_DynaQ_online_decentralized)) + " --- S.E.: " + str(np.std(scores_DynaQ_online_decentralized)/np.sqrt(n_tests)))
#
# DP behaviour - Nonlearning
scores_DP = np.zeros(n_tests)
for i in range(n_tests):
    steps = run_episode(env,
                        ConsensusAgent,
                        policy=pagerank_policy * (1 - randomness) +
                        randomness * neutral_policy,
                        use_joint_actions=False)
    scores_DP[i] = steps

print("Mean DP: " + str(np.mean(scores_DP)) + " --- S.E.: " +
      str(np.std(scores_DP) / np.sqrt(n_tests)))

scores_Q_online_decentralized = np.zeros(n_tests)
for i in range(n_tests):
    agent_storage = PermanentAgentStorage(env,
                                          DynaAgent,
                                          k=0,
                                          randomness=randomness,
                                          lr=0.0000,
                                          Q=Q_init)
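    # The excerpt ends here; based on the commented Dyna-Q block above, the
    # loop presumably continues along these lines (a sketch, not the original):
    steps, observations, actions = run_episode(env,
                                               agent_storage.get_next_agent,
                                               return_details=True)
    scores_Q_online_decentralized[i] = steps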
Example #13
target_PPO.load_state_dict(policy_PPO.state_dict())

optimizer = torch.optim.Adam(policy_PPO.parameters(), lr=0.01)
BETA = 1

#2. Iteration
for iter in range(num_iter):
    # 2.1 Use theta_k to interact with the env to collect {s_t, a_t}
    # and compute the reward-to-go "advantage":
    #     advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^n

    all_obs = []
    all_action = []
    all_advantage = []
    for episode in range(num_episode):
        obs_list, action_list, reward_list = run_episode(env, target_PPO)
        # batch_obs = torch.from_numpy(np.array(obs_list))
        # batch_action = torch.from_numpy(np.array(action_list))
        # batch_reward = torch.from_numpy(calc_advantage(reward_list, gamma=0.9))
        advantage_list = calc_advantage(reward_list, gamma=0.9)
        all_obs.extend(obs_list)
        all_action.extend(action_list)
        all_advantage.extend(advantage_list)
    dataset = PPODataset(obs_list=all_obs,
                         action_list=all_action,
                         advantage_list=all_advantage)
    dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

    # optimize theta

    for epoch in range(num_epoch):
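        # The listing stops here. A hedged sketch of a KL-penalty style PPO
        # update that would fit the surrounding code, with target_PPO as the
        # old policy and BETA as the KL coefficient (illustrative only,
        # assuming both networks return action logits):
        for batch_obs, batch_action, batch_adv in dataloader:
            new_dist = torch.distributions.Categorical(logits=policy_PPO(batch_obs.float()))
            with torch.no_grad():
                old_dist = torch.distributions.Categorical(logits=target_PPO(batch_obs.float()))
            ratio = torch.exp(new_dist.log_prob(batch_action) - old_dist.log_prob(batch_action))
            kl = torch.distributions.kl_divergence(old_dist, new_dist).mean()
            loss = -(ratio * batch_adv.float()).mean() + BETA * kl
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # After the optimization epochs, the old policy would typically be refreshed:
    # target_PPO.load_state_dict(policy_PPO.state_dict())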
Example #14
n_episodes_per_gen = 100
#plot_env = ConsensusEnvironment(n_agents=n_agents, n_opinions=n_opinions, draw=True)
randomness = 0.1
episodes_done = []
for i in range(n_gens):
    episodes_done.append(i * n_episodes_per_gen)
    #randomness = randomness*0.9
    policy = ((1 - randomness) * pagerank_policy +
              (randomness) * neutral_policy)
    policy = policy / np.sum(policy, 1)[:, np.newaxis]

    steps_list = []
    # Generate Data
    for j in range(n_episodes_per_gen):
        steps, observations, actions = run_episode(env,
                                                   agent_init_fn,
                                                   policy=policy,
                                                   return_details=True)
        steps_list.append(steps)
        for id, agent_observations in observations.items():
            for step, o_t in enumerate(agent_observations[:-1]):
                o_t1 = agent_observations[step + 1]
                a = actions[id][step]
                experiences[observation_dict[o_t], observation_dict[o_t1],
                            a] += 1.0
    mean_steps_list.append(np.mean(steps_list))
    P = experiences / np.sum(experiences, (1))[:, np.newaxis, :]
    pagerank_policy = pagerank_method.optimize_value_iteration(P, env)
    print(np.mean(steps_list))
    #if np.mean(steps_list)<500 and i%5 == 0:
    #    run_episode(plot_env, agent_init_fn, policy=policy, return_details=True)
Example #15
import numpy as np

from agents import ConsensusAgent
from agents import DQNAgent
from agents import PermanentAgentStorage
import pagerank_method

# Environment setup:
n_agents = 10
n_opinions = 2

randomness = 0.2
n_tests = 50

env = ConsensusEnvironment(n_agents=n_agents,
                           n_opinions=n_opinions,
                           draw=False)
P = pagerank_method.pagerank_find_P(env)
agent_storage = PermanentAgentStorage(env,
                                      DQNAgent,
                                      k=0,
                                      randomness=randomness)

scores_DQN_online_decentralized = np.zeros(n_tests)
for i in range(n_tests):
    # Generate Data
    steps = run_episode(env,
                        agent_storage.get_next_agent,
                        use_extended_observation=True,
                        return_details=False)
    scores_DQN_online_decentralized[i] = steps
    print(steps)
print(np.mean(scores_DQN_online_decentralized))