def test(env_name, episodes, params, render):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    policy.load_state_dict(params)
    agent = Agent(policy)

    test_reward, test_len = 0, 0
    for ep in range(episodes):
        done = False
        observation = env.reset()
        while not done:
            # Similar to the training loop above -
            # get the action, act on the environment, save total reward
            # (evaluation=True makes the agent always return what it thinks to be
            # the best action - there is no exploration at this point)
            action, _ = agent.get_action(observation, evaluation=True)
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())

            if render:
                env.render()
            test_reward += reward
            test_len += 1
    print("Average test reward:", test_reward / episodes, "episode length:",
          test_len / episodes)
Example #2
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    # TODO: For CartPole-v0, override the default maximum episode length
    env._max_episode_steps = 1000

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(args.position, agent, env,
                                 args.train_episodes, False,
                                 args.render_training)

        # Save the model
        # Build a date-hour-minute timestamp from a single now() call
        tt = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
        model_file = "%s_params.mdl" % (args.env + tt + "vel")
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        # time and day of plot
        plt.savefig("train_history" + tt + "vel" + ".jpg")
        plt.show()

        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(args.position, agent, env, args.train_episodes, args.render_test)
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    env._max_episode_steps = args.episode_length

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(agent,
                                 env,
                                 args.train_episodes,
                                 False,
                                 args.render_training,
                                 x0=args.x0,
                                 args=args,
                                 policy=policy)

        # Save the model
        model_file = "%s_params.mdl" % args.env
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(agent, env, args.train_episodes, args.render_test, x0=args.x0)
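# --- Argument-parsing sketch (added; the original argparse setup is not shown) ---
# main() above reads these attributes from args; the types and defaults here are
# illustrative assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="CartPole-v0")
    parser.add_argument("--train_episodes", type=int, default=5000)
    parser.add_argument("--episode_length", type=int, default=200)
    parser.add_argument("--x0", type=float, default=None,
                        help="initial cart position passed through to train()/test()")
    parser.add_argument("--test", type=str, default=None,
                        help="path to a saved .mdl state dict; skips training")
    parser.add_argument("--render_training", action="store_true")
    parser.add_argument("--render_test", action="store_true")
    main(parser.parse_args())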
def trainer(fargs):
    trainer_id, args = fargs
    print("Trainer id", trainer_id, "started")
    # Create a Gym environment
    env = gym.make(args.env)

    # Set maximum episode length
    if args.episode_steps is not None:
        env._max_episode_steps = args.episode_steps

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    training_history = train(agent,
                             env,
                             args.train_episodes,
                             silent=True,
                             train_run_id=trainer_id,
                             early_stop=False)

    print("Trainer id", trainer_id, "finished")

    return training_history
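# --- Usage sketch (added): the (trainer_id, args) tuple argument suggests this
# worker is meant to be mapped over a process pool. The pool size, the source of
# `args`, and the concatenation of results below are assumptions.
if __name__ == "__main__":
    from multiprocessing import Pool

    import pandas as pd

    args = parse_args()  # hypothetical helper returning the parsed CLI arguments
    n_workers = 4
    with Pool(n_workers) as pool:
        histories = pool.map(trainer, [(i, args) for i in range(n_workers)])
    results = pd.concat(histories)  # assumes train() returns a pandas DataFrame per run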
Example #5
    def __init__(self):
        #Preparing envs
        self.envs = Envs()

        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)
        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)

        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()
Example #6
def main(load_path, num_episode):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_env = 1

    env_id = 'Breakout-v0'
    envs = [make_env(env_id) for _ in range(n_env)]
    envs = DummyVecEnv(envs)
    envs = VecToTensor(envs)

    policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    policy.load_state_dict(torch.load(load_path, map_location=device))
    policy.eval()

    for i in tqdm(range(num_episode)):
        obs = envs.reset()
        total_rewards = 0
        while True:
            action_logits, values = policy(obs)
            actions = choose_action(action_logits)

            next_obs, rewards, dones, info = envs.step(actions)
            total_rewards += rewards

            envs.render()

            if dones:
                break

        print('--------------------' + str(total_rewards.item()) +
              '-------------------')

    envs.close()
    # handle cli
    parser = argparse.ArgumentParser(description="evaluate a policy")
    parser.add_argument("policy_dir", type=str)
    parser.add_argument("env_name", type=str)
    parser.add_argument("--atari", action="store_true")
    parser.add_argument("--runs", type=int, default=10)
    parser.add_argument("--save", action="store_true")
    parser.add_argument("--save-to", type=str, default="example.gif", help="save as gif or mp4")
    parser.add_argument("--fps", type=int, default=24)
    parser.add_argument("--dpi", type=int, default=72)
    parser.add_argument("--repeat", type=int, default=3)

    args = parser.parse_args()

    # load policy
    p = Policy(args.policy_dir)

    # load env
    env = gym.make(args.env_name)
    if args.atari:
        env = AtariWrapper(env)

    # evalulate
    history, _ = eval(p, env, args.runs)
    statistics(history)

    if not args.save:
        exit(0)

    suffix = args.save_to.split(".")[-1]
    if suffix == "gif":
Example #8
import pong_utils
device = pong_utils.device
print("using device: ", device)

import gym
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())
# The actions 'RIGHTFIRE' (4) and 'LEFTFIRE' (5) restart the game when an episode is done

import matplotlib.pyplot as plt

from agent import Policy
agent = Policy()
agent = agent.to(device)

pong_utils.play(env, agent, time=100)

envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs,
                                                              agent,
                                                              tmax=100)
Example #9
class Trainer:
    def __init__(self):
        #Preparing envs
        self.envs = Envs()

        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)
        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)

        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()

    def start(self):
        self.total_numsteps = 0

        if settings.mode == "train":
            self.add_random_steps()

            names = torch.FloatTensor(
                [i for i, _ in enumerate(settings.env_names)]).to(self.device)
            while self.total_numsteps < p.max_numsteps:
                self.run_test()
                leg_starts, states = self.envs.reset()
                for step in range(p._max_episode_steps):
                    self.total_numsteps += 1
                    actions = self.select_action(leg_starts, states, names)
                    next_states, rewards, dones = self.envs.step(actions)
                    self.memory.push(names, leg_starts, states, next_states,
                                     actions, rewards, dones)
                    states = self.envs.reset_dones(next_states, dones)

                    c1_loss, c2_loss, policy_loss = self.update_nets()

                    if (self.total_numsteps % 10) == 0:
                        self.logger.show_update(self.total_numsteps)

            torch.save(self.policy.state_dict(),
                       "policy_seed_{}".format(settings.seed))

        else:
            print("Seed: {}".format(settings.seed))
            self.run_test()

    def run_test(self):
        if settings.mode == "test":
            print("\nTesting current policy")
        leg_starts, states = self.envs.reset()
        done_filter = torch.FloatTensor([1.0] * len(settings.env_names)).to(
            self.device)
        epsd_rewards = torch.FloatTensor([0.0] * len(settings.env_names)).to(
            self.device)
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        for step in range(p._max_episode_steps):
            actions = self.select_action(leg_starts,
                                         states,
                                         names,
                                         evaluate=True)
            next_states, rewards, dones = self.envs.step(actions)
            epsd_rewards += done_filter * rewards
            done_filter *= (dones != 1).float()
            states = next_states

        self.logger.add_rewards(len(names), epsd_rewards, self.total_numsteps)
        self.logger.save()

    def add_random_steps(self):
        print("Adding random steps")
        leg_starts, states = self.envs.reset()
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        while len(self.memory) <= p.batch_size * 10:
            actions = self.envs.sample_actions()
            next_states, rewards, dones = self.envs.step(actions)
            self.memory.push(names, leg_starts, states, next_states, actions,
                             rewards, dones)
            states = self.envs.reset_dones(next_states, dones)

    def select_action(self, leg_starts, states, names, evaluate=False):
        with torch.no_grad():

            if not evaluate:
                actions, _, _ = self.policy.sample(leg_starts, states, names)
            else:
                _, _, actions = self.policy.sample(leg_starts, states, names)

            return actions.cpu()

    def parameter_update(self, tau=p.tau):
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_nets(self):
        (names_batch, leg_starts_batch, state_batch, action_batch, reward_batch,
         next_state_batch, mask_batch) = self.memory.sample()

        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                leg_starts_batch, next_state_batch, names_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                leg_starts_batch, next_state_batch, next_state_action,
                names_batch)
            min_qf_next_target = torch.min(
                qf1_next_target, qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(leg_starts_batch, state_batch, action_batch,
                               names_batch)

        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(leg_starts_batch, state_batch,
                                           names_batch)
        qf1_pi, qf2_pi = self.critic(leg_starts_batch, state_batch, pi,
                                     names_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        self.parameter_update()

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()
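# --- Usage sketch (added): the class above is driven entirely by the module-level
# `settings` object, so a minimal driver is just:
if __name__ == "__main__":
    trainer = Trainer()
    trainer.start()  # trains when settings.mode == "train", otherwise runs a single test pass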
Example #10
def train(episodes, player, opponent):

    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())
    #Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    #Updates
    update_counter = 0

    #Memory Initialisation
    # take random actions to fill the memory
    memory = Memory(memory_size, batch_size)
    for i in range(memory_size):
        if (i == 0):
            obs = env.reset()
            state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        action1 = random.randint(0, 3)
        action2 = random.randint(0, 3)
        next_obs, rewards, done, info = env.step((action1, action2))
        next_state, stacked_frames = stack_frame(stacked_frames, next_obs[0])
        memory.store((state, action1, rewards[0], next_state, done))
        state = next_state

    player.reset_score()
    opponent.reset_score()
    '''
    Training
    '''
    # epsilon and decay are assumed to be module-level hyperparameters; declare
    # epsilon global so the decay update at the end of each episode does not
    # shadow it as a local (which would raise UnboundLocalError on first use).
    global epsilon

    for i in range(0, episodes):
        done = False
        obs = env.reset()
        state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        timesteps = 0
        reward_sum = 0

        while not done:
            action1 = player.get_action(state, epsilon)
            action2 = opponent.get_action()

            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames,
                                                     next_obs[0])

            memory.store((state, action1, rewards[0], next_state, done))
            reward_sum += rewards[0]

            obs = next_obs
            state = next_state

            env.render()

            #Updating policy
            #Loading from memory
            samples = memory.sample()
            batch_states = np.asarray([x[0] for x in samples])
            batch_actions = np.asarray([x[1] for x in samples])
            batch_rewards = np.asarray([x[2] for x in samples])
            batch_next_states = np.asarray([x[3] for x in samples])
            batch_done = np.asarray([x[4] for x in samples])

            #Target network
            batch = torch.from_numpy(batch_next_states.squeeze()).float().to(
                player.train_device)
            batch_t_q_values = target_dqn.forward(batch)

            #Q Learning
            batch_t_q_max, _ = batch_t_q_values.max(dim=1)
            y = torch.empty(batch_size, 1)
            batch_rewards = torch.from_numpy(batch_rewards).float().to(
                player.train_device)

            for j in range(batch_size):
                #.any() ?
                if batch_done[j].any():
                    y[j] = batch_rewards[j]
                else:
                    y[j] = batch_rewards[j] + batch_t_q_max[j].mul(gamma)
            # detach() returns a new tensor; reassign so no gradient flows through the targets
            y = y.detach()

            #Gradient_descent
            batch_q_values = policy.forward(
                torch.from_numpy(batch_states.squeeze()).float().to(
                    player.train_device))
            loss = torch.mean(y.sub(batch_q_values)**2)
            loss.backward()

            player.update_policy()

            update_counter += 1
            if (update_counter % update_step == 0):
                target_dqn.load_state_dict(policy.state_dict())
            timesteps += 1

        epsilon = epsilon * decay
        print(
            "Episode {} finished. Total reward: {:.3g} ({} timesteps)".format(
                i, reward_sum, timesteps))
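# --- Illustration (added): train() above relies on player.get_action(state, epsilon)
# implementing an epsilon-greedy choice over the policy's Q-values. A standalone
# sketch of that rule (the 4-action count follows the snippet's random.randint(0, 3)
# calls; the snippet's random/numpy/torch imports are assumed):
def epsilon_greedy_action(q_network, state, epsilon, device, n_actions=4):
    # Explore with probability epsilon, otherwise act greedily w.r.t. the Q-values
    if random.random() < epsilon:
        return random.randint(0, n_actions - 1)
    state_t = torch.from_numpy(np.asarray(state).squeeze()).float().to(device)
    q_values = q_network.forward(state_t)
    return int(torch.argmax(q_values).item())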
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:
            # Get action from the agent
            action, action_probabilities = agent.get_action(observation)
            previous_observation = observation

            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(action.detach().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".
                  format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

        # Let the agent do its magic (update the policy)
        agent.episode_finished(episode_number)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.savefig("plots/task-2b.png")
        plt.show()
        print("Training finished.")
    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
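# --- Usage sketch (added): train() returns a per-episode DataFrame tagged with
# train_run_id, so several runs can be concatenated and compared with seaborn.
# The environment id is a placeholder; it must expose a continuous (Box) action
# space because the code above reads action_space.shape[-1].
if __name__ == "__main__":
    import seaborn as sns

    results = pd.concat([
        train("ContinuousCartPole-v0", print_things=False, train_run_id=run)
        for run in range(3)
    ])
    sns.lineplot(x="episode", y="reward", hue="train_run_id", data=results)
    plt.show()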
Example #12
    lr = 0.001
    alpha = 0.99
    epsilon = 1e-05

    env_id = 'Breakout-v0'
    envs = [make_env(env_id) for _ in range(n_env)]
    #    envs = DummyVecEnv(envs)
    #    envs = SubprocVecEnv(envs)
    envs = ShmemVecEnv(envs)
    envs = VecToTensor(envs)

    date = datetime.now().strftime('%m_%d_%H_%M')
    mon_file_name = "./tmp/" + date
    envs = VecMonitor(envs, mon_file_name)

    train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy.load_state_dict(train_policy.state_dict())
    step_policy.eval()

    runner = Runner(envs, step_policy, n_step, gamma)

    optimizer = optim.RMSprop(train_policy.parameters(),
                              lr=lr,
                              alpha=alpha,
                              eps=epsilon)

    for i in tqdm(range(num_updates)):
        mb_obs, mb_rewards, mb_values, mb_actions = runner.run()

        action_logits, values = train_policy(mb_obs)
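        # --- Added sketch of how the A2C update would typically continue from here
        # (the snippet is truncated). The return computation inside Runner, the
        # tensor shapes, the 0.5 value-loss weight, and a module-level `import torch`
        # are assumptions.
        log_probs = torch.log_softmax(action_logits, dim=-1)
        action_log_probs = log_probs.gather(1, mb_actions.long().unsqueeze(1)).squeeze(1)
        advantages = mb_rewards - values.squeeze(-1)  # bootstrapped returns minus the value baseline
        policy_loss = -(advantages.detach() * action_log_probs).mean()
        value_loss = advantages.pow(2).mean()
        loss = policy_loss + 0.5 * value_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Keep the acting copy in sync with the trained weights
        step_policy.load_state_dict(train_policy.state_dict())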
Example #13
'''
Initialisation
'''

env = Pong(headless=args.headless)

#Players
player_id = 1
opponent_id = 3 - player_id
opponent = PongAi(env, opponent_id)

#Model
action_space_dim = 3
observation_space_dim = 4

#Classes
policy = Policy(observation_space_dim, action_space_dim)
player = Agent(env, policy, player_id)

env.set_names(player.get_name(), opponent.get_name())


def train(episodes, player, opponent):

    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())
    #Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    #Updates
    update_counter = 0
def train(env_name,
          print_things=True,
          train_run_id=0,
          train_timesteps=200000,
          update_steps=50):
    # Create a Gym environment
    # This creates 64 parallel envs running in 8 processes (8 envs each)
    env = ParallelEnvs(env_name, processes=8, envs_per_process=8)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    # Reset the environment and observe the initial state
    observation = env.reset()

    # Loop forever
    for timestep in range(train_timesteps):
        # Get action from the agent
        action, action_probabilities = agent.get_action(observation)
        previous_observation = observation

        # Perform the action on the environment, get new state and reward
        observation, reward, done, info = env.step(action.detach().numpy())

        for i in range(len(info["infos"])):
            env_done = False
            # Check if the environment is finished; if so, store cumulative reward
            for envid, envreward in info["finished"]:
                if envid == i:
                    reward_history.append(envreward)
                    average_reward_history.append(
                        np.mean(reward_history[-500:]))
                    env_done = True
                    break
            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation[i], observation[i],
                                action_probabilities[i], reward[i], env_done)

        if timestep % update_steps == update_steps - 1:
            print(f"Update @ step {timestep}")
            agent.update_policy(0)

        plot_freq = 1000
        if timestep % plot_freq == plot_freq - 1:
            # Training is finished - plot rewards
            plt.plot(reward_history)
            plt.plot(average_reward_history)
            plt.legend(["Reward", "500-episode average"])
            plt.title("AC reward history (non-episodic, parallel)")
            plt.savefig("rewards_%s.png" % env_name)
            plt.clf()
            torch.save(agent.policy.state_dict(), "model.mdl")
            print("%d: Plot and model saved." % timestep)
    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["Nonepisodic parallel AC"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
parser.add_argument('--render', action='store_true')
args = parser.parse_args()

env = Pong(headless=args.headless)

player_id = 1
opponent_id = 3 - player_id
action_space = 1

UP_ACTION = 1
DOWN_ACTION = 2

episode_n = 1500

# Policy declaration
policy1 = Policy(args.hidden_layer_size, action_space)
policy2 = Policy(args.hidden_layer_size, action_space)

# Agent declaration
#opponent = Agent(env,  policy1, args.learning_rate, args.discount_factor, opponent_id, 'player 2')
opponent = PongAi(env, opponent_id)
player = Agent(env, policy2, args.learning_rate, args.discount_factor,
               player_id, 'player 1')
player.load_checkpoint('checkpoint-single-2/checkpoints-player-1500.pth')
env.set_names(player.get_name(), opponent.get_name())

# action1 = player.get_action()
# action2 = opponent.get_action()
# (ob1, ob2), (rew1, rew2), done, info = env.step((action1, action2))

import torch.optim as optim
import torch
import gym
from agent import Policy
from collections import deque
import numpy as np

env = gym.make('CartPole-v1')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

n_episodes = 2000
max_t = 1000
gamma = 1.0
print_every = 100

scores_deque = deque(maxlen=100)
scores = []

for i_episode in range(1, n_episodes+1):
    saved_log_probs = []
    rewards = []
    state = env.reset()
    for t in range(max_t):
        action, log_prob = policy.act(state)
        saved_log_probs.append(log_prob)
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
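            break
    # --- Added sketch (not the original code): the snippet is truncated here; the
    # standard REINFORCE update for this setup would continue roughly as follows.
    scores_deque.append(sum(rewards))
    scores.append(sum(rewards))

    # Discounted return of the whole episode
    discounts = [gamma ** k for k in range(len(rewards))]
    R = sum(d * r for d, r in zip(discounts, rewards))

    # REINFORCE loss: sum over the episode of -log pi(a_t|s_t) * R
    policy_loss = sum(-log_prob * R for log_prob in saved_log_probs)

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    if i_episode % print_every == 0:
        print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))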
Example #17
    def __init__(self,
                 gamma,
                 optimizer,
                 modelName,
                 logDir,
                 lr,
                 TAU,
                 ALPHA=0.2,
                 stepToCkpt=50000):

        self.gamma = gamma
        self.opt = optimizer
        self.modelName = modelName
        self.logDir = logDir
        self.sumDir = os.path.join(self.logDir, self.modelName + "_summary")
        self.ckptDir = os.path.join(self.logDir, self.modelName + "_ckpt")
        self.lr = lr
        self.TAU = TAU
        self.ALPHA = ALPHA
        ### implementation of polyak averaging
        # self.avg = tf.train.ExponentialMovingAverage(decay = self.TAU)
        OPTIMIZER = {
            "sgd": tf.keras.optimizers.SGD(learning_rate=self.lr),
            "Adam": tf.keras.optimizers.Adam(learning_rate=self.lr),
            "rmsProp": tf.keras.optimizers.RMSprop(learning_rate=self.lr),
            "adaGrad": tf.keras.optimizers.Adagrad(learning_rate=self.lr)
        }
        LOSS = {
            "huber": tf.keras.losses.Huber,
            "mse": tf.keras.losses.MSE,
        }

        self.polOpt = OPTIMIZER[self.opt]
        self.qOpt = OPTIMIZER[self.opt]
        self.vOpt = tfa.optimizers.MovingAverage(OPTIMIZER[self.opt],
                                                 average_decay=self.TAU)
        # self.vOpt = OPTIMIZER[optimizer]

        ####### network definition #############
        self.policy = Policy()
        self.QSample2 = QValFn()
        self.QSample1 = QValFn()  # https://spinningup.openai.com/en/latest/algorithms/sac.html
        self.ValueFn = ValFn()

        ############ summary Writer#############
        self.summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(self.sumDir, 'logs/'), flush_millis=10000)
        # self.ckpt_writer = tf.compat.v2.summary.create_file_writer(os.path.join(self.sumDir,'ckpt/'), flush_millis=10000)
        self.summary_writer.set_as_default()
        self.global_step = tf.compat.v1.train.get_or_create_global_step()

        ##### checkpoint writer ###########
        self.ckpt = tf.train.Checkpoint(policy=self.policy.finalModel,
                                        q1=self.QSample1.finalModel,
                                        q2=self.QSample2.finalModel,
                                        value=self.ValueFn.finalModel,
                                        policyOpt=self.polOpt,
                                        qOpt=self.qOpt,
                                        vOpt=self.vOpt)

        self.pathCkpt = os.path.join(self.sumDir, 'ckpt')
        self.ckptManager = tf.train.CheckpointManager(self.ckpt,
                                                      self.pathCkpt,
                                                      max_to_keep=3)

        self.stepToCkpt = stepToCkpt

        ###### loading the latest checkpoint for the training purpose ##########
        print(self.pathCkpt)
        self.ckpt.restore(self.ckptManager.latest_checkpoint)
        if self.ckptManager.latest_checkpoint:
            print("Restored from {}".format(
                self.ckptManager.latest_checkpoint))
        else:
            print("Initializing from scratch.")
Example #18
class SAC:
    def __init__(self,
                 gamma,
                 optimizer,
                 modelName,
                 logDir,
                 lr,
                 TAU,
                 ALPHA=0.2,
                 stepToCkpt=50000):

        self.gamma = gamma
        self.opt = optimizer
        self.modelName = modelName
        self.logDir = logDir
        self.sumDir = os.path.join(self.logDir, self.modelName + "_summary")
        self.ckptDir = os.path.join(self.logDir, self.modelName + "_ckpt")
        self.lr = lr
        self.TAU = TAU
        self.ALPHA = ALPHA
        ### implementation of polyak averaging
        # self.avg = tf.train.ExponentialMovingAverage(decay = self.TAU)
        OPTIMIZER = {
            "sgd": tf.keras.optimizers.SGD(learning_rate=self.lr),
            "Adam": tf.keras.optimizers.Adam(learning_rate=self.lr),
            "rmsProp": tf.keras.optimizers.RMSprop(learning_rate=self.lr),
            "adaGrad": tf.keras.optimizers.Adagrad(learning_rate=self.lr)
        }
        LOSS = {
            "huber": tf.keras.losses.Huber,
            "mse": tf.keras.losses.MSE,
        }

        self.polOpt = OPTIMIZER[self.opt]
        self.qOpt = OPTIMIZER[self.opt]
        self.vOpt = tfa.optimizers.MovingAverage(OPTIMIZER[self.opt],
                                                 average_decay=self.TAU)
        # self.vOpt = OPTIMIZER[optimizer]

        ####### network definition #############
        self.policy = Policy()
        self.QSample2 = QValFn()
        self.QSample1 = QValFn()  # https://spinningup.openai.com/en/latest/algorithms/sac.html
        self.ValueFn = ValFn()

        ############ summary Writer#############
        self.summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(self.sumDir, 'logs/'), flush_millis=10000)
        # self.ckpt_writer = tf.compat.v2.summary.create_file_writer(os.path.join(self.sumDir,'ckpt/'), flush_millis=10000)
        self.summary_writer.set_as_default()
        self.global_step = tf.compat.v1.train.get_or_create_global_step()

        ##### checkpoint writer ###########
        self.ckpt = tf.train.Checkpoint(policy=self.policy.finalModel,
                                        q1=self.QSample1.finalModel,
                                        q2=self.QSample2.finalModel,
                                        value=self.ValueFn.finalModel,
                                        policyOpt=self.polOpt,
                                        qOpt=self.qOpt,
                                        vOpt=self.vOpt)

        self.pathCkpt = os.path.join(self.sumDir, 'ckpt')
        self.ckptManager = tf.train.CheckpointManager(self.ckpt,
                                                      self.pathCkpt,
                                                      max_to_keep=3)

        self.stepToCkpt = stepToCkpt

        ###### loading the latest checkpoint for the training purpose ##########
        print(self.pathCkpt)
        self.ckpt.restore(self.ckptManager.latest_checkpoint)
        if self.ckptManager.latest_checkpoint:
            print("Restored from {}".format(
                self.ckptManager.latest_checkpoint))
        else:
            print("Initializing from scratch.")

    def policyLoss(self, currentState):
        ## CHECKED
        ## define loss function for policy function
        ## TODO : FORMULATION DOESN'T MATCH THE PAPER
        _, _, action, mean, sqrtStd, gauss, rewardAction = self.policy.samplePolicy(
            currentState,
            training=True)  ## TODO : CHECK THE ORDER FROM POLICY NETWORK
        logPolicy = tf.stop_gradient(
            self.policy.lgOfPolicy(
                mean, sqrtStd,
                gauss))  ## TODO : check implementation here also
        qVal = self.QSample1.QvalForward(currentState, action, training=False)
        policyLossOp = tf.reduce_mean(tf.abs(self.ALPHA * logPolicy - qVal))
        return policyLossOp

    def qValLoss(self, Qnetwork, currentState, action, reward, nextState,
                 DONE):

        ## CHECKED

        #### NOTE function calculation depend on two key state and action  make sure they are consistent####
        # part of TODO ^^^
        ## define loss function for Q value
        #TODO : define data structure for current state next state and reward
        #UPDATE : added few changes in Policy return to be consistent with value and Q function

        vValNext = self.ValueFn.ValFnForward(nextState, training=False)
        qVal = Qnetwork.QvalForward(
            currentState, action,
            training=True)  ## stochastic sampling of state
        qTarget = reward + self.gamma * (
            1 -
            DONE) * vValNext  ## Question why not use QTarget instead of QVal

        loss = tf.reduce_mean(
            tf.pow((qVal - qTarget), 2)
        )  ## loss is explicitly defined for q based gradient  not for value function
        return loss

    def vValLoss(self, currentState):
        ### Checked
        ## define loss function for value function
        #### https://spinningup.openai.com/en/latest/algorithms/sac.html
        value = self.ValueFn.ValFnForward(currentState, training=True)
        _, _, action, mean, sqrtStd, gauss, rewardAction = self.policy.samplePolicy(
            currentState, training=False)  ## TODO : check the order
        qVal1 = self.QSample1.QvalForward(currentState, action, training=False)
        qVal2 = self.QSample2.QvalForward(currentState, action, training=False)
        qVal = tf.math.minimum(qVal1, qVal2)
        logPolicy = tf.stop_gradient(
            self.policy.lgOfPolicy(mean, sqrtStd, gauss))
        softValue = tf.reduce_sum(qVal - self.ALPHA * logPolicy)
        ##TODO : POLYAK averaging
        return tf.reduce_mean(tf.pow((value - softValue), 2))

    def softUpdate(self, locModel, tagModel):
        """
		soft update the model parameters.
		theta_target = tau*theta_local + (1-tau)theta_target
		"""
        #TODO : check if its working or not

        for targetParam, localParam in zip(tagModel.trainable_variables,
                                           locModel.trainable_variables):
            print("old :", targetParam)
            targetParam.assign(self.TAU * targetParam +
                               (1 - self.TAU) * localParam)
            print(print("new :", targetParam))

        return

    def loggingQLoss(self, loss, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('qvalue_loss', tf.math.log(loss))

    def loggingVLoss(self, loss, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('Vvalue_loss', tf.math.log(loss))

    def loggingPLoss(self, loss, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('policy_loss', tf.math.log(loss))

    def loggingReward(self, reward, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('reward', reward)

    def train(self, epState, batchState, batchReward, batchAction,
              batchNextState, DONE):
        # print(self.policy.finalModel.summary())
        # input()

        ## training the model
        ## ttrick to fit model on smaller GPU

        if ((epState + 1) % self.stepToCkpt == 0):
            self.saveModel(epState + 1)
        else:
            pass

        if (epState % 3 == 0):
            ############# ask GSR : better way of regularization ################
            # with tf.device('/device:GPU:1'):
            with tf.GradientTape() as Ptape:
                lossPolicy = self.policyLoss(batchState)
                #TODO : modify the policy model
                policyGradient = Ptape.gradient(
                    lossPolicy, self.policy.finalModel.trainable_variables)
                self.polOpt.apply_gradients(
                    zip(policyGradient,
                        self.policy.finalModel.trainable_variables))
                self.loggingPLoss(lossPolicy, epState // 3)

                ### assigning one step for each of the step in policy
                return lossPolicy, 0.0, 0.0

        elif (epState % 3 == 1):
            # with tf.device('/device:GPU:1'):
            with tf.GradientTape() as Qtape:
                countQNet = epState // 2

                if countQNet % 2 == 0:
                    Qnetwork = self.QSample1
                    strQ = 1
                else:
                    Qnetwork = self.QSample2
                    strQ = 2
                print("qOpt : ", strQ, countQNet)
                lossQValue = self.qValLoss(Qnetwork, batchState, batchAction,
                                           batchReward, batchNextState, DONE)
                QGradient = Qtape.gradient(
                    lossQValue, Qnetwork.finalModel.trainable_variables)
                self.qOpt.apply_gradients(
                    zip(QGradient, Qnetwork.finalModel.trainable_variables))
                self.loggingQLoss(lossQValue, epState // 3)
                return 0.0, lossQValue, 0.0
        else:
            print("vVal")
            # with tf.device('/device:CPU:0'):
            with tf.GradientTape() as ValueTape:
                lossVvalue = self.vValLoss(batchState)
                ValGradient = ValueTape.gradient(
                    lossVvalue, self.ValueFn.finalModel.trainable_variables)
                self.loggingVLoss(lossVvalue, epState // 3)
                self.vOpt.apply_gradients(
                    zip(ValGradient,
                        self.ValueFn.finalModel.trainable_variables))
                return 0.0, 0.0, lossVvalue

    def restoreCkpt(self):
        ## restor the ckpt
        self.ckpt.restore(tf.train.latest_checkpoint(self.pathCkpt))

    def saveModel(self, epState):
        ## function to save the checkpoint
        save_path = self.ckptManager.save(checkpoint_number=epState + 1)
        print("Saved checkpoint for step {}: {}".format(
            epState + 1, save_path))
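# --- Usage sketch (added): train() interleaves the three updates by step index
# (policy when epState % 3 == 0, one of the Q networks when == 1, the value
# network otherwise). A driver loop would therefore look roughly like this; the
# replay buffer and its sample() signature are assumptions.
def train_loop(sac, replay_buffer, total_steps, batch_size=256):
    for step in range(total_steps):
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
        # train() expects (step, states, rewards, actions, next_states, dones)
        p_loss, q_loss, v_loss = sac.train(step, states, rewards, actions, next_states, dones)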
Example #19
import argparse
from agent import Policy, Agent

parser = argparse.ArgumentParser()
parser.add_argument("--headless",
                    action="store_true",
                    help="Run in headless mode")
args = parser.parse_args()

env = Pong(headless=args.headless)

UP_ACTION = 1
DOWN_ACTION = 2

# Policy declaration
policy1 = Policy(500, 1)
policy2 = Policy(500, 1)

# Agent declaration
opponent = Agent(env, policy2, 0.0005, 0.99, 2, 'player 1')
opponent = PongAi(env, 2)
player = Agent(env, policy1, 0.0005, 0.99, 1, 'player 2')

# print(player.policy.state_dict())
player.load_checkpoint('checkpoint-single-1/checkpoints-player-18000.pth')
#player.load_checkpoint('checkpoints-3/checkpoints-player-9500.pth')
#opponent.load_checkpoint('checkpoints-3/checkpoints-opponent-9500.pth')
# player.policy.state_dict()


def plot(observation):
Example #20
# Load World
# ----------

LEFT = 5
RIGHT = 4
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

# Load Agent
# ----------

from agent import Policy
import torch.optim as optim

agent = Policy().to(device)
optimizer = optim.Adam(agent.parameters(), lr=1e-4)

# Load Parallel Environment
# -------------------------

from pong_utils import parallelEnv, preprocess_batch
envs = parallelEnv('PongDeterministic-v4', n=4, seed=12345)


#
def collect_trajectories(envs, agent, tmax=200, nrand=5):
    '''
    Collect trajectories of multiple agents of a parallelized environment
    '''
    n = len(envs.ps)