def trainer(fargs):
    trainer_id, args = fargs
    print("Trainer id", trainer_id, "started")
    # Create a Gym environment
    env = gym.make(args.env)

    # Set maximum episode length
    if args.episode_steps is not None:
        env._max_episode_steps = args.episode_steps

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    training_history = train(agent,
                             env,
                             args.train_episodes,
                             silent=True,
                             train_run_id=trainer_id,
                             early_stop=False)

    print("Trainer id", trainer_id, "finished")

    return training_history
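trainer takes its arguments as a single (trainer_id, args) tuple, which suggests it is dispatched over a process pool. A hedged sketch of such a dispatch; the helper below is an assumption, not part of the original script:

from multiprocessing import Pool

def run_parallel_training(args, num_trainers=4):
    # Fan out independent training runs; each worker receives (trainer_id, args).
    with Pool(processes=num_trainers) as pool:
        histories = pool.map(trainer, [(i, args) for i in range(num_trainers)])
    return histories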
def test(env_name, episodes, params, render):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    policy.load_state_dict(params)
    agent = Agent(policy)

    test_reward, test_len = 0, 0
    for ep in range(episodes):
        done = False
        observation = env.reset()
        while not done:
            # Similar to the training loop above -
            # get the action, act on the environment, save total reward
            # (evaluation=True makes the agent always return what it thinks to be
            # the best action - there is no exploration at this point)
            action, _ = agent.get_action(observation, evaluation=True)
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())

            if render:
                env.render()
            test_reward += reward
            test_len += 1
    print("Average test reward:", test_reward / episodes, "episode length:",
          test_len / episodes)
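A possible way to invoke the test function above, assuming a parameter file saved by the matching training script (the environment and file name are illustrative):

import torch

if __name__ == "__main__":
    # Illustrative checkpoint name; replace with the file produced by training.
    params = torch.load("InvertedPendulum-v2_params.mdl")
    test("InvertedPendulum-v2", episodes=10, params=params, render=False)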
Example No. 3
def main(load_path, num_episode):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_env = 1

    env_id = 'Breakout-v0'
    envs = [make_env(env_id) for _ in range(n_env)]
    envs = DummyVecEnv(envs)
    envs = VecToTensor(envs)

    policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    policy.load_state_dict(torch.load(load_path, map_location=device))
    policy.eval()

    for i in tqdm(range(num_episode)):
        obs = envs.reset()
        total_rewards = 0
        while True:
            action_logits, values = policy(obs)
            actions = choose_action(action_logits)

            next_obs, rewards, dones, info = envs.step(actions)
            total_rewards += rewards

            envs.render()

            if dones:
                break

        print('--------------------' + str(total_rewards.item()) +
              '-------------------')

    envs.close()
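choose_action is referenced above but not defined in this example; a minimal sketch consistent with how it is used (sampling an action index from the policy's logits), offered as an assumption rather than the repository's helper:

import torch

def choose_action(action_logits):
    # Sample one action per environment from the categorical distribution
    # defined by the policy's unnormalized logits.
    dist = torch.distributions.Categorical(logits=action_logits)
    return dist.sample()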
Example No. 4
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    # TODO: Set the maximum episode length for CartPole-v0
    env._max_episode_steps = 1000

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print environment and agent information
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(args.position, agent, env,
                                 args.train_episodes, False,
                                 args.render_training)

        # Save the model
        tt = str(datetime.datetime.now().date()) + "-" + str(
            datetime.datetime.now().hour) + "-" + str(
                datetime.datetime.now().minute)
        model_file = "%s_params.mdl" % (args.env + tt + "vel")
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        # Include the date and time in the plot filename
        plt.savefig("train_history" + tt + "vel" + ".jpg")
        plt.show()

        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(args.position, agent, env, args.train_episodes, args.render_test)
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    env._max_episode_steps = args.episode_length

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print environment and agent information
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(agent,
                                 env,
                                 args.train_episodes,
                                 False,
                                 args.render_training,
                                 x0=args.x0,
                                 args=args,
                                 policy=policy)

        # Save the model
        model_file = "%s_params.mdl" % args.env
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(agent, env, args.train_episodes, args.render_test, x0=args.x0)
Example No. 6
    def __init__(self):
        #Preparing envs
        self.envs = Envs()

        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)
        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)

        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()
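parameter_update is called above with tau=1.0 but is not shown; below is a minimal sketch of the usual soft (Polyak) update between the critic and its target, which a tau of 1.0 turns into a hard copy. This is an assumption, not the original method:

    def parameter_update(self, tau):
        # Soft-update sketch: blend online critic weights into the target critic.
        # With tau=1.0 this is a hard copy, matching the call in __init__ above.
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)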
Example No. 7
    # handle cli
    parser = argparse.ArgumentParser(description="evaluate a policy")
    parser.add_argument("policy_dir", type=str)
    parser.add_argument("env_name", type=str)
    parser.add_argument("--atari", action="store_true")
    parser.add_argument("--runs", type=int, default=10)
    parser.add_argument("--save", action="store_true")
    parser.add_argument("--save-to", type=str, default="example.gif", help="save as gif or mp4")
    parser.add_argument("--fps", type=int, default=24)
    parser.add_argument("--dpi", type=int, default=72)
    parser.add_argument("--repeat", type=int, default=3)

    args = parser.parse_args()

    # load policy
    p = Policy(args.policy_dir)

    # load env
    env = gym.make(args.env_name)
    if args.atari:
        env = AtariWrapper(env)

    # evaluate
    history, _ = eval(p, env, args.runs)
    statistics(history)

    if not args.save:
        exit(0)

    suffix = args.save_to.split(".")[-1]
    if suffix == "gif":
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:
            # Get action from the agent
            action, action_probabilities = agent.get_action(observation)
            previous_observation = observation

            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(action.detach().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".
                  format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

        # Let the agent do its magic (update the policy)
        agent.episode_finished(episode_number)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.savefig("plots/task-2b.png")
        plt.show()
        print("Training finished.")
    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
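The Agent and Policy classes are not shown in this example; agent.episode_finished presumably performs a REINFORCE-style policy-gradient update. A minimal sketch of the discounted-return computation such an update typically relies on, as an illustration rather than the original agent.py:

import torch

def discount_rewards(rewards, gamma=0.99):
    # Compute discounted returns G_t = r_t + gamma * G_{t+1}, back to front.
    returns = torch.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns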
Example No. 9
import pong_utils
device = pong_utils.device
print("using device: ", device)

import gym
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())
# The actions 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5 make the game restart when an episode is done

import matplotlib.pyplot as plt

from agent import Policy
agent = Policy()
agent = agent.to(device)

pong_utils.play(env, agent, time=100)

envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs,
                                                              agent,
                                                              tmax=100)
Example No. 10
Initialisation
'''

env = Pong(headless=args.headless)

#Players
player_id = 1
opponent_id = 3 - player_id
opponent = PongAi(env, opponent_id)

#Model
action_space_dim = 3
observation_space_dim = 4

#Classes
policy = Policy(observation_space_dim, action_space_dim)
player = Agent(env, policy, player_id)

env.set_names(player.get_name(), opponent.get_name())


def train(episodes, player, opponent):

    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())
    #Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    #Updates
    update_counter = 0
Example No. 11
def train(episodes, player, opponent):

    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())
    #Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    #Updates
    update_counter = 0

    #Memory Initialisation
    # take random actions to fill the memory
    memory = Memory(memory_size, batch_size)
    for i in range(memory_size):
        if (i == 0):
            obs = env.reset()
            state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        action1 = random.randint(0, 3)
        action2 = random.randint(0, 3)
        next_obs, rewards, done, info = env.step((action1, action2))
        next_state, stacked_frames = stack_frame(stacked_frames, next_obs[0])
        memory.store((state, action1, rewards[0], next_state, done))
        state = next_state

    player.reset_score()
    opponent.reset_score()
    '''
    Training
    '''

    for i in range(0, episodes):
        done = False
        obs = env.reset()
        state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        timesteps = 0
        reward_sum = 0

        while not done:
            action1 = player.get_action(state, epsilon)
            action2 = opponent.get_action()

            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames,
                                                     next_obs[0])

            memory.store((state, action1, rewards[0], next_state, done))
            reward_sum += rewards[0]

            obs = next_obs
            state = next_state

            env.render()

            #Updating policy
            #Loading from memory
            samples = memory.sample()
            batch_states = np.asarray([x[0] for x in samples])
            batch_actions = np.asarray([x[1] for x in samples])
            batch_rewards = np.asarray([x[2] for x in samples])
            batch_next_states = np.asarray([x[3] for x in samples])
            batch_done = np.asarray([x[4] for x in samples])

            #Target network
            batch = torch.from_numpy(batch_next_states.squeeze()).float().to(
                player.train_device)
            batch_t_q_values = target_dqn.forward(batch)

            #Q Learning
            batch_t_q_max, _ = batch_t_q_values.max(dim=1)
            y = torch.empty(batch_size, 1)
            batch_rewards = torch.from_numpy(batch_rewards).float().to(
                player.train_device)

            for j in range(batch_size):
                # .any() handles done flags stored as a per-environment array
                if batch_done[j].any():
                    y[j] = batch_rewards[j]
                else:
                    y[j] = batch_rewards[j] + batch_t_q_max[j].mul(gamma)
            y = y.detach()  # detach targets so gradients only flow through the online network

            # Gradient descent on the Q-values of the actions actually taken
            batch_q_all = policy.forward(
                torch.from_numpy(batch_states.squeeze()).float().to(
                    player.train_device))
            batch_actions_t = torch.from_numpy(batch_actions).long().to(
                player.train_device).unsqueeze(1)
            batch_q_values = batch_q_all.gather(1, batch_actions_t)
            loss = torch.mean((y.to(player.train_device) - batch_q_values)**2)
            loss.backward()

            player.update_policy()

            update_counter += 1
            if (update_counter % update_step == 0):
                target_dqn.load_state_dict(policy.state_dict())
            timesteps += 1

        epsilon = epsilon * decay
        print(
            "Episode {} finished. Total reward: {:.3g} ({} timesteps)".format(
                i, reward_sum, timesteps))
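stack_frame is used throughout this example but not defined here; a plausible sketch of a frame-stacking helper follows, where the preprocessing and shapes are assumptions about the missing code:

import numpy as np
from collections import deque

def stack_frame(stacked_frames, frame, new_episode=False):
    # Convert an RGB observation to grayscale (assumed preprocessing).
    if frame.ndim == 3:
        frame = frame.mean(axis=-1)
    if new_episode:
        # Start a fresh stack by repeating the first frame four times.
        stacked_frames = deque([frame] * 4, maxlen=4)
    else:
        stacked_frames.append(frame)
    state = np.stack(stacked_frames, axis=0)  # shape (4, H, W)
    return state, stacked_frames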
parser.add_argument('--render', action='store_true')
args = parser.parse_args()

env = Pong(headless=args.headless)

player_id = 1
opponent_id = 3 - player_id
action_space = 1

UP_ACTION = 1
DOWN_ACTION = 2

episode_n = 1500

# Policy declaration
policy1 = Policy(args.hidden_layer_size, action_space)
policy2 = Policy(args.hidden_layer_size, action_space)

# Agent declaration
#opponent = Agent(env,  policy1, args.learning_rate, args.discount_factor, opponent_id, 'player 2')
opponent = PongAi(env, opponent_id)
player = Agent(env, policy2, args.learning_rate, args.discount_factor,
               player_id, 'player 1')
player.load_checkpoint('checkpoint-single-2/checkpoints-player-1500.pth')
env.set_names(player.get_name(), opponent.get_name())

# action1 = player.get_action()
# action2 = opponent.get_action()
# (ob1, ob2), (rew1, rew2), done, info = env.step((action1, action2))
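policy1 and policy2 above have a single output unit (action_space = 1), which usually means the network outputs the probability of the UP action. A hedged sketch of that mapping, not the original Agent.get_action:

import torch

def sample_up_down(prob_up):
    # prob_up: the policy's probability of choosing UP (float or tensor in [0, 1])
    dist = torch.distributions.Bernoulli(probs=prob_up)
    sample = dist.sample()
    action = UP_ACTION if sample.item() == 1 else DOWN_ACTION
    return action, dist.log_prob(sample)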

Example No. 13
    lr = 0.001
    alpha = 0.99
    epsilon = 1e-05

    env_id = 'Breakout-v0'
    envs = [make_env(env_id) for _ in range(n_env)]
    #    envs = DummyVecEnv(envs)
    #    envs = SubprocVecEnv(envs)
    envs = ShmemVecEnv(envs)
    envs = VecToTensor(envs)

    date = datetime.now().strftime('%m_%d_%H_%M')
    mon_file_name = "./tmp/" + date
    envs = VecMonitor(envs, mon_file_name)

    train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy.load_state_dict(train_policy.state_dict())
    step_policy.eval()

    runner = Runner(envs, step_policy, n_step, gamma)

    optimizer = optim.RMSprop(train_policy.parameters(),
                              lr=lr,
                              alpha=alpha,
                              eps=epsilon)

    for i in tqdm(range(num_updates)):
        mb_obs, mb_rewards, mb_values, mb_actions = runner.run()

        action_logits, values = train_policy(mb_obs)
def train(env_name,
          print_things=True,
          train_run_id=0,
          train_timesteps=200000,
          update_steps=50):
    # Create a Gym environment
    # This creates 64 parallel envs running in 8 processes (8 envs each)
    env = ParallelEnvs(env_name, processes=8, envs_per_process=8)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    # Reset the environment and observe the initial state
    observation = env.reset()

    # Loop forever
    for timestep in range(train_timesteps):
        # Get action from the agent
        action, action_probabilities = agent.get_action(observation)
        previous_observation = observation

        # Perform the action on the environment, get new state and reward
        observation, reward, done, info = env.step(action.detach().numpy())

        for i in range(len(info["infos"])):
            env_done = False
            # Check if the environment is finished; if so, store cumulative reward
            for envid, envreward in info["finished"]:
                if envid == i:
                    reward_history.append(envreward)
                    average_reward_history.append(
                        np.mean(reward_history[-500:]))
                    env_done = True
                    break
            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation[i], observation[i],
                                action_probabilities[i], reward[i], env_done)

        if timestep % update_steps == update_steps - 1:
            print(f"Update @ step {timestep}")
            agent.update_policy(0)

        plot_freq = 1000
        if timestep % plot_freq == plot_freq - 1:
            # Training is finished - plot rewards
            plt.plot(reward_history)
            plt.plot(average_reward_history)
            plt.legend(["Reward", "500-episode average"])
            plt.title("AC reward history (non-episodic, parallel)")
            plt.savefig("rewards_%s.png" % env_name)
            plt.clf()
            torch.save(agent.policy.state_dict(), "model.mdl")
            print("%d: Plot and model saved." % timestep)
    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["Nonepisodic parallel AC"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
import torch.optim as optim
import torch
import gym
from agent import Policy
from collections import deque
import numpy as np

env = gym.make('CartPole-v1')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

n_episodes = 2000
max_t = 1000
gamma = 1.0
print_every = 100

scores_deque = deque(maxlen=100)
scores = []

for i_episode in range(1, n_episodes+1):
    saved_log_probs = []
    rewards = []
    state = env.reset()
    for t in range(max_t):
        action, log_prob = policy.act(state)
        saved_log_probs.append(log_prob)
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
Example No. 16
    def __init__(self,
                 gamma,
                 optimizer,
                 modelName,
                 logDir,
                 lr,
                 TAU,
                 ALPHA=0.2,
                 stepToCkpt=50000):

        self.gamma = gamma
        self.opt = optimizer
        self.modelName = modelName
        self.logDir = logDir
        self.sumDir = os.path.join(self.logDir, self.modelName + "_summary")
        self.ckptDir = os.path.join(self.logDir, self.modelName + "_ckpt")
        self.lr = lr
        self.TAU = TAU
        self.ALPHA = ALPHA
        ### implementation of polyak averaging
        # self.avg = tf.train.ExponentialMovingAverage(decay = self.TAU)
        OPTIMIZER = {
            "sgd": tf.keras.optimizers.SGD(learning_rate=self.lr),
            "Adam": tf.keras.optimizers.Adam(learning_rate=self.lr),
            "rmsProp": tf.keras.optimizers.RMSprop(learning_rate=self.lr),
            "adaGrad": tf.keras.optimizers.Adagrad(learning_rate=self.lr)
        }
        LOSS = {
            "huber": tf.keras.losses.Huber,
            "mse": tf.keras.losses.MSE,
        }

        self.polOpt = OPTIMIZER[self.opt]
        self.qOpt = OPTIMIZER[self.opt]
        self.vOpt = tfa.optimizers.MovingAverage(OPTIMIZER[self.opt],
                                                 average_decay=self.TAU)
        # self.vOpt = OPTIMIZER[optimizer]

        ####### network definition #############
        self.policy = Policy()
        self.QSample2 = QValFn()
        # https://spinningup.openai.com/en/latest/algorithms/sac.html
        self.QSample1 = QValFn()
        self.ValueFn = ValFn()

        ############ summary Writer#############
        self.summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(self.sumDir, 'logs/'), flush_millis=10000)
        # self.ckpt_writer = tf.compat.v2.summary.create_file_writer(os.path.join(self.sumDir,'ckpt/'), flush_millis=10000)
        self.summary_writer.set_as_default()
        self.global_step = tf.compat.v1.train.get_or_create_global_step()

        ##### checkpoint writer ###########
        self.ckpt = tf.train.Checkpoint(policy=self.policy.finalModel,
                                        q1=self.QSample1.finalModel,
                                        q2=self.QSample2.finalModel,
                                        value=self.ValueFn.finalModel,
                                        policyOpt=self.polOpt,
                                        qOpt=self.qOpt,
                                        vOpt=self.vOpt)

        self.pathCkpt = os.path.join(self.sumDir, 'ckpt')
        self.ckptManager = tf.train.CheckpointManager(self.ckpt,
                                                      self.pathCkpt,
                                                      max_to_keep=3)

        self.stepToCkpt = stepToCkpt

        ###### loading the latest checkpoint for the training purpose ##########
        print(self.pathCkpt)
        self.ckpt.restore(self.ckptManager.latest_checkpoint)
        if self.ckptManager.latest_checkpoint:
            print("Restored from {}".format(
                self.ckptManager.latest_checkpoint))
        else:
            print("Initializing from scratch.")
Example No. 17
import argparse
from agent import Policy, Agent

parser = argparse.ArgumentParser()
parser.add_argument("--headless",
                    action="store_true",
                    help="Run in headless mode")
args = parser.parse_args()

env = Pong(headless=args.headless)

UP_ACTION = 1
DOWN_ACTION = 2

# Policy declaration
policy1 = Policy(500, 1)
policy2 = Policy(500, 1)

# Agent declaration
# opponent = Agent(env, policy2, 0.0005, 0.99, 2, 'player 1')
opponent = PongAi(env, 2)
player = Agent(env, policy1, 0.0005, 0.99, 1, 'player 2')

# print(player.policy.state_dict())
player.load_checkpoint('checkpoint-single-1/checkpoints-player-18000.pth')
#player.load_checkpoint('checkpoints-3/checkpoints-player-9500.pth')
#opponent.load_checkpoint('checkpoints-3/checkpoints-opponent-9500.pth')
# player.policy.state_dict()


def plot(observation):
Example No. 18
# Load World
# ----------

LEFT = 5
RIGHT = 4
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

# Load Agent
# ----------

from agent import Policy
import torch.optim as optim

agent = Policy().to(device)
optimizer = optim.Adam(agent.parameters(), lr=1e-4)

# Load Parallel Environment
# -------------------------

from pong_utils import parallelEnv, preprocess_batch
envs = parallelEnv('PongDeterministic-v4', n=4, seed=12345)


#
def collect_trajectories(envs, agent, tmax=200, nrand=5):
    '''
    Collect trajectories from multiple agents running in a parallelized environment
    '''
    n = len(envs.ps)