Example #1
    def __init__(self):
        sess = tf.Session()
        with sess:
            global_step = tf.Variable(0,
                                      dtype=tf.int32,
                                      name='global_episodes',
                                      trainable=False)

            self.env = gym_env = gym.make(FLAGS.game)
            if FLAGS.gym_seed and FLAGS.gym_seed != -1:
                gym_env.seed(FLAGS.gym_seed)

            if FLAGS.monitor:
                gym_env = gym.wrappers.Monitor(gym_env, FLAGS.experiments_dir)

            env = AtariEnvironment(
                gym_env=gym_env,
                resized_width=FLAGS.resized_width,
                resized_height=FLAGS.resized_height,
                agent_history_length=FLAGS.agent_history_length)
            nb_actions = len(env.gym_actions)

            self.agent = DQNAgent(env, sess, nb_actions, global_step)
            self.saver = tf.train.Saver(max_to_keep=1000)

            if FLAGS.resume or not FLAGS.train:
                checkpoint_dir = os.path.join(FLAGS.checkpoint_dir,
                                              FLAGS.algorithm)
                ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
                print("Loading Model from {}".format(
                    ckpt.model_checkpoint_path))
                self.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(tf.global_variables_initializer())
Example #2
    def __init__(self, learner_type):
        self.agent = DQNAgent(100, 2, 0.3, 0.4)
        self.env = None

        #  set the model for the learning engine depending on
        #  learning type
        if learner_type == 'the_rival':
            self.agent.init_model('qnet')
        else:
            #  placeholder: replace with a more suitable model for the mime learner if one exists
            self.agent.init_model('qnet')
Example #3
def train(agent: DQNAgent, env: Env, episodes: int = 10_000):
    display = False

    progression = tqdm.trange(episodes,
                              desc=f"Training {agent.name}",
                              unit="episode")
    fps = 0

    for episode in progression:
        state = env.reset()

        mean_reward = 0
        return_ = 0
        x_pos = 0

        for step in count(1):
            t = time()
            action = agent.act(np.asarray(state), explore=True)
            next_state, reward, done, info = env.step(action)
            agent.memorize(
                Experience((state, next_state, action, done, reward)))
            state = next_state
            agent.learn()

            mean_reward += (reward - mean_reward) / step
            return_ += reward
            x_pos = max(x_pos, info["x_pos"])
            fps = fps * 0.9 + 0.1 / (time() - t)

            if not step % 100:
                try:
                    display = (yaml.safe_load(
                        (PROJECT_DIRECTORY / "display.yml").read_text()).get(
                            agent.name, {}).get("display", False))
                except Exception:
                    pass
            if display:
                env.render()

            if done or info["flag_get"]:
                break

        progression.set_description(
            f"Training {agent.name}; "
            f"Frames: {agent.step} ({fps:.0f} FPS); "
            f"last progression: {x_pos} ({x_pos/3260:.1%}); "
            f"eps: {agent.eps:.2f}")

        agent.register_episode(
            EpisodeMetrics(episode=episode,
                           x_pos=x_pos,
                           return_=return_,
                           steps=step))

    agent.save_model()
Example #4
def create_agent(conf, action_space, observation_space):
    if conf['agent'] == "dqn":
        return DQNAgent(
            action_space,
            observation_space,
            batch_size=conf['batch_size'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "conv_dqn":
        return ConvDQNAgent(
            action_space,
            observation_space,
            batch_size=conf['batch_size'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "tabular_q":
        return TabularQAgent(
            action_space,
            observation_space,
            q_init=conf['q_value_init'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "random":
        return RandomAgent(action_space, observation_space)
    else:
        raise ValueError("Agent type [%s] is not supported." % conf['agent'])
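A minimal sketch of the conf mapping that create_agent() above expects; the keys mirror the lookups in the function body, while the values (and the surrounding action_space/observation_space objects) are illustrative placeholders, not values from the original project.

conf = {
    'agent': 'dqn',           # one of: "dqn", "conv_dqn", "tabular_q", "random"
    'batch_size': 32,
    'learning_rate': 1e-3,
    'discount': 0.99,
    'random_explore': 0.1,    # passed to the agents as epsilon
    'q_value_init': 0.0,      # only read for the tabular_q agent
}
agent = create_agent(conf, action_space, observation_space)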
Example #5
    def __init__(self, action_space, cmdl, is_training=True):
        DQNAgent.__init__(self, action_space, cmdl, is_training)
        self.name = "Categorical_agent"
        self.cmdl = cmdl

        hist_len, action_no = cmdl.hist_len, self.action_no
        self.policy = policy = get_model(cmdl.estimator, 1, hist_len,
                                         (action_no, cmdl.atoms_no),
                                         hidden_size=cmdl.hidden_size)
        self.target = target = get_model(cmdl.estimator, 1, hist_len,
                                         (action_no, cmdl.atoms_no),
                                         hidden_size=cmdl.hidden_size)
        if self.cmdl.cuda:
            self.policy.cuda()
            self.target.cuda()

        self.policy_evaluation = CategoricalPolicyEvaluation(policy, cmdl)
        self.policy_improvement = CategoricalPolicyImprovement(
                policy, target, cmdl)
Example #6
    def __init__(self, action_space, cmdl, is_training=True):
        DQNAgent.__init__(self, action_space, cmdl, is_training)
        self.name = "Categorical_agent"
        self.cmdl = cmdl

        hist_len, action_no = cmdl.hist_len, self.action_no
        self.policy = policy = get_model(cmdl.estimator,
                                         1,
                                         hist_len, (action_no, cmdl.atoms_no),
                                         hidden_size=cmdl.hidden_size)
        self.target = target = get_model(cmdl.estimator,
                                         1,
                                         hist_len, (action_no, cmdl.atoms_no),
                                         hidden_size=cmdl.hidden_size)
        if self.cmdl.cuda:
            self.policy.cuda()
            self.target.cuda()

        self.policy_evaluation = CategoricalPolicyEvaluation(policy, cmdl)
        self.policy_improvement = CategoricalPolicyImprovement(
            policy, target, cmdl)
Example #7
def main():

    env = UnityEnvironment(
        file_name=
        "/home/faten/projects/deep-reinforcement-learning/p1_navigation/Banana_Linux/Banana.x86_64"
    )

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    state_size = len(state)

    agent = DQNAgent(state_size, action_size, seed=0)

    scores = train(env, agent)

    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    # watch the trained agent; UnityEnvironment returns BrainInfo dicts and has no render()
    for i in range(3):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        for j in range(200):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            if env_info.local_done[0]:
                break

    env.close()
Example #8
    def create(config):
        working_agent = config['GLOBAL']['working_agent']

        if working_agent is None:
            return None

        if working_agent == 'DQNAgent':  # it may be cleaner to derive the name via type(), but a string compare is fine here
            from agents.dqn_agent import DQNAgent  # dynamic import; see Item 52 ("Know how to break circular dependencies") in "Effective Python"
            return DQNAgent(config)

        if working_agent == 'REINFORCEAgent':
            from agents.reinforce_agent import REINFORCEAgent
            return REINFORCEAgent(config)

        if working_agent == 'ActorCriticAgent':
            from agents.actor_critic import ActorCriticAgent
            return ActorCriticAgent(config)
        return None
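A hypothetical caller for the factory method above; the enclosing class is not shown in this excerpt, so the AgentFactory name and the staticmethod call style are assumptions, while the config layout follows the config['GLOBAL']['working_agent'] lookup in create().

# Assumed usage; AgentFactory is a placeholder name for the class that owns create().
config = {'GLOBAL': {'working_agent': 'DQNAgent'}}
agent = AgentFactory.create(config)  # create() passes the same config on to DQNAgent(config)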
Example #9
File: utils.py  Project: lajd/drl_toolbox
def get_agent(state_shape: tuple, action_size: int, model: torch.nn.Module,
              policy, memory, optimizer, params):
    # Agent
    agent = DQNAgent(
        state_shape=state_shape,
        action_size=action_size,
        model=model,
        policy=policy,
        batch_size=params['BATCH_SIZE'],
        update_frequency=params['UPDATE_FREQUENCY'],
        gamma=params['GAMMA'],
        lr_scheduler=DummyLRScheduler(optimizer),  # Using adam
        optimizer=optimizer,
        memory=memory,
        seed=params['SEED'],
        tau=params['TAU'],
        action_repeats=params['ACTION_REPEATS'],
    )
    return agent
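A minimal sketch of the params dict consumed by get_agent() above; only the keys read inside the function are listed, and the values are placeholders rather than the project's defaults.

params = {
    'BATCH_SIZE': 64,
    'UPDATE_FREQUENCY': 4,
    'GAMMA': 0.99,
    'SEED': 0,
    'TAU': 1e-3,
    'ACTION_REPEATS': 1,
}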
Example #10
def navigation_main():
    env = UnityEnvironment(file_name="Navigation/Banana.app")
    seed = 777
    np.random.seed(seed)
    seed_torch(seed)

    num_episode = 2000
    memory_size = 10000
    batch_size = 64
    target_update = 4
    epsilon_decay = 0.9

    agent = DQNAgent(env, memory_size, batch_size, target_update, epsilon_decay)
    agent.train(num_episode)

    agent.test()
Example #11
    env = UnityEnvironment(file_name="./Banana_Linux/Banana.x86_64")
    min_solved = 13.0
    # Get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    state_size = len(env_info.vector_observations[0])

    scores = []
    test_scores = []
    test_scores_i = []
    avg_scores = []
    scores_window = deque(maxlen=100)
    config = generate_configuration_qnet(action_size, state_size)
    agent = DQNAgent(config)
    agent.create_dirs()

    eps = config.eps_start

    for i_episode in range(1, config.n_episodes + 1):
        # Reset the environment and the score
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state, reward, done = env_info.vector_observations[
                0], env_info.rewards[0], env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
Example #12
              static_all=True,
              static_obj_rnd_pos=False,
              rnd_obj_rnd_pos=False,
              full_color=False)

env.seed(args.seed)
torch.manual_seed(args.seed)

saved_action = namedtuple('saved_action', ['log_prob', 'value'])

policy_net = DQNCnn(7)
target_net = DQNCnn(7)

memory = ReplayBuffer(100000, 4)

agent = DQNAgent(memory, 64, 1., 1e-6, 0.05, 7, policy_net, target_net, 0.99,
                 0.001)

e = 0
N_EXPLORE = 10

for i in tqdm(range(N_EXPLORE)):
    done = False
    s = env.reset()

    # To make this compatible with the ReplayBuffer, we need to expand the 3rd channel...
    s = np.expand_dims(s, 2)

    while not done:
        last_stored_frame_idx = agent.memory.store_frame(s)
        obs = agent.memory.encode_recent_observation()
        a = np.random.choice([x for x in range(7)])
Example #13
# Import internal modules
from agents.dqn_agent import DQNAgent

# Import external modules
import gym

if __name__ == "__main__":
    # initialize gym environment and the agent
    env = gym.make('CartPole-v0')
    agent = DQNAgent(environment=env)

    # Test the agent
    agent.test()
Example #14
model = build_q_network(input_shape=[len(ini_observation)],
                        nb_output=len(actions))
target_model = build_q_network(input_shape=[len(ini_observation)],
                               nb_output=len(actions))

agent = DQNAgent(actions=actions,
                 memory=memory,
                 update_interval=200,
                 train_interval=1,
                 batch_size=32,
                 observation=ini_observation,
                 model=model,
                 target_model=target_model,
                 policy=policy,
                 loss_fn=loss_fn,
                 optimizer=optimizer,
                 obs_processor=obs_processor, 
                 is_ddqn=True)

step_history = []
reward_history = []
nb_episodes = 1000
# for episode in range(nb_episodes):
episode_reward_average = -1
with tqdm.trange(nb_episodes) as t:
    for episode in t:
        # agent.reset()
Example #15
import argparse
import copy
import microgridRLsimulator

import params.params as params
from agents.random_agent import RandomAgent
from agents.agent_ppo import PPOAgent
from agents.dqn_agent import DQNAgent

parser = argparse.ArgumentParser()
parser.add_argument('--env',
                    '-e',
                    type=str,
                    default='microgrid',
                    choices=['microgrid', 'maze-dense', 'maze-sparse'])

args = parser.parse_args()

params = copy.deepcopy(params.params)
params['env']['env'] = args.env

params['env']['case'] = 'elespino_discrete'
agent = DQNAgent(params)

agent.train()
# agent.test()
agent.store_results(render_tr_te=2)
print("End of agent's life")
Example #16
import gym
from agents.dqn_agent import experienceReplayBuffer_DQN, DQNAgent, QNetwork_DQN
import torch
from agents import evaluate
from copy import deepcopy



if __name__ == "__main__":
    n_iter = 100000
    env = gym.make('gym_pvz:pvz-env-v2')
    nn_name = input("Save name: ")
    buffer = experienceReplayBuffer_DQN(memory_size=100000, burn_in=10000)
    net = QNetwork_DQN(env, device='cpu', use_zombienet=False, use_gridnet=False)
    # old_agent = torch.load("agents/benchmark/dfq5_znet_epslinear")
    # net.zombienet.load_state_dict(old_agent.zombienet.state_dict())
    # for p in net.zombienet.parameters():
    #     p.requires_grad = False
    # net.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
    #                                       lr=net.learning_rate)
    agent = DQNAgent(env, net, buffer, n_iter=n_iter, batch_size=200)
    agent.train(max_episodes=n_iter, evaluate_frequency=5000, evaluate_n_iter=1000)
    torch.save(agent.network, nn_name)
    agent._save_training_data(nn_name)
Example #17
BUFFER_SIZE = 10**4
EPISODES = 1001
EPISODE_LENGTH = 200
DISCOUNT = 0.99
LR = 1e-3
PRESAMPLE = 10
BATCH_SIZE = 128
C = 10
INTERVAL = 10

## DQN
env.reset()
buffer = ReplayBuffer(BUFFER_SIZE)
dqn_q_network = build_network(n_states, n_actions, 2, 200)

agent = DQNAgent(dqn_q_network, DISCOUNT, LR)
learner = DQNLearner(env, buffer, agent)

dqn_hist = learner.train(presample=PRESAMPLE,
                         batch_size=BATCH_SIZE,
                         episodes=EPISODES,
                         episode_length=EPISODE_LENGTH,
                         interval=INTERVAL,
                         C=C,
                         save_path="./",
                         save_name="DQN")

## Double DQN
env.reset()
buffer = ReplayBuffer(BUFFER_SIZE)
double_q_network = build_network(n_states, n_actions, 2, 200)
Example #18
            state = next_state

    print()
    print("Results after {episodes} episodes:")
    print("Average reward per episode: {total_reward / episodes}")
    print("Average time steps per episode: {total_epochs / episodes}")


# Load the LunarLander-v2 environment
env_name = "LunarLander-v2"
env = gym.make(env_name)

random_agent = RandomDQNAgent(env_name, env, 1000, is_state_box=True, memory_capacity=100000)
# random_agent.train()

agent = DQNAgent(env_name, env, 5000, learning_rate=0.00025, start_epsilon=1.0, discount_factor=0.99, decay_rate=0.0001,
                 make_checkpoint=True, is_state_box=True, batch_size=64, memory_capacity=100000)
# agent.memory = random_agent.memory
# agent.train()

weights, rewards, episode_len = agent.load("/home/dsalwala/NUIG/Thesis/rl-algos/data/LunarLander-v2_100.npy")
stats = plotting.EpisodeStats(
    episode_lengths=episode_len,
    episode_rewards=rewards)

# Search for a Q values
# nn, stats = agent.nn.get_weights(), agent.stats

nn = ANN(8, 4, 0.00025)
nn.set_weights(weights)
play_episode(env, nn, 1)
Example #19
import acme
import gym
from acme import specs, wrappers
from acme.utils import loggers
from acme.wrappers import gym_wrapper

from agents.dqn_agent import DQNAgent
from networks.models import Models

from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())


def render(env):
    return env.environment.render(mode='rgb_array')


environment = gym_wrapper.GymWrapper(gym.make('LunarLander-v2'))
environment = wrappers.SinglePrecisionWrapper(environment)
environment_spec = specs.make_environment_spec(environment)

model = Models.sequential_model(
    input_shape=environment_spec.observations.shape,
    num_outputs=environment_spec.actions.num_values,
    hidden_layers=3,
    layer_size=300)

agent = DQNAgent(environment_spec=environment_spec, network=model)

logger = loggers.TerminalLogger(time_delta=10.)
loop = acme.EnvironmentLoop(environment=environment, actor=agent)
loop.run()
Example #20
def train():
    """
    Trains a DQN agent in the Unity Banana environment.
    """

    # set hyperparameters

    #    # udacity dqn baseline: solved after 487 steps
    #    buffer_size = int(1e5)
    #    batch_size = 64
    #    gamma = 0.99
    #    tau = 1e-3
    #    learning_rate = 5e-4
    #    eps_start = 1.0
    #    eps_end = 0.01
    #    eps_decay = 0.995
    #    fc1_units = 64
    #    fc2_units = 64
    #    q_function_update_fraction=4
    #    seed = 0
    #    # larger network in 1st layer
    #    buffer_size = int(1e5)
    #    batch_size = 64
    #    gamma = 0.99
    #    tau = 1e-3
    #    learning_rate = 5e-4
    #    eps_start = 1.0
    #    eps_end = 0.01
    #    eps_decay = 0.995
    #    fc1_units = 128
    #    fc2_units = 64
    #    q_function_update_fraction=4
    #    seed = 0
    #
    #    # smaller network in 1st and 2nd layer
    #    buffer_size = int(1e5)
    #    batch_size = 64
    #    gamma = 0.99
    #    tau = 1e-3
    #    learning_rate = 5e-4
    #    eps_start = 1.0
    #    eps_end = 0.01
    #    eps_decay = 0.995
    #    fc1_units = 32
    #    fc2_units = 16
    #    q_function_update_fraction=4
    #    seed = 0

    #     # higher discount rate
    #     buffer_size = int(1e5)
    #     batch_size = 64
    #     gamma = 0.9999
    #     tau = 1e-3
    #     learning_rate = 5e-4
    #     eps_start = 1.0
    #     eps_end = 0.01
    #     eps_decay = 0.995
    #     fc1_units = 64
    #     fc2_units = 64
    #     q_function_update_fraction=4
    #     seed = 0

    #   # higher eps. decay rate
    #   buffer_size = int(1e5)
    #   batch_size = 64
    #   gamma = 0.99
    #   tau = 1e-3
    #   learning_rate = 5e-4
    #   eps_start = 1.0
    #   eps_end = 0.01
    #   eps_decay = 0.999
    #   fc1_units = 64
    #   fc2_units = 64
    #   q_function_update_fraction=4
    #   seed = 0

    # smaller eps. decay rate (epsilon decays faster)
    buffer_size = int(1e5)
    batch_size = 64
    gamma = 0.99
    tau = 1e-3
    learning_rate = 5e-4
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.990
    fc1_units = 64
    fc2_units = 64
    q_function_update_fraction = 4
    seed = 0

    # use a simple concatenation of all hyperparameters as the experiment name. results are stored in a subfolder
    #   with this name
    experiment_name = "6-smaller_eps_decay-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}".format(
        buffer_size, batch_size, gamma, tau, learning_rate, eps_start, eps_end,
        eps_decay, fc1_units, fc2_units, q_function_update_fraction, seed)

    # in addition to creating the experiment folder, create subfolders for checkpoints and logs
    if not os.path.isdir(experiment_name):
        os.mkdir(experiment_name)
        os.mkdir(experiment_name + '/checkpoints')
        os.mkdir(experiment_name + '/logs')

    # log the hyperparameters
    with open(experiment_name + '/logs/' + 'hyperparameters.log', 'w') as f:
        print(
            "Buffer size {}\nbatch size {}\ngamma {}\ntau {}\nlearning_rate {}\nfc1-fc2 {}-{}\nq-function_update_fraction {}\nseed {}"
            .format(buffer_size, batch_size, gamma, tau, learning_rate,
                    fc1_units, fc2_units, q_function_update_fraction, seed),
            file=f)

    ############ THE ENVIRONMENT ###############
    env = UnityEnvironment(file_name='Banana_Linux/Banana.x86_64', seed=seed)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # get the number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # get the size of the action space
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    dqn_agent = DQNAgent(name=experiment_name,
                         state_size=state_size,
                         action_size=action_size,
                         learning_rate=learning_rate,
                         discount_rate=gamma,
                         eps_start=eps_start,
                         eps_end=eps_end,
                         eps_decay=eps_decay,
                         tau=tau,
                         network_architecture=[fc1_units, fc2_units],
                         experience_replay_buffer_size=buffer_size,
                         experience_replay_buffer_batch_size=batch_size,
                         experience_replay_start_size=3200,
                         q_function_update_fraction=q_function_update_fraction,
                         device='gpu',
                         seed=seed)

    # run the train loop
    scores_all = train_loop(env=env,
                            brain_name=brain_name,
                            agent=dqn_agent,
                            experiment_name=experiment_name)

    pickle.dump(scores_all, open(experiment_name + '/scores_all.pkg', 'wb'))

    # plot the results
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores_all) + 1), scores_all)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

    # finally, close the environment
    env.close()
Example #21
            next_state, reward, done = process_state(
                env_info.visual_observations[0]
            ), env_info.rewards[0], env_info.local_done[0]
            state_window.append(next_state)
            state = np.vstack(
                [np.expand_dims(np.array(s), 0) for s in state_window])
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        print('\rTest Episode {}\tLast Score: {:.2f}\tAverage Score: {:.2f}'.
              format(i_episode, score, np.mean(scores_window)),
              end="")
    print('\rTest after {} episode mean {:.2f}'.format(n_ep_train,
                                                       np.mean(scores_window)))
    return np.mean(scores_window)


if __name__ == '__main__':
    env = UnityEnvironment(file_name="./Banana_Linux/Banana.x86_64")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    state_size = len(env_info.vector_observations[0])
    config = generate_configuration_qnet_visual(action_size, state_size)
    agent = DQNAgent(config)
    agent.load_weights("./checkpoint.pth")
    print(test(env, agent, 0, n_episodes=100, sleep_t=0))
Example #22
"""
Use this script to train the Double DQN agent that takes the segmented frames as input.

This script will take about a day to run, depending on your computer's performance.

Logs and networks will be saved in /exp/rl

At any point, you can visualize the agent playing by setting the `dqn-segm` field (or the name of the model)
to True in the file /display.yml
"""

from agents.dqn_agent import DQNAgent
from agents.training import train
from environment.env import make_environment_for_dqn_with_segm
from utils.reproductibility import seed_all

if __name__ == "__main__":
    e = make_environment_for_dqn_with_segm()
    seed_all(e)
    a = DQNAgent(e, "dqn-segm")
    train(a, e)
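A small sketch of the display.yml toggle mentioned in the docstring above, inferred from the per-agent {name: {display: bool}} lookup in the training loop of Example #3; the exact schema used by the project is an assumption.

# Assumed display.yml layout; writing it while training runs turns live rendering on.
import yaml
from pathlib import Path

Path("display.yml").write_text(yaml.safe_dump({"dqn-segm": {"display": True}}))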
Example #23
class LearningEngine:
    def __init__(self, learner_type):
        self.agent = DQNAgent(100, 2, 0.3, 0.4)
        self.env = None

        #  set the model for the learning engine depending on
        #  learning type
        if learner_type == 'the_rival':
            self.agent.init_model('qnet')
        else:
            #  placeholder: replace with a more suitable model for the mime learner if one exists
            self.agent.init_model('qnet')

    def init_game(self, width, height, obstacles):
        self.env = GameEnv(width, height, obstacles)
        self.env.reset()

    def train_mime(self, opponent, move_history, nb_episodes, delimiter=':'):
        action_ls = ['38', '40', '37', '39']

        for episode_idx in range(nb_episodes):
            # logger.info(episode_idx, extra={ 'tags': ['dev_mssg: episode_idx'] })
            self.env.reset()
            #  get initial game state by enacting first user move
            user_action = move_history[0].split(delimiter)[-1]
            state, reward, done, _ = self.env.step(
                action_ls.index(user_action), 1)

            max_plies = 30
            #  play the game
            for step_idx in count(1):

                # break out of game if too many turns and no one has won
                if step_idx > max_plies: break

                expected_action = move_history[step_idx].split(delimiter)[-1]
                expected_action = action_ls.index(expected_action)
                # logger.info(step_idx, extra={ 'tags': ['dev_mssg: step_idx'] })
                if step_idx % 2 != 0:
                    next_state, _, done, _ = self.env.step(
                        expected_action, (step_idx % 2) + 1)
                else:
                    #  select an action and then perform it
                    action = self.agent.select_action(state)
                    next_state, reward, done, _ = self.env.step(
                        action[0, 0], (step_idx % 2) + 1, expected_action)

                    # Perform one step of the optimization (on the target network)
                    self.agent.optimize()

                    # Store the transition in memory
                    self.agent.memory.remember(state, action, next_state,
                                               reward)

                # Move to the next state
                state = next_state

                if done: break

    def train_agent(self, opponent, nb_episodes, plot_performance=False):
        avg_reward_ls = []

        for episode_idx in range(nb_episodes):
            # logger.info(episode_idx, extra={ 'tags': ['dev_mssg: episode_idx'] })
            self.env.reset()
            state, reward, done, _ = self.env.step(0, 1)

            total_reward, num_plies, max_plies = 0.0, 0, 30
            #  play the game
            for step_idx in count(1):
                num_plies = step_idx

                # break out of game if too many turns and no one has won
                if step_idx > max_plies: break

                # logger.info(step_idx, extra={ 'tags': ['dev_mssg: step_idx'] })
                #  set current player based on turn
                current_agent = self.agent if step_idx % 2 == 0 else opponent

                #  select an action and then perform it
                action = current_agent.select_action(state)
                next_state, reward, done, _ = self.env.step(
                    action[0, 0], (step_idx % 2) + 1)

                total_reward += reward

                # Store the transition in memory
                current_agent.memory.remember(state, action, next_state,
                                              reward)

                # Move to the next state
                state = next_state

                # Perform one step of the optimization (on the target network)
                current_agent.optimize()

                if done: break

            avg_reward_ls.append(total_reward / num_plies)

        #  TODO: plot training performance
        if plot_performance:
            img = io.BytesIO()
            _, ax = plt.subplots()
            sns.tsplot(time=list(range(nb_episodes)),
                       data=avg_reward_ls,
                       condition='Training Loss',
                       legend='True',
                       ax=ax)
            ax.set_xlabel('Games')
            ax.set_ylabel('Average Reward')
            plt.savefig(img, format='png')
            img.seek(0)
            return base64.b64encode(img.getvalue()).decode()
        else:
            return None
Example #24
def test():
    # set hyperparameters (not really important for running the agent)
    # higher eps. decay rate
    buffer_size = int(1e5)
    batch_size = 64
    gamma = 0.99
    tau = 1e-3
    learning_rate = 5e-4
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.999
    fc1_units = 64
    fc2_units = 64
    q_function_update_fraction = 4
    seed = 0

    ############ THE ENVIRONMENT ###############
    env = UnityEnvironment(file_name='Banana_Linux/Banana.x86_64', seed=seed)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # get the number of agents
    num_agents = len(env_info.agents)

    # get the size of the action space
    action_size = brain.vector_action_space_size

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    # initialize agent

    dqn_agent = DQNAgent(name=None,
                         state_size=state_size,
                         action_size=action_size,
                         learning_rate=learning_rate,
                         discount_rate=gamma,
                         eps_start=eps_start,
                         eps_end=eps_end,
                         eps_decay=eps_decay,
                         tau=tau,
                         network_architecture=[fc1_units, fc2_units],
                         experience_replay_buffer_size=buffer_size,
                         experience_replay_buffer_batch_size=batch_size,
                         experience_replay_start_size=3200,
                         q_function_update_fraction=q_function_update_fraction,
                         device='gpu',
                         seed=seed)

    dqn_agent.load_state_dict(torch.load('checkpoint.pth'))

    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)

    for i in range(200):
        actions = dqn_agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        states = next_states
        if np.any(dones):
            break
Example #25
# Import internal modules
from agents.dqn_agent import DQNAgent

# Import external modules
import gym


if __name__ == "__main__":
    # initialize gym environment and the agent
    env = gym.make('CartPole-v0')
    agent = DQNAgent(environment=env)

    # Start the game
    agent.run()
Example #26
import gym
import numpy as np
from agents.dqn_agent import DQNAgent

env = gym.make("LunarLander-v2")
env.seed(0)
agent = DQNAgent(env.action_space.n, env.observation_space.shape[0])
episodes = 400
steps = 3000
loss = []
for i_episode in range(episodes):
    obv = np.reshape(env.reset(), (1, 8))
    total_reward = 0
    done = False
    for t in range(steps):
        # env.render()
        # print(observation)
        action = agent.act(obv, total_reward, done)
        next_obv, reward, done, info = env.step(action)
        next_obv = np.reshape(next_obv, (1, 8))
        total_reward += reward
        agent.store_transition(obv, action, reward, next_obv, done)
        obv = next_obv
        agent.replay()
        if done:
            print("{}/{}, reward: {} in {} timesteps".format(
                i_episode, episodes, total_reward, t + 1))
            break
    loss.append(total_reward)

    # Average score of the last 100 episodes