def test_env_runner_log_episode_metrics(mock_data_logger, mock_task,
                                        mock_agent):
    # Assign
    episodes = [1, 2]
    epsilons = [0.2, 0.1]
    mean_scores = [0.5, 1]
    scores = [1.5, 5]
    iterations = [10, 10]
    episode_data = dict(episodes=episodes,
                        epsilons=epsilons,
                        mean_scores=mean_scores,
                        iterations=iterations,
                        scores=scores)
    env_runner = EnvRunner(mock_task, mock_agent, data_logger=mock_data_logger)

    # Act
    env_runner.log_episode_metrics(**episode_data)

    # Assert
    for idx, episode in enumerate(episodes):
        mock_data_logger.log_value.assert_any_call("episode/epsilon",
                                                   epsilons[idx], episode)
        mock_data_logger.log_value.assert_any_call("episode/avg_score",
                                                   mean_scores[idx], episode)
        mock_data_logger.log_value.assert_any_call("episode/score",
                                                   scores[idx], episode)
        mock_data_logger.log_value.assert_any_call("episode/iterations",
                                                   iterations[idx], episode)


def test_runs_d3pg():
    # Assign
    task = GymTask('BipedalWalker-v3')
    agent = D3PGAgent(task.state_size, task.action_size, device=DEVICE)
    env_runner = EnvRunner(task, agent, max_iterations=50)

    # Act
    env_runner.run(reward_goal=10, max_episodes=10, force_new=True)


def test_runs_td3():
    # Assign
    task = GymTask('Pendulum-v0')
    agent = TD3Agent(task.state_size, task.action_size, device=DEVICE)
    env_runner = EnvRunner(task, agent, max_iterations=50)

    # Act
    env_runner.run(reward_goal=10, max_episodes=10, force_new=True)


def test_runs_rainbow():
    # Assign
    task = GymTask('CartPole-v1')
    agent = RainbowAgent(task.state_size, task.action_size, device=DEVICE)
    env_runner = EnvRunner(task, agent, max_iterations=50)

    # Act
    env_runner.run(reward_goal=10, max_episodes=10, force_new=True)


def test_env_runner_log_data_interaction_no_data_logger(mock_task, mock_agent):
    # Assign
    env_runner = EnvRunner(mock_task, mock_agent)

    # Act
    env_runner.log_data_interaction()

    # Assert
    mock_agent.log_metrics.assert_not_called()


def test_env_runner_load_state_no_file(mock_task, mock_agent):
    # Assign
    env_runner = EnvRunner(mock_task, mock_agent, max_iterations=10)
    env_runner.logger = mock.MagicMock()

    # Act
    env_runner.load_state(file_prefix='saved_state')

    # Assert
    env_runner.logger.warning.assert_called_once_with(
        "Couldn't load state. Forcing restart.")
    mock_agent.load_state.assert_not_called()


def test_env_runner_log_episode_metrics_values_missing(mock_data_logger,
                                                       mock_task, mock_agent):
    # Assign
    episodes = [1, 2]
    episode_data = dict(episodes=episodes)
    env_runner = EnvRunner(mock_task, mock_agent, data_logger=mock_data_logger)

    # Act
    env_runner.log_episode_metrics(**episode_data)

    # Assert
    mock_data_logger.log_value.assert_not_called()


def test_env_runner_log_data_interaction(mock_data_logger, mock_task,
                                         mock_agent):
    # Assign
    env_runner = EnvRunner(mock_task, mock_agent, data_logger=mock_data_logger)

    # Act
    env_runner.log_data_interaction()

    # Assert
    mock_agent.log_metrics.assert_called_once_with(mock_data_logger,
                                                   0,
                                                   full_log=False)


def test_env_runner_info_no_data_logger(mock_task, mock_agent):
    # Assign
    env_runner = EnvRunner(mock_task, mock_agent)
    env_runner.logger = mock.MagicMock()
    info_data = dict(episodes=[2],
                     iterations=[10],
                     scores=[1],
                     mean_scores=[2],
                     epsilons=[1])

    # Act
    env_runner.info(**info_data)

    # Assert
    env_runner.logger.info.assert_called_once()
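
The tests above rely on pytest fixtures (mock_task, mock_agent, mock_data_logger, and later mock_os, mock_json, mock_path) that are not part of this listing. A minimal sketch of what they might look like, assuming plain unittest.mock fakes and the ai_traineree.env_runner module path; the project's actual conftest.py may differ:

from unittest import mock

import pytest


@pytest.fixture
def mock_task():
    # Fake task: small state/action sizes and an immediately terminating step().
    task = mock.MagicMock()
    task.state_size = 4
    task.action_size = 2
    task.step.return_value = ([0.0, 0.0, 0.0, 0.0], 0.0, True, {})
    return task


@pytest.fixture
def mock_agent():
    # Fake agent: always picks action 0; log_metrics/save_state calls are recorded by MagicMock.
    agent = mock.MagicMock()
    agent.act.return_value = 0
    return agent


@pytest.fixture
def mock_data_logger():
    # Fake data logger whose log_value/log_values_dict calls can be asserted on.
    return mock.MagicMock()


@pytest.fixture
def mock_os():
    # Patches the os module as seen by the runner; the module path is assumed here.
    with mock.patch("ai_traineree.env_runner.os") as mocked:
        yield mocked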

Example #10

    def __init__(self,
                 env_name,
                 agent_name: str,
                 hyperparameters: Optional[Hyperparameters] = None):
        self._logger.info(
            "Initiating SageMakerExecutor with env_name '%s' and agent '%s'",
            env_name, agent_name)

        env = gym.make(env_name)
        self.task = GymTask(env, env_name)
        agent = None
        if agent_name.upper() == "DQN":
            from ai_traineree.agents.dqn import DQNAgent
            agent = DQNAgent
        elif agent_name.upper() == "PPO":
            from ai_traineree.agents.ppo import PPOAgent
            agent = PPOAgent
        elif agent_name.upper() == "DDPG":
            from ai_traineree.agents.ddpg import DDPGAgent
            agent = DDPGAgent
        else:
            self._logger.warning(
                "No agent provided. You're given a PPO agent.")
            from ai_traineree.agents.ppo import PPOAgent
            agent = PPOAgent

        hyperparameters = hyperparameters or {}  # guard: the signature allows None, but .get() below requires a dict
        self.max_iterations = int(hyperparameters.get("max_iterations", 10000))
        self.max_episodes = int(hyperparameters.get("max_episodes", 1000))
        self.log_every = int(hyperparameters.get("log_every", 10))
        self.score_goal = int(hyperparameters.get("score_goal", 100))

        self.eps_start: float = float(hyperparameters.get('eps_start', 1.0))
        self.eps_end: float = float(hyperparameters.get('eps_end', 0.02))
        self.eps_decay: float = float(hyperparameters.get('eps_decay', 0.995))

        self.agent: AgentType = agent(self.task.state_size,
                                      self.task.action_size,
                                      config=hyperparameters)

        self.env_runner = EnvRunner(self.task,
                                    self.agent,
                                    max_iterations=self.max_iterations)

Example #11

def test_env_runner_info_with_data_logger(mock_task, mock_agent):
    # Assign
    data_logger = mock.MagicMock()
    env_runner = EnvRunner(mock_task, mock_agent, data_logger=data_logger)
    env_runner.logger = mock.MagicMock()
    info_data = dict(episodes=[2],
                     iterations=[10],
                     scores=[1],
                     mean_scores=[2],
                     epsilons=[1])

    # Act
    env_runner.info(**info_data)

    # Assert
    env_runner.logger.info.assert_called_once()
    assert data_logger.log_value.call_count == 4
    mock_agent.log_metrics.assert_called_once_with(data_logger,
                                                   mock.ANY,
                                                   full_log=False)

Example #12

def test_env_runner_load_state(mock_task, mock_agent, mock_os):
    # Assign
    env_runner = EnvRunner(mock_task, mock_agent, max_iterations=10)
    mock_os.listdir.return_value = [
        'saved_state_e10.json', 'saved_state_e999.json', 'other.file'
    ]
    mocked_state = '{"episode": 10, "epsilon": 0.2, "score": 0.3, "average_score": -0.1}'

    # Act
    with mock.patch('builtins.open',
                    mock.mock_open(read_data=mocked_state)) as mock_file:
        env_runner.load_state(file_prefix='saved_state')
        mock_file.assert_called_once_with(
            f'{env_runner.state_dir}/saved_state_e999.json', 'r')

    # Assert
    mock_agent.load_state.assert_called_once()
    assert env_runner.episode == 10
    assert env_runner.epsilon == 0.2
    assert len(env_runner.all_scores) == 1
    assert env_runner.all_scores[0] == 0.3

Example #13

def test_env_runner_save_state(mock_task, mock_agent, mock_json, mock_path):
    # Assign
    mock_task.step.return_value = ([1, 0.1], -1, False, {})
    mock_agent.act.return_value = 1
    env_runner = EnvRunner(mock_task, mock_agent, max_iterations=10)

    # Act
    env_runner.run(max_episodes=10)
    with mock.patch('builtins.open'):
        env_runner.save_state('saved_state.state')

    # Assert
    mock_agent.save_state.assert_called_once()
    state = mock_json.dump.call_args[0][0]
    assert state['episode'] == 10
    assert state['tot_iterations'] == 10 * 10

Example #14

def test_env_runner_log_data_interaction_debug_log(mock_data_logger, mock_task,
                                                   mock_agent):
    # Assign
    mock_task.step.return_value = ([1, 0.1], -1, False, {})
    mock_agent.act.return_value = 1
    env_runner = EnvRunner(mock_task,
                           mock_agent,
                           data_logger=mock_data_logger,
                           debug_log=True)

    # Act
    env_runner.interact_episode(eps=0.1,
                                max_iterations=10,
                                log_interaction_freq=None)
    env_runner.log_data_interaction()

    # Assert
    mock_agent.log_metrics.assert_called_once_with(mock_data_logger,
                                                   10,
                                                   full_log=False)
    assert mock_data_logger.log_values_dict.call_count == 20  # 10x iter per states and actions
    assert mock_data_logger.log_value.call_count == 20  # 10x iter per rewards and dones

Example #15

from ai_traineree.env_runner import EnvRunner
from ai_traineree.tasks import GymTask
from ai_traineree.types import TaskType

import pylab as plt

env_name = 'LunarLanderContinuous-v2'
task: TaskType = GymTask(env_name)
config = {
    'action_scale': 2,
    'batch_size': 200,
    'number_updates': 5,
    'update_freq': 10,
    'update_policy_freq': 10,
}
agent = Agent(task.state_size, task.action_size, **config)
env_runner = EnvRunner(task, agent)

# interact_episode(task, agent, 0, render=True)
scores = env_runner.run(reward_goal=80,
                        max_episodes=2000,
                        log_episode_freq=1,
                        force_new=True)
env_runner.interact_episode(0, render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)

Example #16

from ai_traineree.agents.dqn import DQNAgent
from ai_traineree.env_runner import EnvRunner
from ai_traineree.tasks import GymTask

import pylab as plt

env_name = 'Breakout-ram-v0'
task = GymTask(env_name)
agent = DQNAgent(task.state_size, task.action_size, hidden_layers=(400, 300))
env_runner = EnvRunner(task, agent)

# env_runner.interact_episode(0, render=True)
scores = env_runner.run(reward_goal=5, max_episodes=5, log_every=1)
env_runner.interact_episode(100, render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)
plt.show()

Example #17

from ai_traineree.agents.dqn import DQNAgent
from ai_traineree.env_runner import EnvRunner
from ai_traineree.tasks import GymTask
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import pylab as plt

writer = SummaryWriter()

env_name = 'CartPole-v1'
task = GymTask(env_name)
agent = DQNAgent(task.state_size, task.action_size, n_steps=5)
env_runner = EnvRunner(task, agent, writer=writer)

scores = env_runner.run(
    reward_goal=100,
    max_episodes=5000,
    eps_end=0.002,
    eps_decay=0.99,
    gif_every_episodes=500,
    force_new=True,
)
env_runner.interact_episode(1000, render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')

Example #18

config = {
    'rollout_length': 60,
    'batch_size': 60,
    "number_updates": 1,

    "using_gae": False,  # Default is True
    "ppo_ratio_clip": 0.2,
    "entropy_weight": 0.0005,
    "gamma": 0.99,
    "action_scale": 1,
    "max_grad_norm_actor": 3.0,
    "max_grad_norm_critic": 5.0,
    "critic_lr": 0.001,
    "actor_lr": 0.0004,
}
agent = Agent(task.state_size, task.action_size, hidden_layers=(100, 100, 50), **config)
env_runner = EnvRunner(task, agent, data_logger=data_logger)
# env_runner.interact_episode(0, render=True)
scores = env_runner.run(80, 2000, eps_decay=0.99, force_new=True, checkpoint_every=20)
env_runner.interact_episode(0, render=True)

data_logger.close()
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)
plt.show()

Example #19

task: TaskType = GymTask(env_name)
config = {
    'warm_up': 500,
    'batch_size': 200,
    'update_freq': 30,
    "number_updates": 1,
    "gamma": 0.99,
    "critic_lr": 1e-3,
    "actor_lr": 2e-3,
    "alpha": 0.2,
    "tau": 0.01,
    "max_grad_norm_alpha": 1.0,
    "max_grad_norm_actor": 10.0,
    "max_grad_norm_critic": 10.0,
}
agent = Agent(task.state_size, task.action_size, hidden_layers=(100, 100), **config)

env_runner = EnvRunner(task, agent, max_iterations=10000, data_logger=data_logger)
# env_runner.interact_episode(render=True)
scores = env_runner.run(reward_goal=10, max_episodes=500, eps_decay=0.99, log_episode_freq=1, gif_every_episodes=200, force_new=True)
env_runner.interact_episode(render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)
plt.show()

Example #20

from ai_traineree.env_runner import EnvRunner
from ai_traineree.tasks import GymTask

import numpy as np
import pylab as plt

env_name = 'Pendulum-v0'
task = GymTask(env_name)
config = {
    'warm_up': 100,
    'batch_size': 50,
    'hidden_layers': (50, 50),
    'noise_scale': 1.,
    'clip': (-2, 2),
    'actor_lr': 1e-4,
    'critic_lr': 2e-4,
}
agent = Agent(task.state_size, task.action_size, **config)
env_runner = EnvRunner(task, agent)

# env_runner.interact_episode(0, render=True)
scores = env_runner.run(0,
                        2000,
                        eps_start=1.0,
                        eps_end=0.05,
                        eps_decay=0.99,
                        log_every=1)
env_runner.interact_episode(0, render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')

Example #21

from ai_traineree.agents.ddpg import DDPGAgent as DDPG
from ai_traineree.env_runner import EnvRunner
from ai_traineree.tasks import GymTask
from ai_traineree.types import TaskType

import pylab as plt

env_name = 'LunarLanderContinuous-v2'
task: TaskType = GymTask(env_name)
config = {'batch_size': 64, 'warm_up': 0, 'action_scale': 2, 'update_freq': 2}
agent = DDPG(task.state_size,
             task.action_size,
             hidden_layers=(300, 200),
             noise_scale=0.4,
             noise_sigma=0.2,
             config=config)
env_runner = EnvRunner(task, agent)

# interact_episode(task, agent, 0, render=True)
scores = env_runner.run(reward_goal=80,
                        max_episodes=40,
                        eps_start=1.0,
                        eps_end=0.05,
                        eps_decay=0.991)
env_runner.interact_episode(0, render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')

Example #22

    1e-3,
    "update_freq":
    150,
    "batch_size":
    400,
    "buffer_size":
    int(5e3),
    "device":
    "cuda",
}

for _ in range(prev_states):
    task.reset()

agent = DQNAgent(task.state_size, task.action_size, **config)
env_runner = EnvRunner(task, agent, data_logger=data_logger)

# env_runner.interact_episode(0, render=True)
scores = env_runner.run(
    reward_goal=1000,
    max_episodes=20000,
    log_every=1,
    eps_start=0.9,
    gif_every_episodes=200,
    force_new=True,
)
# env_runner.interact_episode(render=True)
data_logger.close()

# plot scores
fig = plt.figure()

Example #23

    "gae_lambda": 0.95,
    "ppo_ratio_clip": 0.20,
    "entropy_weight": 0.005,
    "gamma": 0.99,
    "std_init": 0.5,
    "std_max": 1.0,
    "std_min": 0.1,

    "max_grad_norm_actor": 200.0,
    "max_grad_norm_critic": 200.0,
    "critic_lr": 3e-4,
    "critic_betas": (0.9, 0.999),
    "actor_lr": 3e-4,
    "actor_betas": (0.9, 0.999),
}
agent = Agent(task.state_size, task.action_size, hidden_layers=(100, 100), **config)
env_runner = EnvRunner(task, agent, max_iterations=2000, data_logger=data_logger)
# env_runner.interact_episode(render=True)
scores = env_runner.run(300, 1000, log_episode_freq=1, gif_every_episodes=500, force_new=True)
env_runner.interact_episode(render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)
plt.show()

Example #24

from ai_traineree.agents.ddpg import DDPGAgent as DDPG
from ai_traineree.env_runner import EnvRunner
from ai_traineree.loggers import TensorboardLogger
from ai_traineree.tasks import GymTask
from ai_traineree.types import TaskType

import pylab as plt


data_logger = TensorboardLogger()
env_name = 'LunarLanderContinuous-v2'
task: TaskType = GymTask(env_name)
config = {'action_scale': 1, 'update_freq': 2}
agent = DDPG(task.state_size, task.action_size, hidden_layers=(100, 100), noise_scale=0.4, noise_sigma=0.2, **config)
env_runner = EnvRunner(task, agent, data_logger=data_logger)
scores = env_runner.run(reward_goal=80, max_episodes=1000, eps_start=1.0, eps_end=0.05, eps_decay=0.999, force_new=True)
# env_runner.interact_episode(0, render=True)
data_logger.close()

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)
plt.show()

Example #25

from ai_traineree.env_runner import EnvRunner
from ai_traineree.loggers import TensorboardLogger
from ai_traineree.tasks import GymTask
from typing import Any, Dict
from pprint import pprint

import torch

config_default = {'hidden_layers': (50, 50)}
config_updates = [{'n_steps': n} for n in range(1, 11)]

task = GymTask("CartPole-v1")
seeds = [32167, 1, 999, 2833700, 13]

for idx, config_update in enumerate(config_updates):
    config: Dict[str, Any] = config_default.copy()
    config.update(config_update)

    for seed in seeds:
        config['seed'] = seed
        pprint(config)
        torch.manual_seed(config['seed'])
        agent = Agent(task.state_size, task.action_size, **config)

        data_logger = TensorboardLogger(
            log_dir=f'runs/MultiExp-{task.name}-i{idx}-s{seed}')
        env_runner = EnvRunner(task, agent, data_logger=data_logger)
        env_runner.seed(seed)
        env_runner.run(reward_goal=99999,
                       max_episodes=500,
                       eps_decay=0.95,
                       force_new=True)
        data_logger.close()

Example #26

class SageMakerExecutor:

    _logger = logging.getLogger("SageMakerExecutor")

    def __init__(self,
                 env_name,
                 agent_name: str,
                 hyperparameters: Optional[Hyperparameters] = None):
        self._logger.info(
            "Initiating SageMakerExecutor with env_name '%s' and agent '%s'",
            env_name, agent_name)

        env = gym.make(env_name)
        self.task = GymTask(env, env_name)
        agent = None
        if agent_name.upper() == "DQN":
            from ai_traineree.agents.dqn import DQNAgent
            agent = DQNAgent
        elif agent_name.upper() == "PPO":
            from ai_traineree.agents.ppo import PPOAgent
            agent = PPOAgent
        elif agent_name.upper() == "DDPG":
            from ai_traineree.agents.ddpg import DDPGAgent
            agent = DDPGAgent
        else:
            self._logger.warning(
                "No agent provided. You're given a PPO agent.")
            from ai_traineree.agents.ppo import PPOAgent
            agent = PPOAgent

        hyperparameters = hyperparameters or {}  # guard: the signature allows None, but .get() below requires a dict
        self.max_iterations = int(hyperparameters.get("max_iterations", 10000))
        self.max_episodes = int(hyperparameters.get("max_episodes", 1000))
        self.log_every = int(hyperparameters.get("log_every", 10))
        self.score_goal = int(hyperparameters.get("score_goal", 100))

        self.eps_start: float = float(hyperparameters.get('eps_start', 1.0))
        self.eps_end: float = float(hyperparameters.get('eps_end', 0.02))
        self.eps_decay: float = float(hyperparameters.get('eps_decay', 0.995))

        self.agent: AgentType = agent(self.task.state_size,
                                      self.task.action_size,
                                      config=hyperparameters)

        self.env_runner = EnvRunner(self.task,
                                    self.agent,
                                    max_iterations=self.max_iterations)

    def run(self) -> None:
        self._logger.info("Running model '%s' for env '%s'", self.agent.name,
                          self.task.name)
        self.env_runner.run(
            reward_goal=self.score_goal,
            max_episodes=self.max_episodes,
            eps_start=self.eps_start,
            eps_end=self.eps_end,
            eps_decay=self.eps_decay,
            log_every=self.log_every,
        )

    def save_results(self, path):
        self._logger.info("Saving the model to path %s", path)
        self.agent.save_state(path)
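
A short usage sketch for the executor above; the environment name, hyperparameter values, and output path are illustrative, not the library's defaults:

# Illustrative usage; adjust env/agent names and hyperparameters to your setup.
hyperparameters = {
    "max_episodes": 200,
    "score_goal": 195,
    "eps_decay": 0.99,
}
executor = SageMakerExecutor("CartPole-v1", "DQN", hyperparameters=hyperparameters)
executor.run()
executor.save_results("/opt/ml/model")  # conventional SageMaker model output dir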

Example #27

import numpy as np
import pylab as plt

from ai_traineree.env_runner import EnvRunner
from ai_traineree.agents.dqn import DQNAgent
from ai_traineree.tasks import GymTask
from ai_traineree.types import TaskType

env_name = 'LunarLander-v2'
task: TaskType = GymTask(env_name)
config = {'batch_size': 64}
agent = DQNAgent(task.state_size, task.action_size, config=config)
env_runner = EnvRunner(task, agent)

env_runner.interact_episode(0, render=True)
scores = env_runner.run(50, 800, eps_start=1.0, eps_end=0.05, eps_decay=0.995)
env_runner.interact_episode(0, render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)
plt.show()

Example #28

config = {
    "update_freq": 10,
    "batch_size": 100,
    "warm_up": 100,
    "lr": 1e-4,
    "network_fn": lambda: QNetwork2D(state_size, task.action_size, hidden_layers=(200, 200)),
    "state_transform": agent_state_tranform,
}
agent = DQNAgent(state_size, task.action_size, **config)
env_runner = EnvRunner(task, agent, max_iterations=2000, writer=writer)

scores = env_runner.run(reward_goal=500,
                        max_episodes=1000,
                        log_every=1,
                        eps_start=0.99,
                        gif_every_episodes=100)
env_runner.interact_episode(render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)

Example #29

task: TaskType = GymTask(env_name)
config = {
    'rollout_length': 30,
    'batch_size': 30,
    "number_updates": 1,
    "ppo_ratio_clip": 0.2,
    "value_loss_weight": 2,
    "entropy_weight": 0.0005,
    "gamma": 0.98,
    "action_scale": 2,
    "max_grad_norm_actor": 2.0,
    "max_grad_norm_critic": 2.0,
    "critic_lr": 1e-3,
    "actor_lr": 1e-3,
}
agent = Agent(task.state_size, task.action_size, hidden_layers=(300, 300), config=config)
env_runner = EnvRunner(task, agent)

env_runner.interact_episode(0, render=True)
scores = env_runner.run(80, 4000)
env_runner.interact_episode(0, render=True)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(f'{env_name}.png', dpi=120)
plt.show()

Example #30

from ai_traineree.env_runner import EnvRunner
from ai_traineree.loggers import TensorboardLogger
from ai_traineree.tasks import GymTask

import numpy as np
import pylab as plt


def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)
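# Illustrative check (not part of the original script):
#   running_mean([1, 2, 3, 4], N=2) -> array([1.5, 2.5, 3.5])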


env_name = 'CartPole-v1'
task = GymTask(env_name)
data_logger = TensorboardLogger()

agent = Agent(task.state_size, task.action_size, device='cpu')
env_runner = EnvRunner(task, agent, data_logger=data_logger)

scores = env_runner.run(reward_goal=100,
                        max_episodes=500,
                        eps_decay=0.9,
                        force_new=True)
env_runner.interact_episode(render=True)
data_logger.close()

avg_length = 100
ma = running_mean(scores, avg_length)
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(range(len(scores)), scores)
plt.plot(range(avg_length, avg_length + len(ma)), ma)