Example #1
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N')

args = parser.parse_args()

args.cuda = torch.cuda.is_available()


env = h_env.HockeyEnv(mode=h_env.HockeyEnv.NORMAL)
# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
agent.load_model('full_player_models/sac_actor_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000','full_player_models/sac_critic_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000')
# opponent = copy.deepcopy(agent)
basic_strong = h_env.BasicOpponent(weak=False)
time_ = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# TensorBoard
writer = SummaryWriter(f"strongplay-runs/ERE{time_}_batch_size-{args.batch_size}_gamma-{args.gamma}_tau-{args.tau}_lr-{args.lr}_alpha-{args.alpha}_tuning-{args.automatic_entropy_tuning}_hidden_size-{args.hidden_size}_updatesStep-{args.updates_per_step}_startSteps-{args.start_steps}_targetIntervall-{args.target_update_interval}_replaysize-{args.replay_size}")

# Memory
memory = ERE_PrioritizedReplay(args.replay_size)
# memory = ReplayMemory(args.replay_size,args.seed)
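# Context note: ERE (Emphasizing Recent Experience) replay draws later gradient updates in a round
# from an increasingly recent slice of the buffer, so fresh self-play data is emphasised without
# discarding old transitions. See the ERE_PrioritizedReplay implementation for the exact schedule.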

# Training Loop
total_numsteps = 0
updates = 0


o = env.reset()
# _ = env.render()
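
# The fragment above sets up the agent, opponent, replay memory and writer but stops after
# env.reset(). A minimal sketch of how the interaction loop could continue; the method names
# (select_action, push, update_parameters, __len__) are assumptions modelled on common SAC
# implementations and the argparse flags above, not the verified API of this code base:
for i_episode in range(1, 1001):          # assumed episode budget
    obs = env.reset()
    obs_opponent = env.obs_agent_two()
    episode_reward, done = 0.0, False
    while not done:
        if total_numsteps < args.start_steps:
            a1 = env.action_space.sample()[:4]        # random warm-up actions for our player
        else:
            a1 = agent.select_action(obs)             # assumed SAC API
        a2 = basic_strong.act(obs_opponent)
        next_obs, reward, done, info = env.step(np.hstack([a1, a2]))
        memory.push(obs, a1, reward, next_obs, float(done))   # assumed replay-buffer API
        if len(memory) > args.batch_size:
            for _ in range(args.updates_per_step):
                agent.update_parameters(memory, args.batch_size, updates)   # assumed SAC API
                updates += 1
        obs, obs_opponent = next_obs, env.obs_agent_two()
        episode_reward += reward
        total_numsteps += 1
    writer.add_scalar('reward/train', episode_reward, i_episode)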
Example #2

parser.add_argument('--eval_episodes', help='Set number of evaluation episodes', type=int, default=30)
parser.add_argument('--filename', help='Path to the pretrained model', default=None)
parser.add_argument('--mode', help='Mode for evaluating currently: (shooting | defense)', default='normal')
parser.add_argument('--show', help='Set if want to render training process', action='store_true')
parser.add_argument('--q', help='Quiet mode (no prints)', action='store_true')
parser.add_argument('--opposite', help='Evaluate agent on opposite side', action='store_true')

opts = parser.parse_args()

if __name__ == '__main__':
    if opts.mode == 'normal':
        mode = h_env.HockeyEnv_BasicOpponent.NORMAL
    elif opts.mode == 'shooting':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
    elif opts.mode == 'defense':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown training mode. See --help')

    logger = Logger(prefix_path=os.path.dirname(os.path.realpath(__file__)) + '/logs',
                    mode=opts.mode,
                    quiet=opts.q)
    q_agent = logger.load_model(filename=opts.filename)
    q_agent._config['show'] = opts.show
    q_agent._config['max_steps'] = 250
    q_agent.eval()
    env = h_env.HockeyEnv(mode=mode)
    opponent = h_env.BasicOpponent(weak=False)
    evaluate(agent=q_agent, env=env, opponent=opponent, eval_episodes=opts.eval_episodes,
             action_mapping=q_agent.action_mapping, evaluate_on_opposite_side=opts.opposite)
Example #3

parser.add_argument('--weak',
                    help='Evaluate agent vs weak basic opponent',
                    default=False,
                    action='store_true')

opts = parser.parse_args()

if __name__ == '__main__':
    if opts.mode == 'normal':
        mode = h_env.HockeyEnv_BasicOpponent.NORMAL
    elif opts.mode == 'shooting':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
    elif opts.mode == 'defense':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown training mode. See --help.')

    if opts.filename is None:
        raise ValueError('Parameter --filename must be present. See --help.')

    env = h_env.HockeyEnv(mode=mode)

    agent = SACAgent.load_model(opts.filename)
    agent.eval()
    agent._config['show'] = opts.show
    opponent = h_env.BasicOpponent(weak=opts.weak)
    evaluate(agent,
             env,
             opponent,
             opts.eval_episodes,
             evaluate_on_opposite_side=opts.opposite)
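
# The evaluation scripts above (and the trainers below) call an evaluate(...) helper that is not
# shown. A minimal sketch inferred from the call sites: positional agent/env/opponent/eval_episodes,
# optional quiet, action_mapping and evaluate_on_opposite_side keywords, and a
# (reward, touch, won, lost) return value. The body is an assumption, not the original
# implementation; it assumes numpy is imported as np, and proper side swapping for
# evaluate_on_opposite_side is omitted for brevity.
def evaluate(agent, env, opponent, eval_episodes, quiet=False,
             action_mapping=None, evaluate_on_opposite_side=False):
    rewards, touches, won, lost = [], [], 0, 0
    for _ in range(eval_episodes):
        ob = env.reset()
        ob_opponent = env.obs_agent_two()
        done, ep_reward, ep_touch = False, 0.0, 0
        while not done:
            a1 = agent.act(ob)
            if action_mapping is not None:            # discrete agents return an action id
                a1 = action_mapping[a1]
            a2 = opponent.act(ob_opponent)
            ob, reward, done, info = env.step(np.hstack([a1, a2]))
            ob_opponent = env.obs_agent_two()
            ep_reward += reward
            ep_touch = max(ep_touch, int(info['reward_touch_puck'] > 0))
        rewards.append(ep_reward)
        touches.append(ep_touch)
        won += int(env.winner == 1)
        lost += int(env.winner == -1)
        if not quiet:
            print(f'episode reward {ep_reward:.2f}, winner {env.winner}')
    return np.mean(rewards), np.mean(touches), won / eval_episodes, lost / eval_episodes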
Example #4
        raise ValueError('Unknown training mode. See --help')

    opts.device = torch.device(
        'cuda' if opts.cuda and torch.cuda.is_available() else 'cpu')

    dirname = time.strftime(f'%y%m%d_%H%M%S_{random.randint(0, 999999):06}',
                            time.gmtime(time.time()))
    abs_path = os.path.dirname(os.path.realpath(__file__))
    logger = Logger(prefix_path=os.path.join(abs_path, dirname),
                    mode=opts.mode,
                    cleanup=True,
                    quiet=opts.q)

    env = h_env.HockeyEnv(mode=mode, verbose=(not opts.q))
    opponents = [
        h_env.BasicOpponent(weak=True),
    ]

    # Add absolute paths for pretrained agents
    pretrained_agents = []

    if opts.selfplay:
        for p in pretrained_agents:
            a = SACAgent.load_model(p)
            a.eval()
            opponents.append(a)

    if opts.preload_path is None:
        agent = SACAgent(logger=logger,
                         obs_dim=env.observation_space.shape,
                         action_space=env.action_space,
Example #5
    def train(self, agent, env):
        epsilon = self._config['epsilon']
        epsilon_decay = self._config['epsilon_decay']
        min_epsilon = self._config['min_epsilon']
        episode_counter = 1
        total_step_counter = 0
        total_grad_updates = 0

        beta = self._config['per_beta']
        beta_inc = self._config['per_beta_inc']
        beta_max = self._config['per_beta_max']

        rew_stats = []
        loss_stats = []
        lost_stats = {}
        touch_stats = {}
        won_stats = {}

        eval_stats = {'reward': [], 'touch': [], 'won': [], 'lost': []}

        opponents = [
            h_env.BasicOpponent(weak=True),
            h_env.BasicOpponent(weak=False)
        ]

        while episode_counter <= self._config['max_episodes']:
            if self._config['self_play']:
                opponent = poll_opponent(opponents=opponents)
            else:
                opponent = h_env.BasicOpponent(weak=False)

            ob = env.reset()
            obs_agent2 = env.obs_agent_two()

            if (env.puck.position[0] < 5 and self._config['mode']
                    == 'defense') or (env.puck.position[0] > 5
                                      and self._config['mode'] == 'shooting'):
                continue

            epsilon = max(epsilon - epsilon_decay, min_epsilon)
            if self._config['per']:
                beta = min(beta_max, beta + beta_inc)
                agent.update_per_beta(beta=beta)

            total_reward = 0
            touched = 0
            first_time_touch = 1
            touch_stats[episode_counter] = 0
            won_stats[episode_counter] = 0
            lost_stats[episode_counter] = 0

            for step in range(1, self._config['max_steps'] + 1):
                a1 = agent.act(ob, eps=epsilon)
                a1_list = agent.action_mapping[a1]

                if self._config['mode'] in ['defense', 'normal']:
                    a2 = opponent.act(obs_agent2)
                    # a copy of our agent has been chosen, transform the action id to a list
                    if not isinstance(a2, np.ndarray):
                        a2 = agent.action_mapping[a2]
                elif self._config['mode'] == 'shooting':
                    a2 = [0, 0, 0, 0]
                else:
                    raise NotImplementedError(
                        f'Training for {self._config["mode"]} not implemented.'
                    )

                (ob_new, reward, done,
                 _info) = env.step(np.hstack([a1_list, a2]))

                touched = max(touched, _info['reward_touch_puck'])

                step_reward = reward + 5 * _info['reward_closeness_to_puck'] - (1 - touched) * 0.1 + \
                              touched * first_time_touch * 0.1 * step
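                # Reward shaping (the same pattern recurs in the trainers below): the raw reward is
                # augmented with 5x the closeness-to-puck bonus, a -0.1 penalty for every step before
                # the puck has been touched, and a one-off +0.1*step term at the first touch, which
                # roughly refunds the accumulated no-touch penalty once the agent reaches the puck.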

                first_time_touch = 1 - touched

                total_reward += step_reward

                agent.store_transition((ob, a1, step_reward, ob_new, done))

                if self._config['show']:
                    time.sleep(0.01)
                    env.render()

                if touched > 0:
                    touch_stats[episode_counter] = 1

                if done:
                    won_stats[episode_counter] = 1 if env.winner == 1 else 0
                    lost_stats[episode_counter] = 1 if env.winner == -1 else 0
                    break

                if total_step_counter % self._config['train_every'] == 0 and \
                        total_step_counter > self._config['start_learning_from']:

                    loss_stats.append(agent.train_model())
                    rew_stats.append(total_reward)
                    total_grad_updates += 1

                    if total_grad_updates % self._config['update_target_every'] == 0:
                        agent.update_target_net()

                    if self._config['self_play'] and total_grad_updates % self._config['add_opponent_every'] == 0 and \
                            episode_counter >= self._config['start_self_play_from']:
                        opponents.append(deepcopy(agent))
                        agent.id += 1

                ob = ob_new
                obs_agent2 = env.obs_agent_two()
                total_step_counter += 1

            self.logger.print_episode_info(env.winner, episode_counter, step,
                                           total_reward, epsilon, touched,
                                           opponent)

            if episode_counter % self._config['evaluate_every'] == 0:
                self.logger.info("Evaluating agent")
                agent.eval()
                old_show = agent._config['show']
                agent._config['show'] = False
                rew, touch, won, lost = evaluate(
                    agent=agent,
                    env=env,
                    opponent=h_env.BasicOpponent(weak=False),
                    eval_episodes=self._config['eval_episodes'],
                    quiet=True,
                    action_mapping=agent.action_mapping)
                agent.train()
                agent._config['show'] = old_show

                eval_stats['reward'].append(rew)
                eval_stats['touch'].append(touch)
                eval_stats['won'].append(won)
                eval_stats['lost'].append(lost)
                self.logger.save_model(agent, f'a-{episode_counter}.pkl')

            if total_step_counter > self._config['start_learning_from']:
                agent.step_lr_scheduler()

            episode_counter += 1

        if self._config['show']:
            env.close()

        # Print train stats
        self.logger.print_stats(rew_stats, touch_stats, won_stats, lost_stats)

        self.logger.info('Saving statistics...')

        # Plot reward
        self.logger.plot_running_mean(rew_stats,
                                      'Total reward',
                                      'total-reward.pdf',
                                      show=False)

        # Plot loss
        self.logger.plot_running_mean(loss_stats,
                                      'Loss',
                                      'loss.pdf',
                                      show=False)

        # Plot evaluation stats
        self.logger.plot_intermediate_stats(eval_stats, show=False)

        # Save model
        self.logger.save_model(agent, 'agent.pkl')

        # Save arrays of won-lost stats
        self.logger.save_array(data=eval_stats["won"],
                               filename="eval-won-stats")
        self.logger.save_array(data=eval_stats["lost"],
                               filename="eval-lost-stats")
Example #6
    def train(self, agent, opponents, env, run_evaluation):
        rew_stats, q1_losses, q2_losses, actor_losses, alpha_losses = [], [], [], [], []

        lost_stats, touch_stats, won_stats = {}, {}, {}
        eval_stats = {
            'weak': {
                'reward': [],
                'touch': [],
                'won': [],
                'lost': []
            },
            'strong': {
                'reward': [],
                'touch': [],
                'won': [],
                'lost': []
            }
        }

        episode_counter = 1
        total_step_counter = 0
        grad_updates = 0
        new_op_grad = []
        while episode_counter <= self._config['max_episodes']:
            ob = env.reset()
            obs_agent2 = env.obs_agent_two()

            total_reward, touched = 0, 0
            touch_stats[episode_counter] = 0
            won_stats[episode_counter] = 0
            lost_stats[episode_counter] = 0

            opponent = utils.poll_opponent(opponents)

            first_time_touch = 1
            for step in range(self._config['max_steps']):
                a1 = agent.act(ob)

                if self._config['mode'] == 'defense':
                    a2 = opponent.act(obs_agent2)
                elif self._config['mode'] == 'shooting':
                    a2 = np.zeros_like(a1)
                else:
                    a2 = opponent.act(obs_agent2)

                actions = np.hstack([a1, a2])
                next_state, reward, done, _info = env.step(actions)

                touched = max(touched, _info['reward_touch_puck'])

                step_reward = (
                    reward
                    + 5 * _info['reward_closeness_to_puck']
                    - (1 - touched) * 0.1
                    + touched * first_time_touch * 0.1 * step
                )
                first_time_touch = 1 - touched

                total_reward += step_reward

                agent.store_transition((ob, a1, step_reward, next_state, done))

                if self._config['show']:
                    time.sleep(0.01)
                    env.render()

                if touched > 0:
                    touch_stats[episode_counter] = 1

                if done:
                    won_stats[episode_counter] = 1 if env.winner == 1 else 0
                    lost_stats[episode_counter] = 1 if env.winner == -1 else 0
                    break

                ob = next_state
                obs_agent2 = env.obs_agent_two()
                total_step_counter += 1

            if agent.buffer.size < self._config['batch_size']:
                continue

            for _ in range(self._config['grad_steps']):
                losses = agent.update_parameters(total_step_counter)
                grad_updates += 1

                q1_losses.append(losses[0])
                q2_losses.append(losses[1])
                actor_losses.append(losses[2])
                alpha_losses.append(losses[3])

                # Add trained agent to opponents queue
                if self._config['selfplay']:
                    if grad_updates % self._config['add_self_every'] == 0:
                        new_opponent = SACAgent.clone_from(agent)
                        new_opponent.eval()
                        opponents.append(new_opponent)
                        new_op_grad.append(grad_updates)

            agent.schedulers_step()
            self.logger.print_episode_info(env.winner, episode_counter, step, total_reward)

            if episode_counter % self._config['evaluate_every'] == 0:
                agent.eval()
                for eval_op in ['strong', 'weak']:
                    ev_opponent = opponents[0] if eval_op == 'strong' else h_env.BasicOpponent(False)
                    rew, touch, won, lost = evaluate(
                        agent,
                        env,
                        ev_opponent,
                        100,
                        quiet=True
                    )
                    eval_stats[eval_op]['reward'].append(rew)
                    eval_stats[eval_op]['touch'].append(touch)
                    eval_stats[eval_op]['won'].append(won)
                    eval_stats[eval_op]['lost'].append(lost)
                agent.train()

                self.logger.save_model(agent, f'a-{episode_counter}.pkl')

            rew_stats.append(total_reward)

            episode_counter += 1

        if self._config['show']:
            env.close()

        # Print train stats
        self.logger.print_stats(rew_stats, touch_stats, won_stats, lost_stats)

        self.logger.info('Saving training statistics...')

        # Plot reward
        self.logger.plot_running_mean(data=rew_stats, title='Total reward', filename='total-reward.pdf', show=False)

        # Plot evaluation stats
        self.logger.plot_evaluation_stats(eval_stats, self._config['evaluate_every'], 'evaluation-won-lost.pdf')

        # Plot losses
        for loss, title in zip([q1_losses, q2_losses, actor_losses, alpha_losses],
                               ['Q1 loss', 'Q2 loss', 'Policy loss', 'Alpha loss']):
            self.logger.plot_running_mean(
                data=loss,
                title=title,
                filename=f'{title.replace(" ", "-")}.pdf',
                show=False,
                v_milestones=new_op_grad,
            )

        # Save agent
        self.logger.save_model(agent, 'agent.pkl')

        if run_evaluation:
            agent.eval()
            agent._config['show'] = True
            evaluate(agent, env, h_env.BasicOpponent(weak=False), self._config['eval_episodes'])
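
# SACAgent.clone_from(agent) above is referenced but not defined in these snippets. Functionally it
# has to produce a frozen snapshot of the current agent for the opponent pool; a minimal stand-in
# (an assumption, not the original classmethod) is a deep copy switched to evaluation mode:
from copy import deepcopy

def clone_agent(agent):
    snapshot = deepcopy(agent)
    snapshot.eval()       # snapshots only act as opponents; they are never trained again
    return snapshot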
Example #7

    elif opts.mode == 'shooting':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
    elif opts.mode == 'defense':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown training mode. See --help')

    opts.device = torch.device(
        'cuda' if opts.cuda and torch.cuda.is_available() else 'cpu')
    logger = Logger(prefix_path=os.path.dirname(os.path.realpath(__file__)) +
                    '/logs',
                    mode=opts.mode,
                    cleanup=False,
                    quiet=opts.q)

    opponents = [h_env.BasicOpponent(weak=True)]
    env = h_env.HockeyEnv(mode=mode, verbose=(not opts.q))

    if opts.TD3agent:
        agent = TD3Agent(logger=logger,
                         obs_dim=env.observation_space.shape,
                         action_space=env.action_space,
                         userconfig=vars(opts))
    else:
        agent = DDPGAgent(logger=logger,
                          obs_dim=env.observation_space.shape,
                          action_space=env.action_space,
                          userconfig=vars(opts))

    trainer = DDPGTrainer(logger, vars(opts))
    trainer.train(agent, opponents, env, opts.evaluate)
Example #8
def training_loop(hyperparameters):
    print(f"Starting training with hyperparameters: {hyperparameters}")
    save_path = hyperparameters["save_path"]
    load_path = hyperparameters["load_path"]

    # create the save path and save hyperparameter configuration
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    else:
        a = input("Warning, Directory already exists. Dou want to continue?")
        if a not in ["Y","y"]:
            raise Exception("Path already exists, please start with another path.")

    with open(save_path+ "/parameters.json", "w") as f:
        json.dump(hyperparameters, f)

    # general configurations
    state_dim=18
    action_dim=4
    max_action=1
    iterations=hyperparameters["max_iterations"]
    batch_size=hyperparameters["batch_size"]
    max_episodes=hyperparameters["max_episodes"]
    train_mode = hyperparameters["train_mode"]
    closeness_factor=hyperparameters["closeness_factor"]
    c = closeness_factor

    # init the agent
    agent1 = TD3Agent([state_dim + action_dim, 256, 256, 1],
                        [state_dim, 256, 256, action_dim],
                        optimizer=hyperparameters["optimizer"],
                        policy_noise=hyperparameters["policy_noise"],
                        policy_noise_clip=hyperparameters["policy_noise_clip"],
                        gamma=hyperparameters["gamma"],
                        delay=hyperparameters["delay"],
                        tau=hyperparameters["tau"],
                        lr=hyperparameters["lr"],
                        max_action=max_action,
                        weight_decay=hyperparameters["weight_decay"])
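    # TD3 specifics (context note): policy_noise and policy_noise_clip implement target-policy
    # smoothing (clipped noise added to the target action in the critic update), and delay sets how
    # many critic updates are performed per delayed actor/target update (Fujimoto et al., 2018).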

    # load the agent if given
    loaded_state=False
    if load_path:
        agent1.load(load_path)
        loaded_state=True

    # define opponent
    if hyperparameters["self_play"]:
        agent2=agent1
    else:
        agent2 = h_env.BasicOpponent(weak=hyperparameters["weak_agent"])

    # load environment and replay buffer
    replay_buffer = ReplayBuffer(state_dim, action_dim)

    if train_mode == "defense":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
    elif train_mode == "shooting":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_SHOOTING)
    else:
        env = h_env.HockeyEnv()


    # add figure to plot later
    if hyperparameters["plot_performance"]:
        fig, (ax_loss, ax_reward) = plt.subplots(2)
        ax_loss.set_xlim(0, max_episodes)
        ax_loss.set_ylim(0, 20)
        ax_reward.set_xlim(0, max_episodes)
        ax_reward.set_ylim(-30, 20)

    # first sample enough data to start:
    with HiddenPrints():
        obs_last = env.reset()
        for i in range(batch_size*100):
            a1 = env.action_space.sample()[:4] if not loaded_state else agent1.act(obs_last)  # agent 1 acts on its own observation
            a2 = agent2.act(env.obs_agent_two())
            obs, r, d, info = env.step(np.hstack([a1,a2]))
            done = 1 if d else 0
            replay_buffer.add(obs_last, a1, obs, r, done)
            obs_last=obs
            if d:
                obs_last = env.reset()

    print("Finished collection of data prior to training")

    # tracking of performance
    episode_critic_loss=[]
    episode_rewards=[]
    win_count=[]
    if not os.path.isfile(save_path + "/performance.csv"):
        pd.DataFrame(data={"Episode_rewards":[], "Episode_critic_loss":[], "Win/Loss":[]}).to_csv(save_path + "/performance.csv", sep=",", index=False)

    # Then start training
    for episode_count in range(max_episodes+1):
        obs_last = env.reset()
        total_reward=0
        critic_loss=[]

        for i in range(iterations):
            # run the environment
            with HiddenPrints():
                with torch.no_grad():
                    a1 = agent1.act(obs_last) + np.random.normal(loc=0, scale=hyperparameters["exploration_noise"], size=action_dim)
                a2 = agent2.act(env.obs_agent_two())
                obs, r, d, info = env.step(np.hstack([a1,a2]))
            total_reward+=r
            done = 1 if d else 0

            # modify reward with closeness-to-puck bonus
            if hyperparameters["closeness_decay"]:
                c = closeness_factor *(1 - episode_count/max_episodes)
            newreward = r + c * info["reward_closeness_to_puck"] 

            # add to replaybuffer
            replay_buffer.add(obs_last, a1, obs, newreward, done)
            obs_last=obs
            
            # sample minibatch and train
            states, actions, next_states, reward, done = replay_buffer.sample(batch_size)
            loss = agent1.train(states, actions, next_states, reward, done)
            critic_loss.append(loss.detach().numpy())

            # if done, finish episode
            if d:
                episode_rewards.append(total_reward)
                episode_critic_loss.append(np.mean(critic_loss))
                win_count.append(info["winner"])
                print(f"Episode {episode_count} finished after {i} steps with a total reward of {total_reward}")
                
                # Online plotting
                if hyperparameters["plot_performance"] and episode_count>40 :
                    ax_loss.plot(list(range(-1, episode_count-29)), moving_average(episode_critic_loss, 30), 'r-')
                    ax_reward.plot(list(range(-1, episode_count-29)), moving_average(episode_rewards, 30), "r-")
                    plt.draw()
                    plt.pause(1e-17)

                break
        
        # Intermediate evaluation of win/loss and saving of model
        if episode_count % 500 == 0 and episode_count != 0:
                print(f"The agent's win ratio in the last 500 episodes was {win_count[-500:].count(1)/500}")
                print(f"The agent's loss ratio in the last 500 episodes was {win_count[-500:].count(-1)/500}")
                try:
                    agent1.save(save_path)
                    print("saved model")
                except Exception:
                    print("Saving Failed model failed")
                pd.DataFrame(data={"Episode_rewards": episode_rewards[-500:], "Episode_critic_loss": episode_critic_loss[-500:], "Win/Loss": win_count[-500:]}).to_csv(save_path + "/performance.csv", sep=",", index=False, mode="a", header=False)
                    
    print(f"Finished training with a final mean reward of {np.mean(episode_rewards[-500:])}")





    # plot the performance summary
    if hyperparameters["plot_performance_summary"]:
            try:
                fig, (ax1, ax2) = plt.subplots(2)
                x = list(range(len(episode_critic_loss)))
                coef = np.polyfit(x, episode_critic_loss,1)
                poly1d_fn = np.poly1d(coef)
                ax1.plot(episode_critic_loss)
                ax1.plot(poly1d_fn(list(range(len(episode_critic_loss)))))


                x = list(range(len(episode_rewards)))
                coef = np.polyfit(x, episode_rewards,1)
                poly1d_fn = np.poly1d(coef)
                ax2.plot(episode_rewards)
                ax2.plot(poly1d_fn(list(range(len(episode_rewards)))))
                fig.show()
                fig.savefig(save_path + "/performance.png", bbox_inches="tight")
            except Exception:
                print("Failed to save figure")
Example #9
    def train(self, agent, opponents, env, eval):

        epsilon = self._config['eps']
        epsilon_decay = self._config['epsilon_decay']
        min_epsilon = self._config['min_epsilon']
        iter_fit = self._config['iter_fit']
        episode_counter = 1
        total_step_counter = 0

        rew_stats = []
        loss_stats = []
        lost_stats = {}
        touch_stats = {}
        won_stats = {}
        eval_stats = {'reward': [], 'touch': [], 'won': [], 'lost': []}
        while episode_counter <= self._config['max_episodes']:
            ob = env.reset()
            obs_agent2 = env.obs_agent_two()
            epsilon = max(epsilon_decay * epsilon, min_epsilon)
            total_reward = 0
            touched = 0
            touch_stats[episode_counter] = 0
            won_stats[episode_counter] = 0
            lost_stats[episode_counter] = 0
            opponent = utils.poll_opponent(opponents)

            first_time_touch = 1
            for step in range(self._config['max_steps']):
                if self._config['TD3agent']:
                    a1 = agent.act(ob, noise=self._config['noise'])
                else:
                    a1 = agent.act(ob, eps=epsilon)
                if self._config['mode'] == 'defense':
                    a2 = opponent.act(obs_agent2)
                elif self._config['mode'] == 'shooting':
                    a2 = [0, 0, 0, 0]
                else:
                    a2 = opponent.act(obs_agent2)
                (ob_new, reward, done, _info) = env.step(np.hstack([a1, a2]))
                touched = max(touched, _info['reward_touch_puck'])
                current_reward = (
                    reward
                    + 5 * _info['reward_closeness_to_puck']
                    - (1 - touched) * 0.1
                    + touched * first_time_touch * 0.1 * step
                )

                total_reward += current_reward

                first_time_touch = 1 - touched
                agent.store_transition((ob, a1, current_reward, ob_new, done))

                if self._config['show']:
                    time.sleep(0.01)
                    env.render()

                if touched > 0:
                    touch_stats[episode_counter] = 1

                if done:
                    won_stats[episode_counter] = 1 if env.winner == 1 else 0
                    lost_stats[episode_counter] = 1 if env.winner == -1 else 0
                    break

                ob = ob_new
                obs_agent2 = env.obs_agent_two()
                total_step_counter += 1

            loss_stats.extend(
                agent.train(iter_fit=iter_fit,
                            total_step_counter=episode_counter))

            rew_stats.append(total_reward)

            self.logger.print_episode_info(env.winner, episode_counter, step,
                                           total_reward, epsilon)

            if episode_counter % self._config['evaluate_every'] == 0:
                agent.eval()

                rew, touch, won, lost = evaluate(
                    agent,
                    env,
                    h_env.BasicOpponent(weak=True),
                    self._config['eval_episodes'],
                    quiet=True)
                agent.train_mode()

                eval_stats['reward'].append(rew)
                eval_stats['touch'].append(touch)
                eval_stats['won'].append(won)
                eval_stats['lost'].append(lost)
                self.logger.save_model(agent, f'a-{episode_counter}.pkl')

                self.logger.plot_intermediate_stats(eval_stats, show=False)

            agent.schedulers_step()
            episode_counter += 1

        if self._config['show']:
            env.close()

        # Print train stats
        self.logger.print_stats(rew_stats, touch_stats, won_stats, lost_stats)

        self.logger.info('Saving training statistics...')

        # Plot reward
        self.logger.plot_running_mean(rew_stats,
                                      'Total reward',
                                      'total-reward.pdf',
                                      show=False)

        # Plot evaluation stats
        self.logger.plot_intermediate_stats(eval_stats, show=False)

        # Plot loss
        self.logger.plot_running_mean(loss_stats,
                                      'Loss',
                                      'loss.pdf',
                                      show=False)

        # Save model
        self.logger.save_model(agent, 'agent.pkl')

        # Log evaluation won stats

        print(eval_stats['won'])

        if eval:
            agent.eval()
            agent._config['show'] = True
            evaluate(agent, env, h_env.BasicOpponent(weak=False),
                     self._config['eval_episodes'])
            agent.train_mode()