Example #1
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    experiment_name = args.env_name + '-' + args.algo + '-' + datetime.datetime.now(
    ).strftime("%Y-%m-%d-%H-%M-%S-%f")
    log_dir, eval_log_dir, save_dir = setup_dirs(experiment_name, args.log_dir,
                                                 args.save_dir)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         log_dir,
                         args.add_timestep,
                         device,
                         False,
                         frame_skip=args.frame_skip)

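    # Optionally resume from a checkpoint saved as (actor_critic, ob_rms); the
    # stored observation-normalization stats are restored into the vec env.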
    if args.load_path:
        actor_critic, _ob_rms = torch.load(args.load_path)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.train()
            vec_norm.ob_rms = _ob_rms
        actor_critic.train()
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              beta=args.beta_dist,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               lr_schedule=args.lr_schedule,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         lr_schedule=args.lr_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         beta=args.sil_beta,
                         value_loss_coef=args.sil_value_loss_coef,
                         entropy_coef=args.sil_entropy_coef)
        replay = ReplayStorage(10000,
                               num_processes=args.num_processes,
                               gamma=args.gamma,
                               prio_alpha=args.sil_alpha,
                               obs_shape=envs.observation_space.shape,
                               action_space=envs.action_space,
                               recurrent_hidden_state_size=actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

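    # Precompute action bounds on the device so sampled continuous actions can
    # be rescaled and clipped cheaply inside the rollout loop.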
    action_high = torch.from_numpy(envs.action_space.high).to(device)
    action_low = torch.from_numpy(envs.action_space.low).to(device)
    action_mid = 0.5 * (action_high + action_low)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    benchmark_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions from the current policy
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            if args.clip_action and isinstance(envs.action_space,
                                               gym.spaces.Box):
                clipped_action = action.clone()
                if args.shift_action:
                    # FIXME experimenting with this; so far it results in
                    # faster learning when clipping gaussian continuous
                    # output (vs leaving it centred at 0 and unscaled)
                    clipped_action = 0.5 * clipped_action + action_mid
                clipped_action = torch.max(
                    torch.min(clipped_action, action_high), action_low)
            else:
                clipped_action = action

            # act in environment and observe
            obs, reward, done, infos = envs.step(clipped_action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    if 'rb' in info['episode']:
                        benchmark_rewards.append(info['episode']['rb'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

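        # Bootstrap the value of the last observation before computing
        # returns / GAE.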
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        train_eprew = np.mean(episode_rewards)
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), train_eprew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end='')
            if len(benchmark_rewards):
                print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format(
                    np.mean(benchmark_rewards), np.median(benchmark_rewards),
                    np.min(benchmark_rewards), np.max(benchmark_rewards)),
                      end='')
            print()

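        # Periodically evaluate the deterministic policy in freshly created
        # envs that reuse the training observation-normalization statistics.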
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                clipped_action = action
                if args.clip_action and isinstance(envs.action_space,
                                                   gym.spaces.Box):
                    if args.shift_action:
                        clipped_action = 0.5 * clipped_action + action_mid
                    clipped_action = torch.max(
                        torch.min(clipped_action, action_high), action_low)

                obs, reward, done, infos = eval_envs.step(clipped_action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            eval_eprew = np.mean(eval_episode_rewards)
            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), eval_eprew))

        if (len(episode_rewards) and j % args.save_interval == 0
                and save_dir != ""):
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            ep_rewstr = ("%d" % train_eprew).replace("-", "n")
            save_filename = os.path.join(
                save_dir, 'checkpoint-%d-%s.pt' % (j, ep_rewstr))

            torch.save(save_model, save_filename)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, args.env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass
Example #2
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    train_envs = make_vec_envs(args.env_name,
                               args.seed,
                               args.num_processes,
                               args.gamma,
                               args.no_norm,
                               args.num_stack,
                               args.log_dir,
                               args.add_timestep,
                               device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_seed = args.seed if args.seed is None else args.seed + args.num_processes
        eval_envs = make_vec_envs(args.env_name,
                                  eval_seed,
                                  args.num_processes // 4,
                                  args.gamma,
                                  args.no_norm,
                                  args.num_stack,
                                  eval_log_dir,
                                  args.add_timestep,
                                  device=device,
                                  allow_early_resets=True,
                                  eval=True,
                                  rank_offsest=args.num_processes)

        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    print(train_envs.observation_space.shape)

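    # Hard-coded flag: build the policy with noisy linear layers for exploration.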
    noisy_net = True

    actor_critic = create_policy(
        train_envs.observation_space,
        train_envs.action_space,
        name='basic',
        nn_kwargs={
            #'batch_norm': False if args.algo == 'acktr' else True,
            'recurrent': 'lstm' if args.recurrent_policy else '',
            'hidden_size': 512,
        },
        noisy_net=noisy_net,
        train=True)

    if args.resume and os.path.isfile(args.resume):
        print('Resuming from checkpoint (%s)' % args.resume)
        state_dict, ob_rms = torch.load(args.resume, map_location='cpu')
        actor_critic.load_state_dict(state_dict)

    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               lr_schedule=lr_update_schedule,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         lr_schedule=lr_update_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef
                         or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef
                         or args.entropy_coef)
        replay = ReplayStorage(1e5,
                               args.num_processes,
                               args.gamma,
                               0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
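        # Resample the noisy layers' parameter noise once per update.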
        if noisy_net:
            actor_critic.reset_noise()

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                 device=device)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model.state_dict(),
                getattr(train_envs.venv, 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end=', ' if other_metrics else '\n')
            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'],
                    other_metrics['sil_action_loss']))

        if (args.eval_interval and len(episode_rewards) > 1 and j > 0
                and j % args.eval_interval == 0):
            actor_critic.eval()

            eval_episode_rewards = []
            num_eval_processes = args.num_processes // 4
            obs = eval_envs.reset()
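            # NOTE: the leading dim of 2 presumably holds the LSTM hidden and
            # cell states (the policy above is built with recurrent='lstm')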
            eval_recurrent_hidden_states = torch.zeros(
                2,
                num_eval_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(num_eval_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

            actor_critic.train()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Example #3
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

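    # TensorBoard writer; scalar summaries are added in the logging block of
    # the training loop.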
    if args.vis:
        from torch.utils.tensorboard import SummaryWriter

        save_path = os.path.join(args.save_dir, args.algo)
        try:
            os.makedirs(save_path)
        except OSError:
            pass

        viz = SummaryWriter(
            os.path.join(
                save_path, args.env_name +
                time.strftime("_%d_%b_%H_%M", time.localtime())))

    train_envs = make_vec_envs(args.env_name,
                               args.seed,
                               args.num_processes,
                               args.gamma,
                               args.no_norm,
                               args.num_stack,
                               args.log_dir,
                               args.add_timestep,
                               device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_envs = make_vec_envs(args.env_name,
                                  args.seed + args.num_processes,
                                  args.num_processes,
                                  args.gamma,
                                  args.no_norm,
                                  args.num_stack,
                                  eval_log_dir,
                                  args.add_timestep,
                                  device,
                                  allow_early_resets=True,
                                  eval=True)

        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    # FIXME this is very specific to Pommerman env right now
    actor_critic = create_policy(train_envs.observation_space,
                                 train_envs.action_space,
                                 name='pomm',
                                 nn_kwargs={
                                     'batch_norm': args.algo != 'acktr',
                                     'recurrent': args.recurrent_policy,
                                     'hidden_size': 512,
                                 },
                                 train=True)

    if args.load_path != '':
        # checkpoints below are saved as [state_dict, ob_rms]
        state_dict, _ob_rms = torch.load(args.load_path, map_location='cpu')
        actor_critic.load_state_dict(state_dict)
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               lr_schedule=lr_update_schedule,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         lr_schedule=lr_update_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef
                         or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef
                         or args.entropy_coef)
        replay = ReplayStorage(5e5,
                               args.num_processes,
                               args.gamma,
                               0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                 device=device)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

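        # Periodically checkpoint the model (state_dict plus ob_rms) to save_dir.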
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model.state_dict(),
                getattr(train_envs.venv, 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, dist_entropy {:.5f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end=', ' if other_metrics else '\n')
            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'],
                    other_metrics['sil_action_loss']))

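            # Mirror the console metrics to TensorBoard.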
            if args.vis:
                viz.add_scalar('episode_rewards/mean',
                               np.mean(episode_rewards), total_num_steps)
                viz.add_scalar('episode_rewards/median',
                               np.median(episode_rewards), total_num_steps)
                viz.add_scalar('episode_rewards/min', np.min(episode_rewards),
                               total_num_steps)
                viz.add_scalar('episode_rewards/max', np.max(episode_rewards),
                               total_num_steps)
                viz.add_scalar('train/value_loss', value_loss, total_num_steps)
                viz.add_scalar('train/action_loss', action_loss,
                               total_num_steps)
                viz.add_scalar('train/dist_entropy', dist_entropy,
                               total_num_steps)

        if (args.eval_interval and len(episode_rewards) > 1 and j > 0
                and j % args.eval_interval == 0):
            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
Example #4
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # The actor-critic network
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         beta=args.sil_beta,
                         value_loss_coef=args.sil_value_loss_coef,
                         entropy_coef=args.sil_entropy_coef)
        replay = ReplayStorage(10000,
                               num_processes=args.num_processes,
                               gamma=args.gamma,
                               prio_alpha=args.sil_alpha,
                               obs_shape=envs.observation_space.shape,
                               action_space=envs.action_space,
                               recurrent_hidden_state_size=actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

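    # RolloutStorage holds a single on-policy rollout of num_steps transitions
    # per worker process.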
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value/action loss {:.5f}/{:.5f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

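        # Periodic evaluation: run the policy deterministically in separate
        # eval envs that share the training ob_rms.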
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass