Example #1

import os
import time
from collections import deque

import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter  # or tensorboardX, depending on the project

# Project-local helpers used below (exact module paths depend on the repo layout):
# otc_arg_parser, make_otc_env, get_vec_normalize, Policy, CNNBase,
# RolloutStorage, evaluate, algo, gail, utils

def main():
    parser = otc_arg_parser()
    # args = get_args()
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

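    # TensorBoard logging goes into a per-experiment subdirectory of the base log dir.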
    log_dir = os.path.expanduser(args.log_dir)
    tf_log_dir = os.path.join(log_dir, args.exp_name)
    if not os.path.exists(tf_log_dir):
        os.makedirs(tf_log_dir)
    writer = SummaryWriter(log_dir=tf_log_dir)
    eval_log_dir = log_dir + "_eval"
    # history_file = os.path.join(log_dir, args.exp_name+'.csv')

    torch.set_num_threads(1)
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cuda" if args.cuda else "cpu")

    # envs = make_vec_envs(args.env, args.seed, args.num_processes,
    #                      args.gamma, args.log_dir, device, False)
    envs = make_otc_env(args, device)

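    # Either resume a saved policy (with its observation-normalization stats) or build a
    # new one from the dict observation space (visual frames plus an auxiliary vector).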
    save_path = os.path.join(args.save_dir, args.exp_name)
    if args.load:
        actor_critic, ob_rms = \
                torch.load(
                    os.path.join(save_path, args.env + ".pt"))
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        obs_shape = envs.observation_space.spaces['visual'].shape
        vector_obs_len = envs.observation_space.spaces['vector'].shape[0]
        actor_critic = Policy(obs_shape,
                              envs.action_space,
                              base=CNNBase,
                              base_kwargs={'recurrent': args.recurrent_policy},
                              vector_obs_len=vector_obs_len)
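    # Multi-GPU note: only the underlying .module of the DataParallel wrapper is kept,
    # so the policy itself still runs unparallelized in the rollout/update code below.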
    if torch.cuda.device_count() > 1:
        actor_critic_parallel = nn.DataParallel(actor_critic,
                                                device_ids=[0, 1])
        actor_critic = actor_critic_parallel.module
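    # Optional half precision: cast the policy to fp16 but keep BatchNorm layers in fp32.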
    if args.half_precision:
        actor_critic.half()  # convert to half precision
        for layer in actor_critic.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()
    actor_critic.to(device, non_blocking=True)
    from pytorch_wrappers import VecPyTorch  #,  VecPyTorchFrameStack
    envs = VecPyTorch(envs, device, half_precision=args.half_precision)
    # envs = VecPyTorchFrameStack(envs, 1, device)

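    # Build the learner for the selected algorithm; ACKTR reuses the A2C class with acktr=True.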
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

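    # Optional GAIL: a discriminator plus an expert-trajectory loader; its predicted
    # reward later replaces the environment reward.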
    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

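    # Rollout storage for num_steps transitions per process: visual obs, vector obs,
    # actions, rewards, masks and recurrent hidden states.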
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              [envs.vector_obs_len], envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    if args.half_precision:
        rollouts.half()
    obs, vector_obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.vector_obs[0].copy_(vector_obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)
    episode_floors = deque(maxlen=100)
    episode_times = deque(maxlen=100)
    # history_column_names = ['AgentId', 'Start', 'Seed', 'Floor', 'Reward', 'Steps', 'Time']
    # history_column_types = {'AgentId':np.int, 'Start':np.int, 'Seed':np.int, 'Floor':np.int, 'Reward':np.float, 'Steps':np.int, 'Time':np.float}
    # try:
    #     history_df = pd.read_csv(history_file, dtype={'AgentId':np.int, 'Start': np.int,'Seed':np.int,'Floor': np.int,'Steps':np.int},)
    # except FileNotFoundError:
    #     history_df = pd.DataFrame(columns = history_column_names).astype( dtype=history_column_types)
    #     history_df.to_csv(history_file, encoding='utf-8', index=False)

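    # Main training loop: the total number of env steps is split into updates of
    # num_steps per process.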
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.vector_obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # action_cpu = action.cpu() # send a copy to the cpu

            # Observe reward and next obs
            obs, vector_obs, reward, done, infos = envs.step(action)

            # for i in range(len(action)):
            #     info = infos[i]
            #     # actual_action = action if 'actual_action' not in info.keys() else info['actual_action']
            #     # action[i][0]=int(actual_action)
            #     if 'actual_action' in info.keys() and int(info['actual_action']) != int(action_cpu[i][0]):
            #         action[i][0]=int(info['actual_action'])

            history_is_dirty = False
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_floors.append(int(info['episode']['floor']))
                    episode_times.append(info['episode']['l'])
            #         data = [int(info['episode']['agent']),
            #                 int(info['episode']['start']), int(info['episode']['seed']), int(info['episode']['floor']),
            #                 np.around(info['episode']['r'],6), int(info['episode']['l']), info['episode']['t']]
            #         new_line = pd.DataFrame([data], columns = history_column_names).astype( dtype=history_column_types)
            #         history_df = new_line.append(history_df)
            #         history_is_dirty = True
            # if history_is_dirty:
            #     history_df.to_csv(history_file, encoding='utf-8', index=False)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # [[0.0] if done_ else [1.0] for done_ in done]).to(device)
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            #  for info in infos]).to(device)
            if args.half_precision:
                masks = masks.half()
                bad_masks = bad_masks.half()
            rollouts.insert(obs, vector_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

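        # Bootstrap the value of the final state, used to compute returns (and GAE) below.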
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.vector_obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

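        # GAIL: run extra discriminator epochs as warm-up for the first 10 updates, then
        # overwrite the stored environment rewards with the discriminator's predictions.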
        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.vector_obs[step],
                    rollouts.actions[step], args.gamma, rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
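        # The checkpoint bundles the policy and the running observation-normalization
        # statistics in a single .pt file.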
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Save at update {} / timestep {}".format(j, total_num_steps))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env + ".pt"))

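        # Periodic logging: TensorBoard scalars and a console summary over the last 100 episodes.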
        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if len(episode_rewards) == 0:
                print(
                    "Updates {}, num timesteps {}, FPS {} \n No completed training episodes yet"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start))))
            else:
                writer.add_scalar('reward',
                                  np.average(episode_rewards),
                                  global_step=total_num_steps)
                writer.add_scalar('floor',
                                  np.average(episode_floors),
                                  global_step=total_num_steps)
                writer.add_scalar('reward.std',
                                  np.std(episode_rewards),
                                  global_step=total_num_steps)
                writer.add_scalar('floor.std',
                                  np.std(episode_floors),
                                  global_step=total_num_steps)
                writer.add_scalar('steps',
                                  np.average(episode_times),
                                  global_step=total_num_steps)
                # writer.add_scalar('median', np.median(episode_rewards), global_step=total_num_steps)
                # writer.add_scalar('min', np.min(episode_rewards), global_step=total_num_steps)
                # writer.add_scalar('max', np.max(episode_rewards), global_step=total_num_steps)
                writer.add_scalar('FPS',
                                  int(total_num_steps / (end - start)),
                                  global_step=total_num_steps)
                writer.add_scalar('value_loss',
                                  np.around(value_loss, 6),
                                  global_step=total_num_steps)
                writer.add_scalar("action_loss",
                                  np.around(action_loss, 6),
                                  global_step=total_num_steps)
                writer.add_scalar("dist_entropy",
                                  np.around(dist_entropy, 6),
                                  global_step=total_num_steps)
                print(
                    "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards)))
            print("value_loss:", np.around(value_loss, 6), "action_loss:",
                  np.around(action_loss, 6), "dist_entropy:",
                  np.around(dist_entropy, 6))

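        # Periodic evaluation in separate environments using the current normalization statistics.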
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env, args.seed,
                     args.num_processes, eval_log_dir, device)
Example #2

import os

import torch

# Project-local helpers used below (exact module paths depend on the repo layout):
# otc_arg_parser, make_otc_env, get_vec_normalize, is_grading, done_grading

def main():
    parser = otc_arg_parser()
    # args = get_args()
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

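    # Scoring/evaluation run: always load a checkpoint and use a single environment process.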
    args.score = True
    args.load = True
    args.num_processes = 1

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    # utils.cleanup_log_dir(log_dir)
    # utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    # Scoring always runs on the CPU, regardless of CUDA availability.
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    # envs = make_vec_envs(args.env, args.seed, args.num_processes,
    #                      args.gamma, args.log_dir, device, False)
    envs = make_otc_env(args, device, start_index=258)

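    # Load the saved actor-critic and observation-normalization stats onto the CPU.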
    save_path = os.path.join(args.save_dir, args.exp_name)
    actor_critic, ob_rms = \
            torch.load(
                os.path.join(save_path, args.env + ".pt"),
                map_location='cpu')
    actor_critic.to(device)
    from pytorch_wrappers import VecPyTorch  #,  VecPyTorchFrameStack
    envs = VecPyTorch(envs, device)
    # envs = VecPyTorchFrameStack(envs, 1, device)
    env = envs  #[0]

    vec_norm = get_vec_normalize(env)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    recurrent_hidden_states = torch.zeros(
        1, actor_critic.recurrent_hidden_state_size)
    masks = torch.zeros(1, 1)

    obs, vector_obs = env.reset()

    episode_reward = 0
    episode_rewards = []
    total_episodes = 0
    max_level = 0
    max_levels = []

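    # Roll out episodes with the loaded policy, tracking the per-episode reward and
    # the number of floors completed.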
    while True:
        with torch.no_grad():
            value, action, _, recurrent_hidden_states = actor_critic.act(
                obs,
                vector_obs,
                recurrent_hidden_states,
                masks,
                deterministic=args.cuda_deterministic)

        # Observe reward and next obs
        obs, vector_obs, reward, done, _ = env.step(action)

        masks.fill_(0.0 if done else 1.0)
        reward = float(reward.mean())
        episode_reward += reward
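        # A reward of 1 is treated as a completed floor (Obstacle Tower's floor-completion reward).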
        if reward == 1:
            max_level += 1
        # elif reward > 0:
        #     print ('reward:', reward)
        if done:
            episode_rewards.append(episode_reward)
            ave_reward = sum(episode_rewards) / len(episode_rewards)
            total_episodes += 1
            max_levels.append(max_level)
            ave_level = sum(max_levels) / len(max_levels)
            print('ep:', total_episodes, 'level:', max_level, 'ave_level:',
                  round(ave_level, 2), 'episode_reward:',
                  round(episode_reward, 2), 'ave_reward', round(ave_reward, 2))
            episode_reward = 0
            max_level = 0
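            # Under the official grading harness, run until grading finishes;
            # otherwise stop after 25 episodes.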
            if is_grading(env):
                if done_grading(env):
                    break
            elif total_episodes >= 25:
                break