def main():
    args = get_args()
    

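    # Seed NumPy and PyTorch (CPU and all CUDA devices) for reproducible runs.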
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)

    device = torch.device(args.device)
    utils.cleanup_log_dir(args.log_dir)
    env_make = make_pybullet_env(args.task, log_dir=args.log_dir, frame_skip=args.frame_skip)
    envs = make_vec_envs(env_make, args.num_processes, args.log_dir, device, args.frame_stack)
    actor_critic = MetaPolicy(envs.observation_space, envs.action_space)
    loss_writer = LossWriter(args.log_dir,
                             fieldnames=('V_loss', 'action_loss', 'meta_action_loss',
                                         'meta_value_loss', 'meta_loss', 'loss'))

    if args.restart_model:
        actor_critic.load_state_dict(
            torch.load(args.restart_model, map_location=device))

    actor_critic.to(device)

    agent = MetaPPO(
        actor_critic, args.clip_param, args.ppo_epoch,
        args.num_mini_batch, args.value_loss_coef,
        args.entropy_coef, lr=args.lr, eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    obs = envs.reset()
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    rollouts.to(device)  # tensors already live on the device: the env wrapper returns torch tensors

    start = time.time()
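    # Each update consumes num_steps transitions from each of the num_processes envs.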
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

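        # Collect num_steps of experience from the vectorized envs into the rollout buffer.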
        ppo_rollout(args.num_steps, envs, actor_critic, rollouts)

        value_loss, meta_value_loss, action_loss, meta_action_loss, loss, meta_loss = ppo_update(
            agent, actor_critic, rollouts, args.use_gae, args.gamma, args.gae_lambda)
        
        loss_writer.write_row({
            'V_loss': value_loss.item(),
            'action_loss': action_loss.item(),
            'meta_action_loss': meta_action_loss.item(),
            'meta_value_loss': meta_value_loss.item(),
            'meta_loss': meta_loss.item(),
            'loss': loss.item(),
        })
        
        if (j % args.save_interval == 0 or j == num_updates - 1) and args.log_dir != "":
            ppo_save_model(actor_critic, os.path.join(args.log_dir, "model.state_dict"), j)

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            s = "Update {}, num timesteps {}, FPS {} \n".format(
                j, total_num_steps, int(total_num_steps / (time.time() - start)))
            s += "Loss {:.5f}, meta loss {:.5f}, value_loss {:.5f}, meta_value_loss {:.5f}, action_loss {:.5f}, meta action loss {:.5f}".format(
                loss.item(), meta_loss.item(), value_loss.item(), meta_value_loss.item(), action_loss.item(), meta_action_loss.item())
            print(s, flush=True)
Example #2
def main():
    parser = otc_arg_parser()
    # args = get_args()
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    tf_log_dir = os.path.join(log_dir, args.exp_name)
    if not os.path.exists(tf_log_dir):
        os.makedirs(tf_log_dir)
    writer = SummaryWriter(log_dir=tf_log_dir)
    eval_log_dir = log_dir + "_eval"
    # history_file = os.path.join(log_dir, args.exp_name+'.csv')

    torch.set_num_threads(1)
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cuda" if args.cuda else "cpu")

    # envs = make_vec_envs(args.env, args.seed, args.num_processes,
    #                      args.gamma, args.log_dir, device, False)
    envs = make_otc_env(args, device)

    save_path = os.path.join(args.save_dir, args.exp_name)
    if args.load:
        actor_critic, ob_rms = \
                torch.load(
                    os.path.join(save_path, args.env + ".pt"))
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        obs_shape = envs.observation_space.spaces['visual'].shape
        vector_obs_len = envs.observation_space.spaces['vector'].shape[0]
        actor_critic = Policy(obs_shape,
                              envs.action_space,
                              base=CNNBase,
                              base_kwargs={'recurrent': args.recurrent_policy},
                              vector_obs_len=vector_obs_len)
    if torch.cuda.device_count() > 1:
        actor_critic_parallel = nn.DataParallel(actor_critic,
                                                device_ids=[0, 1])
        actor_critic = actor_critic_parallel.module
    if args.half_precision:
        actor_critic.half()  # convert to half precision
        for layer in actor_critic.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()
    actor_critic.to(device, non_blocking=True)
    from pytorch_wrappers import VecPyTorch  #,  VecPyTorchFrameStack
    envs = VecPyTorch(envs, device, half_precision=args.half_precision)
    # envs = VecPyTorchFrameStack(envs, 1, device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              ([envs.vector_obs_len]), envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    if args.half_precision:
        rollouts.half()
    obs, vector_obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.vector_obs[0].copy_(vector_obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)
    episode_floors = deque(maxlen=100)
    episode_times = deque(maxlen=100)
    # history_column_names = ['AgentId', 'Start', 'Seed', 'Floor', 'Reward', 'Steps', 'Time']
    # history_column_types = {'AgentId':np.int, 'Start':np.int, 'Seed':np.int, 'Floor':np.int, 'Reward':np.float, 'Steps':np.int, 'Time':np.float}
    # try:
    #     history_df = pd.read_csv(history_file, dtype={'AgentId':np.int, 'Start': np.int,'Seed':np.int,'Floor': np.int,'Steps':np.int},)
    # except FileNotFoundError:
    #     history_df = pd.DataFrame(columns = history_column_names).astype( dtype=history_column_types)
    #     history_df.to_csv(history_file, encoding='utf-8', index=False)

    start = time.time()
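    # Total updates = total env steps / (steps per rollout * parallel envs).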
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.vector_obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # action_cpu = action.cpu() # send a copy to the cpu

            # Observe reward and next obs
            obs, vector_obs, reward, done, infos = envs.step(action)

            # for i in range(len(action)):
            #     info = infos[i]
            #     # actual_action = action if 'actual_action' not in info.keys() else info['actual_action']
            #     # action[i][0]=int(actual_action)
            #     if 'actual_action' in info.keys() and int(info['actual_action']) != int(action_cpu[i][0]):
            #         action[i][0]=int(info['actual_action'])

            history_is_dirty = False
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_floors.append(int(info['episode']['floor']))
                    episode_times.append(info['episode']['l'])
            #         data = [int(info['episode']['agent']),
            #                 int(info['episode']['start']), int(info['episode']['seed']), int(info['episode']['floor']),
            #                 np.around(info['episode']['r'],6), int(info['episode']['l']), info['episode']['t']]
            #         new_line = pd.DataFrame([data], columns = history_column_names).astype( dtype=history_column_types)
            #         history_df = new_line.append(history_df)
            #         history_is_dirty = True
            # if history_is_dirty:
            #     history_df.to_csv(history_file, encoding='utf-8', index=False)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # [[0.0] if done_ else [1.0] for done_ in done]).to(device)
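            # bad_masks flags episodes ended by a time limit ('bad_transition'),
            # so they are not treated as true terminations when computing returns.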
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            #  for info in infos]).to(device)
            if args.half_precision:
                masks = masks.half()
                bad_masks = bad_masks.half()
            rollouts.insert(obs, vector_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

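        # Bootstrap the return from the value estimate of the final observation.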
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.vector_obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

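            # Relabel the rollout rewards with the discriminator's learned reward.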
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.vector_obs[step],
                    rollouts.actions[step], args.gamma, rollouts.masks[step])

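        # Compute discounted returns (GAE if enabled) for the collected rollout.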
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Save at update {} / timestep {}".format(j, total_num_steps))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env + ".pt"))

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if len(episode_rewards) == 0:
                print(
                    "Updates {}, num timesteps {}, FPS {} \n No completed training episodes yet"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start))))
            else:
                writer.add_scalar('reward',
                                  np.average(episode_rewards),
                                  global_step=total_num_steps)
                writer.add_scalar('floor',
                                  np.average(episode_floors),
                                  global_step=total_num_steps)
                writer.add_scalar('reward.std',
                                  np.std(episode_rewards),
                                  global_step=total_num_steps)
                writer.add_scalar('floor.std',
                                  np.std(episode_floors),
                                  global_step=total_num_steps)
                writer.add_scalar('steps',
                                  np.average(episode_times),
                                  global_step=total_num_steps)
                # writer.add_scalar('median', np.median(episode_rewards), global_step=total_num_steps)
                # writer.add_scalar('min', np.min(episode_rewards), global_step=total_num_steps)
                # writer.add_scalar('max', np.max(episode_rewards), global_step=total_num_steps)
                writer.add_scalar('FPS',
                                  int(total_num_steps / (end - start)),
                                  global_step=total_num_steps)
                writer.add_scalar('value_loss',
                                  np.around(value_loss, 6),
                                  global_step=total_num_steps)
                writer.add_scalar("action_loss:",
                                  np.around(action_loss, 6),
                                  global_step=total_num_steps)
                writer.add_scalar("dist_entropy:",
                                  np.around(dist_entropy, 6),
                                  global_step=total_num_steps)
                print(
                    "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss))
            print("value_loss:", np.around(value_loss, 6), "action_loss:",
                  np.around(action_loss, 6), "dist_entropy:",
                  np.around(dist_entropy, 6))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env, args.seed,
                     args.num_processes, eval_log_dir, device)
Example #3

# construct environment

env = MaximumIndependentSetEnv(max_epi_t=max_epi_t,
                               max_num_nodes=max_num_nodes,
                               hamming_reward_coef=hamming_reward_coef,
                               device=device)

# construct rollout storage
rollout = RolloutStorage(max_t=max_rollout_t,
                         batch_size=rollout_batch_size,
                         num_samples=train_num_samples)

# construct actor critic network
actor_critic = ActorCritic(actor_class=PolicyGraphConvNet,
                           critic_class=ValueGraphConvNet,
                           max_num_nodes=max_num_nodes,
                           hidden_dim=hidden_dim,
                           num_layers=num_layers,
                           device=device)

# construct PPO framework
framework = ProxPolicyOptimFramework(actor_critic=actor_critic,
                                     init_lr=init_lr,
                                     clip_value=clip_value,
                                     optim_num_samples=optim_num_samples,
Example #4
File: update.py  Project: oidelima/ppo
    def update(self, rollouts: RolloutStorage):
        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
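        # Normalize advantages across the batch for more stable policy updates.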
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        logger = collections.Counter()

        for e in range(self.ppo_epoch):
            if self.agent.is_recurrent:
                data_generator = rollouts.recurrent_generator(
                    advantages, self.num_mini_batch
                )
            else:
                data_generator = rollouts.feed_forward_generator(
                    advantages, self.num_mini_batch
                )

            sample: Batch
            for sample in data_generator:
                # Reshape to do in a single forward pass for all steps
                act = self.agent(
                    inputs=sample.obs,
                    rnn_hxs=sample.recurrent_hidden_states,
                    masks=sample.masks,
                    action=sample.actions,
                )
                values = act.value
                action_log_probs = act.action_log_probs
                loss = act.aux_loss
                # log_values = act.log
                # logger.update(**log_values)

                if not self.aux_loss_only:
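                    # PPO clipped surrogate: take the pessimistic minimum of the
                    # unclipped and clipped objectives.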
                    ratio = torch.exp(action_log_probs - sample.old_action_log_probs)
                    surr1 = ratio * sample.adv
                    surr2 = (
                        torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param)
                        * sample.adv
                    )
                    action_loss = -torch.min(surr1, surr2).mean()
                    logger.update(action_loss=action_loss)
                    loss += action_loss

                if self.use_clipped_value_loss:

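                    # Clip the new value prediction around the old one and use the
                    # larger (more pessimistic) of the two squared errors.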
                    value_pred_clipped = sample.value_preds + (
                        values - sample.value_preds
                    ).clamp(-self.clip_param, self.clip_param)
                    value_losses = (values - sample.ret).pow(2)
                    value_losses_clipped = (value_pred_clipped - sample.ret).pow(2)
                    value_loss = (
                        0.5 * torch.max(value_losses, value_losses_clipped).mean()
                    )
                else:
                    value_loss = 0.5 * F.mse_loss(sample.ret, values)
                logger.update(value_loss=value_loss)
                loss += self.value_loss_coef * value_loss

                self.optimizer.zero_grad()
                loss.backward()

                nn.utils.clip_grad_norm_(self.agent.parameters(), self.max_grad_norm)
                self.optimizer.step()

                # noinspection PyTypeChecker
                logger.update(n=1.0)

        n = logger.pop("n", 0)
        return {k: v.mean().item() / n for k, v in logger.items()}
Example #5
def main():

    from config import config_enhanced
    writer = SummaryWriter(os.path.join('runs', name_dir(config_enhanced)))

    torch.multiprocessing.freeze_support()

    print("Current config_enhanced is:")
    pprint(config_enhanced)
    writer.add_text("config", str(config_enhanced))

    save_path = str(writer.get_logdir())
    try:
        os.makedirs(save_path)
    except OSError:
        pass

    # with open(os.path.join(save_path, "config.json"), 'w') as outfile:
    #     json.dump(config_enhanced, outfile)

    torch.manual_seed(config_enhanced['seed'])
    torch.cuda.manual_seed_all(config_enhanced['seed'])

    use_cuda = torch.cuda.is_available()
    if torch.cuda.is_available() and config_enhanced['cuda_deterministic']:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # torch.set_num_threads(1)
    if use_cuda:
        device = torch.device('cuda')
        print("using GPU")
    else:
        device = torch.device('cpu')
        print("using CPU")

    if config_enhanced['num_processes'] == "num_cpu":
        num_processes = multiprocessing.cpu_count() - 1
    else:
        num_processes = config_enhanced['num_processes']

    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #     model = torch.nn.DataParallel(model)

    env = CholeskyTaskGraph(**config_enhanced['env_settings'])
    envs = VectorEnv(env, num_processes)
    envs.reset()

    model = SimpleNet(**config_enhanced["network_parameters"])
    if config_enhanced["model_path"]:
        model.load_state_dict(torch.load(config_enhanced['model_path']))

    actor_critic = Policy(model, envs.action_space, config_enhanced)
    actor_critic = actor_critic.to(device)

    if config_enhanced['agent'] == 'PPO':
        print("using PPO")
        agent_settings = config_enhanced['PPO_settings']
        agent = PPO(
            actor_critic,
            **agent_settings)

    elif config_enhanced['agent'] == 'A2C':
        print("using A2C")
        agent_settings = config_enhanced['A2C_settings']
        agent = A2C_ACKTR(
            actor_critic,
            **agent_settings)

    rollouts = RolloutStorage(config_enhanced['trajectory_length'], num_processes,
                              env.observation_space.shape, env.action_space)

    obs = envs.reset()
    obs = torch.tensor(obs, device=device)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        config_enhanced['num_env_steps']) // config_enhanced['trajectory_length'] // num_processes
    for j in range(num_updates):

        if config_enhanced['use_linear_lr_decay']:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates, config_enhanced['network']['lr'])

        for step in tqdm(range(config_enhanced['trajectory_length'])):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step])
            actions = action.squeeze(-1).detach().cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)
            obs = torch.tensor(obs, device=device)
            reward = torch.tensor(reward, device=device).unsqueeze(-1)
            done = torch.tensor(done, device=device)

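            # Global count of environment steps taken so far (used as the logging step).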
            n_step = (j * config_enhanced['trajectory_length'] + step) * num_processes
            for info in infos:
                if 'episode' in info.keys():
                    reward_episode = info['episode']['r']
                    episode_rewards.append(reward_episode)
                    writer.add_scalar('reward', reward_episode, n_step)
                    writer.add_scalar('solved',
                                      int(info['episode']['length'] == envs.envs[0].max_steps),
                                      n_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1]).detach()

        rollouts.compute_returns(next_value, config_enhanced["use_gae"], config_enhanced["gamma"],
                                 config_enhanced['gae_lambda'], config_enhanced['use_proper_time_limits'])

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        writer.add_scalar('value loss', value_loss, n_step)
        writer.add_scalar('action loss', action_loss, n_step)
        writer.add_scalar('dist_entropy', dist_entropy, n_step)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % config_enhanced['save_interval'] == 0
                or j == num_updates - 1):
            save_path = str(writer.get_logdir())
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save(actor_critic, os.path.join(save_path, "model.pth"))

        if j % config_enhanced['log_interval'] == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, n_step,
                            int(n_step / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards), dist_entropy, value_loss,
                            action_loss))

        if (config_enhanced['evaluate_every'] is not None and len(episode_rewards) > 1
                and j % config_enhanced['evaluate_every'] == 0):
            eval_reward = evaluate(actor_critic, boxworld, config_enhanced, device)
            writer.add_scalar("eval reward", eval_reward, n_step)
Example #7
File: train.py  Project: oidelima/ppo
class TrainBase(abc.ABC):
    def setup(
        self,
        num_steps,
        eval_steps,
        num_processes,
        seed,
        cuda_deterministic,
        cuda,
        time_limit,
        gamma,
        normalize,
        log_interval,
        eval_interval,
        no_eval,
        use_gae,
        tau,
        ppo_args,
        agent_args,
        render,
        render_eval,
        load_path,
        synchronous,
        num_batch,
        env_args,
        success_reward,
        use_tqdm,
    ):
        # Properly restrict pytorch to not consume extra resources.
        #  - https://github.com/pytorch/pytorch/issues/975
        #  - https://github.com/ray-project/ray/issues/3609
        torch.set_num_threads(1)
        os.environ["OMP_NUM_THREADS"] = "1"

        if render_eval and not render:
            eval_interval = 1
        if render or render_eval:
            ppo_args.update(ppo_epoch=0)
            num_processes = 1
            cuda = False

        # reproducibility
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        cuda &= torch.cuda.is_available()
        if cuda and cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        torch.set_num_threads(1)

        self.device = "cpu"
        if cuda:
            self.device = self.get_device()
        # print("Using device", self.device)

        self.envs = self.make_vec_envs(
            **env_args,
            seed=seed,
            gamma=(gamma if normalize else None),
            render=render,
            synchronous=True if render else synchronous,
            evaluation=False,
            num_processes=num_processes,
            time_limit=time_limit,
        )
        self.make_eval_envs = functools.partial(
            self.make_vec_envs,
            **env_args,
            seed=seed,
            gamma=(gamma if normalize else None),
            render=render,
            synchronous=True if render else synchronous,
            evaluation=True,
            num_processes=num_processes,
            time_limit=time_limit,
        )

        self.envs.to(self.device)
        self.agent = self.build_agent(envs=self.envs, **agent_args)
        self.rollouts = RolloutStorage(
            num_steps=num_steps,
            num_processes=num_processes,
            obs_space=self.envs.observation_space,
            action_space=self.envs.action_space,
            recurrent_hidden_state_size=self.agent.recurrent_hidden_state_size,
            use_gae=use_gae,
            gamma=gamma,
            tau=tau,
        )

        # copy to device
        if cuda:
            tick = time.time()
            self.agent.to(self.device)
            self.rollouts.to(self.device)
            print("Values copied to GPU in", time.time() - tick, "seconds")

        self.ppo = PPO(agent=self.agent, num_batch=num_batch, **ppo_args)
        self.counter = Counter()

        self.i = 0
        if load_path:
            self._restore(load_path)

        self.make_train_iterator = lambda: self.train_generator(
            num_steps=num_steps,
            num_processes=num_processes,
            eval_steps=eval_steps,
            log_interval=log_interval,
            eval_interval=eval_interval,
            no_eval=no_eval,
            use_tqdm=use_tqdm,
            success_reward=success_reward,
        )
        self.train_iterator = self.make_train_iterator()

    def _train(self):
        try:
            return next(self.train_iterator)
        except StopIteration:
            self.train_iterator = self.make_train_iterator()
            return self._train()

    def train_generator(
        self,
        num_steps,
        num_processes,
        eval_steps,
        log_interval,
        eval_interval,
        no_eval,
        success_reward,
        use_tqdm,
    ):
        if eval_interval and not no_eval:
            # vec_norm = get_vec_normalize(eval_envs)
            # if vec_norm is not None:
            #     vec_norm.eval()
            #     vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            # self.envs.evaluate()
            eval_masks = torch.zeros(num_processes, 1, device=self.device)
            eval_counter = Counter()
            envs = self.make_eval_envs()
            envs.to(self.device)
            with self.agent.recurrent_module.evaluating(
                    envs.observation_space):
                eval_recurrent_hidden_states = torch.zeros(
                    num_processes,
                    self.agent.recurrent_hidden_state_size,
                    device=self.device,
                )

                eval_result = self.run_epoch(
                    obs=envs.reset(),
                    rnn_hxs=eval_recurrent_hidden_states,
                    masks=eval_masks,
                    num_steps=eval_steps,
                    # max(num_steps, time_limit) if time_limit else num_steps,
                    counter=eval_counter,
                    success_reward=success_reward,
                    use_tqdm=use_tqdm,
                    rollouts=None,
                    envs=envs,
                )
            envs.close()
            eval_result = {f"eval_{k}": v for k, v in eval_result.items()}
        else:
            eval_result = {}
        # self.envs.train()
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        tick = time.time()
        log_progress = None

        if eval_interval:
            eval_iterator = range(self.i % eval_interval, eval_interval)
            if use_tqdm:
                eval_iterator = tqdm(eval_iterator, desc="next eval")
        else:
            eval_iterator = itertools.count(self.i)

        for _ in eval_iterator:
            if self.i % log_interval == 0 and use_tqdm:
                log_progress = tqdm(total=log_interval, desc="next log")
            self.i += 1
            epoch_counter = self.run_epoch(
                obs=self.rollouts.obs[0],
                rnn_hxs=self.rollouts.recurrent_hidden_states[0],
                masks=self.rollouts.masks[0],
                num_steps=num_steps,
                counter=self.counter,
                success_reward=success_reward,
                use_tqdm=False,
                rollouts=self.rollouts,
                envs=self.envs,
            )

            with torch.no_grad():
                next_value = self.agent.get_value(
                    self.rollouts.obs[-1],
                    self.rollouts.recurrent_hidden_states[-1],
                    self.rollouts.masks[-1],
                ).detach()

            self.rollouts.compute_returns(next_value=next_value)
            train_results = self.ppo.update(self.rollouts)
            self.rollouts.after_update()
            if log_progress is not None:
                log_progress.update()
            if self.i % log_interval == 0:
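                # Throughput measured over the last log_interval updates.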
                total_num_steps = log_interval * num_processes * num_steps
                fps = total_num_steps / (time.time() - tick)
                tick = time.time()
                yield dict(tick=tick,
                           fps=fps,
                           **epoch_counter,
                           **train_results,
                           **eval_result)

    def run_epoch(
        self,
        obs,
        rnn_hxs,
        masks,
        num_steps,
        counter,
        success_reward,
        use_tqdm,
        rollouts,
        envs,
    ):
        # noinspection PyTypeChecker
        episode_counter = defaultdict(list)
        iterator = range(num_steps)
        if use_tqdm:
            iterator = tqdm(iterator, desc="evaluating")
        for _ in iterator:
            with torch.no_grad():
                act = self.agent(inputs=obs, rnn_hxs=rnn_hxs,
                                 masks=masks)  # type: AgentValues

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(act.action)
            self.process_infos(episode_counter, done, infos, **act.log)

            # track rewards
            counter["reward"] += reward.numpy()
            counter["time_step"] += np.ones_like(done)
            episode_rewards = counter["reward"][done]
            episode_counter["rewards"] += list(episode_rewards)
            if success_reward is not None:
                # noinspection PyTypeChecker
                episode_counter["success"] += list(
                    episode_rewards >= success_reward)

            episode_counter["time_steps"] += list(counter["time_step"][done])
            counter["reward"][done] = 0
            counter["time_step"][done] = 0

            # If done then clean the history of observations.
            masks = torch.tensor(1 - done,
                                 dtype=torch.float32,
                                 device=obs.device).unsqueeze(1)
            rnn_hxs = act.rnn_hxs
            if rollouts is not None:
                rollouts.insert(
                    obs=obs,
                    recurrent_hidden_states=act.rnn_hxs,
                    actions=act.action,
                    action_log_probs=act.action_log_probs,
                    values=act.value,
                    rewards=reward,
                    masks=masks,
                )

        return dict(episode_counter)

    @staticmethod
    def process_infos(episode_counter, done, infos, **act_log):
        for d in infos:
            for k, v in d.items():
                episode_counter[k] += v if type(v) is list else [float(v)]
        for k, v in act_log.items():
            episode_counter[k] += v if type(v) is list else [float(v)]

    @staticmethod
    def build_agent(envs, **agent_args):
        return Agent(envs.observation_space.shape, envs.action_space,
                     **agent_args)

    @staticmethod
    def make_env(env_id, seed, rank, add_timestep, time_limit, evaluation):
        env = gym.make(env_id)
        is_atari = hasattr(gym.envs, "atari") and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        env.seed(seed + rank)
        obs_shape = env.observation_space.shape
        if add_timestep and len(
                obs_shape) == 1 and str(env).find("TimeLimit") > -1:
            env = AddTimestep(env)
        if is_atari and len(env.observation_space.shape) == 3:
            env = wrap_deepmind(env)

        # elif len(env.observation_space.shape) == 3:
        #     raise NotImplementedError(
        #         "CNN models work only for atari,\n"
        #         "please use a custom wrapper for a custom pixel input env.\n"
        #         "See wrap_deepmind for an example.")

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env)

        if time_limit is not None:
            env = TimeLimit(env, max_episode_steps=time_limit)

        return env

    def make_vec_envs(
        self,
        num_processes,
        gamma,
        render,
        synchronous,
        env_id,
        add_timestep,
        seed,
        evaluation,
        time_limit,
        num_frame_stack=None,
        **env_args,
    ):
        envs = [
            functools.partial(  # thunk
                self.make_env,
                rank=i,
                env_id=env_id,
                add_timestep=add_timestep,
                seed=seed,
                evaluation=evaluation,
                time_limit=time_limit,
                **env_args,
            ) for i in range(num_processes)
        ]

        if len(envs) == 1 or sys.platform == "darwin" or synchronous:
            envs = DummyVecEnv(envs, render=render)
        else:
            envs = SubprocVecEnv(envs)

        # if (
        # envs.observation_space.shape
        # and len(envs.observation_space.shape) == 1
        # ):
        # if gamma is None:
        # envs = VecNormalize(envs, ret=False)
        # else:
        # envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack)
        # elif len(envs.observation_space.shape) == 3:
        #     envs = VecPyTorchFrameStack(envs, 4, device)

        return envs

    def _save(self, checkpoint_dir):
        modules = dict(optimizer=self.ppo.optimizer,
                       agent=self.agent)  # type: Dict[str, torch.nn.Module]
        # if isinstance(self.envs.venv, VecNormalize):
        #     modules.update(vec_normalize=self.envs.venv)
        state_dict = {
            name: module.state_dict()
            for name, module in modules.items()
        }
        save_path = Path(
            checkpoint_dir,
            f"{self.i if self.save_separate else 'checkpoint'}.pt")
        torch.save(dict(step=self.i, **state_dict), save_path)
        print(f"Saved parameters to {save_path}")
        return str(save_path)

    def _restore(self, checkpoint):
        load_path = checkpoint
        state_dict = torch.load(load_path, map_location=self.device)
        self.agent.load_state_dict(state_dict["agent"])
        self.ppo.optimizer.load_state_dict(state_dict["optimizer"])
        self.i = state_dict.get("step", -1) + 1
        # if isinstance(self.envs.venv, VecNormalize):
        #     self.envs.venv.load_state_dict(state_dict["vec_normalize"])
        print(f"Loaded parameters from {load_path}.")

    @abc.abstractmethod
    def get_device(self):
        raise NotImplementedError