Пример #1
0
def main():
    """Train an actor-critic agent (A2C, PPO or ACKTR) on the OTC environment.

    All settings come from the command line through ``otc_arg_parser``.
    The function:

    1. seeds torch and selects the device,
    2. builds the vectorised environment with ``make_otc_env``,
    3. restores a saved ``(actor_critic, ob_rms)`` pair when ``args.load`` is
       set, otherwise creates a fresh ``Policy`` on top of ``CNNBase``,
    4. repeatedly collects ``args.num_steps`` transitions per process,
       computes returns and calls ``agent.update``,
    5. periodically saves a checkpoint, writes TensorBoard scalars, prints a
       summary and runs ``evaluate``.

    Returns:
        None. Results are written to TensorBoard, stdout and checkpoint files.

    Raises:
        AssertionError: if ``args.algo`` is not one of ``a2c``/``ppo``/``acktr``,
            if a recurrent policy is requested together with ACKTR, or if
            GAIL is enabled with a non-flat observation space.
    """
    parser = otc_arg_parser()
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # args.cuda already implies torch.cuda.is_available().
    if args.cuda and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    tf_log_dir = os.path.join(log_dir, args.exp_name)
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(tf_log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=tf_log_dir)
    eval_log_dir = log_dir + "_eval"

    torch.set_num_threads(1)
    device = torch.device("cuda" if args.cuda else "cpu")

    envs = make_otc_env(args, device)

    save_path = os.path.join(args.save_dir, args.exp_name)
    checkpoint_file = os.path.join(save_path, args.env + ".pt")
    if args.load:
        # map_location lets a checkpoint saved on GPU be restored on a
        # CPU-only machine (and vice versa).
        actor_critic, ob_rms = torch.load(checkpoint_file,
                                          map_location=device)
        vec_norm = utils.get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        obs_shape = envs.observation_space.spaces['visual'].shape
        vector_obs_len = envs.observation_space.spaces['vector'].shape[0]
        actor_critic = Policy(obs_shape,
                              envs.action_space,
                              base=CNNBase,
                              base_kwargs={'recurrent': args.recurrent_policy},
                              vector_obs_len=vector_obs_len)
    if torch.cuda.device_count() > 1:
        # NOTE(review): the DataParallel wrapper is discarded right away --
        # ``.module`` is the wrapped policy itself -- so the model is still
        # used un-wrapped below. Confirm whether multi-GPU was intended.
        actor_critic_parallel = nn.DataParallel(actor_critic,
                                                device_ids=[0, 1])
        actor_critic = actor_critic_parallel.module
    if args.half_precision:
        actor_critic.half()  # convert to half precision
        # Batch-norm layers are switched back to float32.
        for layer in actor_critic.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()
    actor_critic.to(device, non_blocking=True)
    from pytorch_wrappers import VecPyTorch
    envs = VecPyTorch(envs, device, half_precision=args.half_precision)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              ([envs.vector_obs_len]), envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    if args.half_precision:
        rollouts.half()
    obs, vector_obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.vector_obs[0].copy_(vector_obs)
    rollouts.to(device)

    # Sliding windows over the most recent finished episodes.
    episode_rewards = deque(maxlen=100)
    episode_floors = deque(maxlen=100)
    episode_times = deque(maxlen=100)

    summary_fmt = (
        "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
    )

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.vector_obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, vector_obs, reward, done, infos = envs.step(action)

            # An 'episode' entry in info marks a finished episode.
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_floors.append(int(info['episode']['floor']))
                    episode_times.append(info['episode']['l'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            if args.half_precision:
                masks = masks.half()
                bad_masks = bad_masks.half()
            rollouts.insert(obs, vector_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        # Bootstrap value for the state following the last stored step.
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.vector_obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the stored rewards with the discriminator's prediction.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.vector_obs[step],
                    rollouts.actions[step], args.gamma, rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th update or for the last update
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            # Only "already exists" is tolerated; other failures (e.g.
            # permissions) surface here instead of inside torch.save.
            os.makedirs(save_path, exist_ok=True)
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Save at update {} / timestep {}".format(j, total_num_steps))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], checkpoint_file)

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            fps = int(total_num_steps / (end - start))
            if not episode_rewards:
                # No episode has finished yet: print zeros for the stats.
                print(summary_fmt.format(j, total_num_steps, fps,
                                         0, 0, 0, 0, 0))
            else:
                writer.add_scalar('reward',
                                  np.average(episode_rewards),
                                  global_step=total_num_steps)
                writer.add_scalar('floor',
                                  np.average(episode_floors),
                                  global_step=total_num_steps)
                writer.add_scalar('reward.std',
                                  np.std(episode_rewards),
                                  global_step=total_num_steps)
                writer.add_scalar('floor.std',
                                  np.std(episode_floors),
                                  global_step=total_num_steps)
                writer.add_scalar('steps',
                                  np.average(episode_times),
                                  global_step=total_num_steps)
                writer.add_scalar('FPS',
                                  fps,
                                  global_step=total_num_steps)
                writer.add_scalar('value_loss',
                                  np.around(value_loss, 6),
                                  global_step=total_num_steps)
                writer.add_scalar("action_loss:",
                                  np.around(action_loss, 6),
                                  global_step=total_num_steps)
                writer.add_scalar("dist_entropy:",
                                  np.around(dist_entropy, 6),
                                  global_step=total_num_steps)
                print(summary_fmt.format(
                    j, total_num_steps, fps,
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards), np.max(episode_rewards)))
            print("value_loss:", np.around(value_loss, 6), "action_loss:",
                  np.around(action_loss, 6), "dist_entropy:",
                  np.around(dist_entropy, 6))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # get_vec_normalize may return None (it is None-checked on the
            # load path above); mirror the guarded access used when saving.
            # NOTE(review): assumes evaluate() accepts ob_rms=None -- confirm.
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env, args.seed,
                     args.num_processes, eval_log_dir, device)

    # Flush pending TensorBoard events before returning.
    writer.close()
Пример #2
0
def main():
    """Train a PPO or A2C agent on the ``CholeskyTaskGraph`` environment.

    Every setting is read from ``config.config_enhanced``. The function
    creates a TensorBoard writer under ``runs/``, builds ``num_processes``
    copies of the environment through ``VectorEnv``, wraps a ``SimpleNet``
    in a ``Policy`` and then alternates between collecting
    ``trajectory_length`` steps and calling ``agent.update``. The policy is
    saved to ``model.pth`` inside the writer's log directory at every
    ``save_interval``-th update and after the last one.

    Returns:
        None. Results go to TensorBoard, stdout and the checkpoint file.

    Raises:
        ValueError: if ``config_enhanced['agent']`` is neither ``'PPO'`` nor
            ``'A2C'``.
    """
    from config import config_enhanced
    writer = SummaryWriter(os.path.join('runs', name_dir(config_enhanced)))

    torch.multiprocessing.freeze_support()

    print("Current config_enhanced is:")
    pprint(config_enhanced)
    writer.add_text("config", str(config_enhanced))

    save_path = str(writer.get_logdir())
    # Tolerate an existing directory only; other OS errors are real problems.
    os.makedirs(save_path, exist_ok=True)

    torch.manual_seed(config_enhanced['seed'])
    torch.cuda.manual_seed_all(config_enhanced['seed'])

    use_cuda = torch.cuda.is_available()
    if use_cuda and config_enhanced['cuda_deterministic']:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if use_cuda:
        device = torch.device('cuda')
        print("using GPU")
    else:
        device = torch.device('cpu')
        print("using CPU")

    if config_enhanced['num_processes'] == "num_cpu":
        # Leave one core free, but never drop to zero workers on a
        # single-core host (that would divide by zero further down).
        num_processes = max(1, multiprocessing.cpu_count() - 1)
    else:
        num_processes = config_enhanced['num_processes']

    env = CholeskyTaskGraph(**config_enhanced['env_settings'])
    envs = VectorEnv(env, num_processes)
    envs.reset()

    model = SimpleNet(**config_enhanced["network_parameters"])
    if config_enhanced["model_path"]:
        # The model is still on the CPU here; map_location lets a checkpoint
        # written on a GPU machine load on a CPU-only one.
        model.load_state_dict(
            torch.load(config_enhanced['model_path'], map_location="cpu"))

    actor_critic = Policy(model, envs.action_space, config_enhanced)
    actor_critic = actor_critic.to(device)

    agent_name = config_enhanced['agent']
    if agent_name == 'PPO':
        print("using PPO")
        agent_settings = config_enhanced['PPO_settings']
        agent = PPO(
            actor_critic,
            **agent_settings)
    elif agent_name == 'A2C':
        print("using A2C")
        agent_settings = config_enhanced['A2C_settings']
        agent = A2C_ACKTR(
            actor_critic,
            **agent_settings)
    else:
        # Fail early instead of hitting an unbound ``agent`` mid-training.
        raise ValueError(
            "Unsupported agent {!r}; expected 'PPO' or 'A2C'".format(
                agent_name))

    # ``env`` is the prototype environment built above; the original code
    # referenced an undefined ``env_example`` here.
    rollouts = RolloutStorage(config_enhanced['trajectory_length'], num_processes,
                              env.observation_space.shape, env.action_space)

    obs = envs.reset()
    obs = torch.tensor(obs, device=device)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Rewards of the most recently finished episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        config_enhanced['num_env_steps']) // config_enhanced['trajectory_length'] // num_processes
    for j in range(num_updates):

        if config_enhanced['use_linear_lr_decay']:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates, config_enhanced['network']['lr'])

        for step in tqdm(range(config_enhanced['trajectory_length'])):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step])
            actions = action.squeeze(-1).detach().cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)
            obs = torch.tensor(obs, device=device)
            reward = torch.tensor(reward, device=device).unsqueeze(-1)
            done = torch.tensor(done, device=device)

            # Environment steps taken so far; used as the TensorBoard x-axis.
            n_step = (j * config_enhanced['trajectory_length'] + step) * num_processes
            for info in infos:
                if 'episode' in info.keys():
                    reward_episode = info['episode']['r']
                    episode_rewards.append(reward_episode)
                    writer.add_scalar('reward', reward_episode, n_step)
                    # Log with the same step as 'reward' so both curves
                    # share an x-axis.
                    writer.add_scalar(
                        'solved',
                        int(info['episode']['length'] == envs.envs[0].max_steps),
                        n_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action,
                            action_log_prob, value, reward, masks, bad_masks)

        # Bootstrap value for the state following the last stored step.
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1]).detach()

        rollouts.compute_returns(next_value, config_enhanced["use_gae"], config_enhanced["gamma"],
                                 config_enhanced['gae_lambda'], config_enhanced['use_proper_time_limits'])

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        writer.add_scalar('value loss', value_loss, n_step)
        writer.add_scalar('action loss', action_loss, n_step)
        writer.add_scalar('dist_entropy', dist_entropy, n_step)

        rollouts.after_update()

        # save for every interval-th update or for the last update
        if (j % config_enhanced['save_interval'] == 0
                or j == num_updates - 1):
            save_path = str(writer.get_logdir())
            os.makedirs(save_path, exist_ok=True)

            torch.save(actor_critic, os.path.join(save_path, "model.pth"))

        if j % config_enhanced['log_interval'] == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, n_step,
                            int(n_step / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards)))

        if (config_enhanced['evaluate_every'] is not None and len(episode_rewards) > 1
                and j % config_enhanced['evaluate_every'] == 0):
            # NOTE(review): ``boxworld`` is not defined anywhere in this
            # function; confirm it exists at module level or pass the
            # intended environment instead.
            eval_reward = evaluate(actor_critic, boxworld, config_enhanced, device)
            writer.add_scalar("eval reward", eval_reward, n_step)

    # Flush pending TensorBoard events before returning.
    writer.close()