示例#1
0
def main():
    """Run one training (or rollout-only) session for an actor-critic agent.

    Reads the run configuration from ``get_args()``, seeds PyTorch, creates
    the ``../exps`` directory tree, builds the vectorised environments and the
    policy (a fresh ``Policy``, or one loaded from the models directory when
    ``args.load_id`` is set), and then loops: collect a fixed-length rollout,
    compute returns, optionally update the agent, optionally dump
    experimental data, and periodically save, log and evaluate.

    The parameter update is skipped when ``args.training`` is false, so the
    same loop can be used to generate rollouts from a saved model.
    """
    args = get_args()

    if not args.training:
        print("Warning! Training is turned off!")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Set up the directories and names for saving stuff
    session_name = utils.datetimenow(subseconds=True)
    if args.load_id:
        # IDs are built below as 'm' + model_name + '_s' + session_name, so
        # the model name is whatever sits between the leading character and
        # the first '_s' of the loaded ID.
        loading_id = args.load_id
        cutoff_idx = loading_id.find('_s')
        model_name = loading_id[1:cutoff_idx]
    else:
        # A new model shares its name with the session that created it.
        model_name = session_name

    unique_id = 'm' + model_name + '_s' + session_name

    print("The unique ID for this model and session combination "
          "is %s" % str(unique_id))

    # Make dirs to log experimental data, models
    exp_dir = '../exps'
    data_logs_dir = os.path.join(exp_dir, 'data_logs')
    os.makedirs(data_logs_dir, exist_ok=True)  # also creates exp_dir

    data_logs_dir_uniq = os.path.join(data_logs_dir, unique_id)
    if not os.path.isdir(data_logs_dir_uniq):
        os.makedirs(data_logs_dir_uniq, exist_ok=True)
        print("Data will be logged to %s. " % data_logs_dir_uniq +
              "Ignore the tmp/openai logging statement below.")

    models_dir = os.path.join(exp_dir, 'models')
    if not os.path.isdir(models_dir):
        os.makedirs(models_dir, exist_ok=True)
        print("New model will be saved at %s" % models_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Save args into csv for record keeping
    utils.save_configs_to_csv(args, session_name=session_name,
                              model_name=model_name, unique_id=unique_id)

    # Set up envs and model etc
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, data_logs_dir_uniq, device, False)

    if args.load_id:
        # Load from the same directory that models are saved to, and remap
        # the stored tensors onto the device chosen for this run so that a
        # model saved on GPU can be loaded on a CPU-only machine.
        path = os.path.join(models_dir, str(args.load_id) + '.pt')
        actor_critic = torch.load(path, map_location=device)
    else:
        actor_critic = Policy(
            obs_shape=envs.observation_space.shape,
            action_space=envs.action_space,
            base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C(
            args,
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            args,  # todo change ppo script to include args (which was added
            # to help make the training fully episodic and with overlapping
            # segments)
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)

    # Number of environment steps collected per process in each iteration of
    # the outer loop; shared by the storage size, the inner loop and the
    # step counter used for logging.
    num_steps = 200

    rollouts = RolloutStorage(num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    # The environment name does not change during the run, so decide once
    # whether hidden states are reset after every update.
    reset_hxs_every_episode = "Bandit" in args.env_name

    start = time.time()
    num_episodes = int(10e9)

    for j in range(num_episodes):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_episodes, args.lr)

        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = \
                    actor_critic.act(rollouts.obs[step],
                                     rollouts.recurrent_hidden_states[step],
                                     rollouts.masks[step])

            # Observe reward and next obs
            # envs.render()
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:  # Only added at the end of an episode
                    episode_rewards.append(info['episode']['r'])

            # Envs that do not report 'p_dist' contribute a zero vector.
            p_dists = torch.FloatTensor(
                [info['p_dist'] if 'p_dist' in info else np.zeros(2)
                 for info in infos])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            bad_masks, p_dists)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if args.training:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        if args.save_experimental_data:
            rollouts.save_experimental_data(save_dir=data_logs_dir_uniq)

        rollouts.after_update(reset_hxs_every_episode)

        # save for every interval-th episode or for the last epoch
        if j % args.save_interval == 0 or j == num_episodes - 1:
            torch.save(actor_critic,
                       os.path.join(models_dir, unique_id + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * num_steps
            if args.training:
                # Loss statistics only exist when an update was performed.
                end = time.time()
                print(
                    "Episodes {}, num timesteps {}, FPS {}. Entropy: {:.4f} , Value loss: {:.4f}, Policy loss: {:.4f}, \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)), dist_entropy,
                            value_loss, action_loss,
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards)))
            else:
                print(
                    "Episodes {}, num timesteps {}. \n"
                    .format(j, total_num_steps))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, data_logs_dir_uniq, device)
示例#2
0
def pg(envs, printout, use_gail=False):
    """Train a PPO policy on ``envs``, optionally using GAIL rewards.

    Hyper-parameters, the torch device and the environment name are read from
    the names ``ppo_args``, ``device`` and ``env_name``, which are not defined
    inside this function and must be provided by the enclosing module.

    Args:
        envs: Vectorised environment exposing ``observation_space``,
            ``action_space``, ``reset()`` and ``step()``.
        printout: Callable that receives each progress message as a single
            string.
        use_gail: When true, a discriminator is trained against expert
            trajectories and its predicted reward overwrites the stored
            environment reward before returns are computed.

    Returns:
        None. The policy is written to ``saved_models/ppo<env_name>.pt`` every
        100 updates and after the final update.
    """
    # Loop-invariant settings for checkpointing and progress reporting.
    save_path = 'saved_models'
    save_interval = 100
    log_interval = 10

    if use_gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail_util.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        # NOTE(review): machine-specific absolute path to the expert data.
        file_name = os.path.join(
            '/home/paperspace/repos/pytorch-a2c-ppo-acktr-gail/gail_experts',
            "trajs_reacher.pt")

        gail_train_loader = torch.utils.data.DataLoader(
            gail_util.ExpertDataset(file_name,
                                    num_trajectories=4,
                                    subsample_step=4),
            batch_size=ppo_args.gail_batchsize,
            shuffle=True,
            drop_last=True)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)
    agent = algo.PPO(actor_critic=actor_critic,
                     clip_param=ppo_args.clip_param,
                     ppo_epoch=ppo_args.ppo_epoch,
                     num_mini_batch=ppo_args.num_mb,
                     value_loss_coef=ppo_args.vloss_coef,
                     entropy_coef=ppo_args.entropy_coef,
                     lr=ppo_args.lr,
                     eps=ppo_args.adam_eps,
                     max_grad_norm=.5)

    rollouts = storage.RolloutStorage(ppo_args.num_steps,
                                      ppo_args.num_processes,
                                      envs.observation_space.shape,
                                      envs.action_space,
                                      actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    num_updates = int(
        ppo_args.total_steps) // ppo_args.num_steps // ppo_args.num_processes

    episode_rewards = deque(maxlen=10)
    start = timer()
    for j in range(num_updates):

        utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                     ppo_args.lr)

        for step in range(ppo_args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # No transition is treated as a "bad" (time-limit) transition here.
            bad_masks = torch.ones_like(masks)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        if use_gail:
            if j >= 10:
                envs.venv.eval()

            # The first 10 updates run 100 discriminator epochs as a warm-up.
            gail_epoch = 100 if j < 10 else ppo_args.gail_epoch
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the stored rewards with the discriminator's prediction.
            for step in range(ppo_args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], ppo_args.gamma,
                    rollouts.masks[step])

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, ppo_args.use_gae, ppo_args.gamma,
                                 ppo_args.gae_lambda, ppo_args.time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th update or for the last epoch
        if j % save_interval == 0 or j == num_updates - 1:
            # torch.save does not create missing directories.
            os.makedirs(save_path, exist_ok=True)
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, "ppo" + env_name + ".pt"))

        if j % log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j +
                               1) * ppo_args.num_processes * ppo_args.num_steps
            end = timer()

            printout(
                f'Updates {j}, num timesteps {total_num_steps}, FPS { int(total_num_steps / (end - start))} \n '
                f'Last {len(episode_rewards)} training episodes: mean/median reward {np.mean(episode_rewards):.1f}/{ np.median(episode_rewards):.1f}, '
                f'min/max reward {np.min(episode_rewards):.1f}/{np.max(episode_rewards):.1f}'
            )
def main():
    """Train an A2C / PPO / ACKTR agent with resumable checkpoints.

    Reads the configuration from ``get_args()``, seeds PyTorch, prepares the
    log directories, builds the environments, policy and agent, optionally
    restores the policy and optimizer state through ``load_checkpoint`` when
    ``args.load`` is set, and then runs the update loop from the restored
    epoch. Each iteration collects ``args.num_steps`` steps per process,
    optionally replaces the rewards with GAIL discriminator predictions,
    updates the agent, and periodically saves a checkpoint, logs progress and
    evaluates the policy.

    Raises:
        ValueError: If ``args.algo`` is not one of ``'a2c'``, ``'ppo'`` or
            ``'acktr'``.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # NOTE(review): the environments receive the unexpanded args.log_dir
    # while the cleanup above used the expanded path - confirm that
    # make_vec_envs expands '~' itself.
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    save_path = os.path.join(args.save_dir, args.algo)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here with a clear message instead of hitting an unbound
        # `agent` later in the function.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        # Only drop the last partial batch when there is more than one batch.
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    if args.load:
        actor_critic, agent.optimizer, start_epoch = load_checkpoint(
            actor_critic, agent.optimizer,
            os.path.join(save_path, args.env_name + ".pt"))
        actor_critic = actor_critic.to(device)
        # Move any restored optimizer state tensors onto the run's device.
        for state in agent.optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)
    else:
        start_epoch = 0

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(start_epoch, num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            # The first 10 updates run 100 discriminator epochs as a warm-up.
            gail_epoch = 100 if j < 10 else args.gail_epoch
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the stored rewards with the discriminator's prediction.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            # An existing directory is fine; any other failure (for example
            # a permission error) is reported instead of being swallowed.
            os.makedirs(save_path, exist_ok=True)
            state = {
                'epoch': j + 1,  # epoch to resume from
                'state_dict': actor_critic.state_dict(),
                'optimizer': agent.optimizer.state_dict()
            }
            torch.save(state, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # The entropy and loss values were previously passed to format()
            # without matching placeholders and therefore never printed.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n Entropy {:.4f}, value loss {:.4f}, policy loss {:.4f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
示例#4
0
文件: main.py 项目: laceyg/mila
def main():
    """Run a timed training loop and report metrics through ``exp``.

    Builds the environments, policy, agent and rollout storage, then runs
    ``args.repeat`` timed rounds of ``args.number`` updates each. Every update
    collects ``args.num_steps`` steps per process, computes returns, updates
    the agent and logs the losses via ``exp``. A final report is emitted and
    the environments are closed.

    ``args``, ``device`` and ``exp`` are not defined inside this function and
    must be provided by the enclosing module.
    """
    chrono = exp.chrono()

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # NOTE(review): there is no fallback branch, so an unrecognised args.algo
    # leaves `agent` unbound.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Keeps only the 10 most recent episode returns for the progress log.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    # Only used as the horizon of the linear learning-rate schedule below;
    # the loop bounds come from args.repeat and args.number.
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(args.repeat):
        # Everything inside this block is timed under the 'train' label.
        with chrono.time('train') as t:
            for n in range(args.number):

                # NOTE(review): `j` is the outer repeat index; the schedule,
                # the step count and the log condition below do not account
                # for the inner `n` loop - confirm this is intended.
                if args.use_linear_lr_decay:
                    utils.update_linear_schedule(
                        agent.optimizer, j, num_updates, agent.optimizer.lr
                        if args.algo == "acktr" else args.lr)

                for step in range(args.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                            rollouts.obs[step],
                            rollouts.recurrent_hidden_states[step],
                            rollouts.masks[step])

                    # Observe reward and next obs
                    obs, reward, done, infos = envs.step(action)

                    for info in infos:
                        if 'episode' in info.keys():
                            episode_rewards.append(info['episode']['r'])

                    # If done then clean the history of observations.
                    masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                               for done_ in done])
                    bad_masks = torch.FloatTensor(
                        [[0.0] if 'bad_transition' in info.keys() else [1.0]
                         for info in infos])

                    rollouts.insert(obs, recurrent_hidden_states, action,
                                    action_log_prob, value, reward, masks,
                                    bad_masks)

                # Value estimate for the observation after the last stored
                # step, passed to compute_returns below.
                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                        rollouts.masks[-1]).detach()
                # ---
                rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                         args.gae_lambda,
                                         args.use_proper_time_limits)

                value_loss, action_loss, dist_entropy = agent.update(rollouts)

                exp.log_batch_loss(action_loss)
                exp.log_metric('value_loss', value_loss)

                rollouts.after_update()

                total_num_steps = (j + 1) * args.num_processes * args.num_steps

                if j % args.log_interval == 0 and len(episode_rewards) > 1:
                    end = time.time()
                    # The format string has 8 placeholders; the trailing
                    # dist_entropy, value_loss and action_loss arguments are
                    # ignored by str.format and are not printed.
                    print(
                        "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                        .format(j, total_num_steps,
                                int(total_num_steps / (end - start)),
                                len(episode_rewards), np.mean(episode_rewards),
                                np.median(episode_rewards),
                                np.min(episode_rewards),
                                np.max(episode_rewards), dist_entropy,
                                value_loss, action_loss))

            # -- number
        # -- chrono
        # `t` is the object bound by the `with` statement above.
        exp.show_eta(j, t)
    # -- epoch
    exp.report()
    envs.close()
示例#5
0
def main():
    """Train a policy, optionally with rewards from an imitation discriminator.

    Configuration comes from ``get_args()``.  The function builds the
    vectorised environments, the policy, the agent selected by ``args.algo``
    and, when one of ``args.gail`` / ``args.no_regret_gail`` /
    ``args.cor_gail`` is set, a discriminator trained against an expert
    dataset.  Every update it:

    1. collects ``args.num_steps`` transitions per process,
    2. (imitation modes) updates the discriminator and overwrites the stored
       rewards with the discriminator's predicted rewards,
    3. computes returns and updates the agent (and the correlator in
       ``cor_gail`` mode),
    4. periodically saves, logs and evaluates.

    Raises:
        NotImplementedError: if ``args.algo`` is not ``a2c``, ``ppo`` or
            ``acktr``.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # coinrun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         coin_run_level=args.num_levels,
                         difficulty=args.high_difficulty,
                         coin_run_seed=args.seed)
    if args.env_name in coinrun_envs:
        observation_space_shape = (3, 64, 64)
        # Save the level info in the checkpoint directory name.
        args.save_dir = args.save_dir + "/NUM_LEVELS_{}".format(
            args.num_levels)
    else:
        observation_space_shape = envs.observation_space.shape

    # trained model name
    if args.continue_ppo_training:
        # NOTE(review): torch.load unpickles a full model object; only load
        # checkpoints from a trusted location.
        actor_critic, _ = torch.load(os.path.join(args.check_point,
                                                  args.env_name + ".pt"),
                                     map_location=torch.device(device))
        # This branch previously left `embed_size` / `embeds` undefined, so
        # building the rollout storage below raised NameError.  Use the same
        # values as the plain (non-correlated) policy branch.
        embed_size = 0
        embeds = None
    elif args.cor_gail:
        embed_size = args.embed_size
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              embed_size=embed_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        correlator = Correlator(observation_space_shape,
                                envs.action_space,
                                hidden_dim=args.hidden_size,
                                embed_dim=embed_size,
                                lr=args.lr,
                                device=device)

        correlator.to(device)
        embeds = torch.zeros(1, embed_size)
    else:
        embed_size = 0
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        embeds = None

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_clipped_value_loss=True,
                         ftrl_mode=args.cor_gail or args.no_regret_gail,
                         correlated_mode=args.cor_gail)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here instead of with a NameError on `agent` further down.
        raise NotImplementedError

    imitation_mode = args.gail or args.no_regret_gail or args.cor_gail
    if imitation_mode:
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        # If subsample_frequency is set to a different number, grad_pen
        # might need adjustment.
        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=50,
                                            subsample_frequency=1)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)
        if args.gail:
            discr = gail.Discriminator(observation_space_shape,
                                       envs.action_space,
                                       device=device)
        if args.no_regret_gail or args.cor_gail:
            # Strategy queues: each element is a discriminator strategy ...
            queue = deque(maxlen=args.queue_size)
            # ... respectively an agent strategy.
            agent_queue = deque(maxlen=args.queue_size)
            pruning_frequency = 1
        if args.no_regret_gail:
            discr = regret_gail.NoRegretDiscriminator(observation_space_shape,
                                                      envs.action_space,
                                                      device=device)
        if args.cor_gail:
            discr = cor_gail.CorDiscriminator(observation_space_shape,
                                              envs.action_space,
                                              hidden_size=args.hidden_size,
                                              embed_size=embed_size,
                                              device=device)
        discr.to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              observation_space_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              embed_size)

    obs = envs.reset()

    rollouts.obs[0].copy_(obs)
    if args.cor_gail:
        rollouts.embeds[0].copy_(embeds)
    rollouts.to(device)

    # Only the most recent 10 finished episodes feed the logged statistics.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions # Roll-out
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step], rollouts.embeds[step])

            obs, reward, done, infos = envs.step(action.to('cpu'))
            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            # Sample mediating/correlating actions # Correlated Roll-out
            if args.cor_gail:
                embeds, embeds_log_prob, _ = correlator.act(
                    rollouts.obs[step], rollouts.actions[step])
                rollouts.insert_embedding(embeds, embeds_log_prob)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1], rollouts.embeds[-1]).detach()

        if imitation_mode:
            if args.env_name not in {'CoinRun', 'Random-Mazes'}:
                if j >= 10:
                    envs.venv.eval()

            gail_epoch = args.gail_epoch
            # No warm up in the no-regret and cor_gail cases.
            if args.gail and j < 10:
                gail_epoch = 100  # Warm up

            # The observation filter does not change between discriminator
            # epochs, so look it up once.
            vec_normalize = utils.get_vec_normalize(envs)
            obfilt = vec_normalize._obfilt if vec_normalize else None

            for _ in range(gail_epoch):
                if args.gail:
                    discr.update(gail_train_loader, rollouts, obfilt)

                if args.no_regret_gail or args.cor_gail:
                    last_strategy = discr.update(gail_train_loader, rollouts,
                                                 queue, args.max_grad_norm,
                                                 obfilt, j)

            # Replace the stored rewards by the discriminator's predictions.
            for step in range(args.num_steps):
                if args.gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step])
                if args.no_regret_gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step], queue)
                if args.cor_gail:
                    step_reward, correlator_reward = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step],
                        rollouts.embeds[step], args.gamma,
                        rollouts.masks[step], queue)
                    rollouts.rewards[step] = step_reward
                    rollouts.correlated_reward[step] = correlator_reward

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if not args.gail and (args.no_regret_gail or args.cor_gail):
            value_loss, action_loss, dist_entropy, agent_gains, agent_strategy = \
                agent.mixed_update(rollouts, agent_queue, j)
        else:
            # Plain GAIL, and also the case where no imitation flag is set:
            # previously the latter skipped the agent update entirely and
            # left value_loss / action_loss unbound for the log line below.
            value_loss, action_loss, dist_entropy = agent.update(rollouts, j)

        if args.cor_gail:
            correlator.update(rollouts, agent_gains, args.max_grad_norm)

        if args.no_regret_gail or args.cor_gail:
            queue, _ = utils.queue_update(queue, pruning_frequency,
                                          args.queue_size, j, last_strategy)
            agent_queue, pruning_frequency = utils.queue_update(
                agent_queue, pruning_frequency, args.queue_size, j,
                agent_strategy)

        rollouts.after_update()
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            if not args.cor_gail:
                torch.save([actor_critic, ob_rms],
                           os.path.join(save_path, args.env_name + ".pt"))
            else:
                print("saving models in {}".format(
                    os.path.join(save_path, args.env_name)))
                torch.save(
                    correlator.state_dict(),
                    os.path.join(save_path, args.env_name + "correlator.pt"))
                torch.save([actor_critic.state_dict(), ob_rms],
                           os.path.join(save_path, args.env_name + "actor.pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " value loss/action loss {:.1f}/{}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # get_vec_normalize may return None (the save path above already
            # guards for that), so do not dereference it directly.
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
示例#6
0
def inner_loop_ppo(args, learning_rate, num_steps, num_updates, inst_on,
                   visualize, save_dir):
    """Alternately train a PPO policy and a PPO "instinct" network.

    Two actor-critics act together in the same environments: the policy
    proposes an action, the instinct network (which observes the environment
    observation concatenated with the policy action) proposes its own output,
    and ``policy_instinct_combinator`` merges both into the action that is
    executed.  ``phase_shifter`` decides per update which of the two networks
    is trained; the one being trained is sampled stochastically.  After every
    update the pair is evaluated, metrics are written to TensorBoard and the
    models are saved under ``save_dir``.

    Args:
        args: namespace providing the PPO hyper-parameters read below
            (``gamma``, ``clip_param``, ``ppo_epoch``, ...).
        learning_rate: learning rate used for both PPO agents.
        num_steps: transitions collected per process for each update.
        num_updates: number of collect / update / evaluate iterations.
        inst_on: forwarded to ``evaluate`` as ``instinct_on``.
        visualize: forwarded to ``evaluate`` as ``visualise``.
        save_dir: directory for the TensorBoard log and the saved models.

    Returns:
        ``(last_fitness, 0, 0)`` where ``last_fitness`` is the fitness
        returned by the final ``evaluate`` call.

    Raises:
        IndexError: if ``num_updates`` is 0, because no fitness was recorded.
    """
    torch.set_num_threads(1)
    log_writer = SummaryWriter(save_dir, max_queue=1, filename_suffix="log")
    device = torch.device("cpu")

    env_name = ENV_NAME  # "Safexp-PointGoal1-v0"
    envs = make_vec_envs(env_name,
                         np.random.randint(2**32),
                         NUM_PROC,
                         args.gamma,
                         None,
                         device,
                         allow_early_resets=True,
                         normalize=args.norm_vectors)
    eval_envs = make_vec_envs(env_name,
                              np.random.randint(2**32),
                              1,
                              args.gamma,
                              None,
                              device,
                              allow_early_resets=True,
                              normalize=args.norm_vectors)

    actor_critic_policy = init_default_ppo(envs, log(args.init_sigma))

    # Prepare modified observation shape for instinct: the environment
    # observation with the policy action appended.
    obs_shape = envs.observation_space.shape
    inst_action_space = deepcopy(envs.action_space)
    inst_obs_shape = list(obs_shape)
    inst_obs_shape[0] = inst_obs_shape[0] + envs.action_space.shape[0]
    # Prepare modified action space for instinct: one extra output dimension.
    inst_action_space.shape = list(inst_action_space.shape)
    inst_action_space.shape[0] = inst_action_space.shape[0] + 1
    inst_action_space.shape = tuple(inst_action_space.shape)
    actor_critic_instinct = Policy(tuple(inst_obs_shape),
                                   inst_action_space,
                                   init_log_std=log(args.init_sigma),
                                   base_kwargs={'recurrent': False})
    actor_critic_policy.to(device)
    actor_critic_instinct.to(device)

    agent_policy = algo.PPO(actor_critic_policy,
                            args.clip_param,
                            args.ppo_epoch,
                            args.num_mini_batch,
                            args.value_loss_coef,
                            args.entropy_coef,
                            lr=learning_rate,
                            eps=args.eps,
                            max_grad_norm=args.max_grad_norm)

    agent_instinct = algo.PPO(actor_critic_instinct,
                              args.clip_param,
                              args.ppo_epoch,
                              args.num_mini_batch,
                              args.value_loss_coef,
                              args.entropy_coef,
                              lr=learning_rate,
                              eps=args.eps,
                              max_grad_norm=args.max_grad_norm)

    # One storage per network: the policy learns from rewards, the instinct
    # from the violation cost.
    rollouts_rewards = RolloutStorage(
        num_steps, NUM_PROC, envs.observation_space.shape, envs.action_space,
        actor_critic_policy.recurrent_hidden_state_size)

    rollouts_cost = RolloutStorage(
        num_steps, NUM_PROC, inst_obs_shape, inst_action_space,
        actor_critic_instinct.recurrent_hidden_state_size)

    obs = envs.reset()
    i_obs = torch.cat(
        [obs, torch.zeros((NUM_PROC, envs.action_space.shape[0]))],
        dim=1)  # Add zero action to the observation
    rollouts_rewards.obs[0].copy_(obs)
    rollouts_rewards.to(device)
    rollouts_cost.obs[0].copy_(i_obs)
    rollouts_cost.to(device)

    fitnesses = []
    best_fitness_so_far = float("-Inf")
    is_instinct_training = False
    for j in range(num_updates):
        is_instinct_training_old = is_instinct_training
        is_instinct_training = phase_shifter(
            j, PHASE_LENGTH,
            len(TrainPhases)) == TrainPhases.INSTINCT_TRAIN_PHASE.value
        # The network that is NOT being trained acts deterministically.
        is_instinct_deterministic = not is_instinct_training
        is_policy_deterministic = not is_instinct_deterministic
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_probs, recurrent_hidden_states = actor_critic_policy.act(
                    rollouts_rewards.obs[step],
                    rollouts_rewards.recurrent_hidden_states[step],
                    rollouts_rewards.masks[step],
                    deterministic=is_policy_deterministic)
                instinct_value, instinct_action, instinct_outputs_log_prob, instinct_recurrent_hidden_states = actor_critic_instinct.act(
                    rollouts_cost.obs[step],
                    rollouts_cost.recurrent_hidden_states[step],
                    rollouts_cost.masks[step],
                    deterministic=is_instinct_deterministic,
                )

            # Combine two networks
            final_action, i_control = policy_instinct_combinator(
                action, instinct_action)
            obs, reward, done, infos = envs.step(final_action)
            # envs.render()

            reward, violation_cost = reward_cost_combinator(
                reward, infos, NUM_PROC, i_control)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts_rewards.insert(obs, recurrent_hidden_states, action,
                                    action_log_probs, value, reward, masks,
                                    bad_masks)
            i_obs = torch.cat([obs, action], dim=1)
            rollouts_cost.insert(i_obs, instinct_recurrent_hidden_states,
                                 instinct_action, instinct_outputs_log_prob,
                                 instinct_value, violation_cost, masks,
                                 bad_masks)

        with torch.no_grad():
            next_value_policy = actor_critic_policy.get_value(
                rollouts_rewards.obs[-1],
                rollouts_rewards.recurrent_hidden_states[-1],
                rollouts_rewards.masks[-1]).detach()
            # detach() belongs on the returned value (as for the policy
            # above); it used to be applied to the masks argument instead.
            next_value_instinct = actor_critic_instinct.get_value(
                rollouts_cost.obs[-1],
                rollouts_cost.recurrent_hidden_states[-1],
                rollouts_cost.masks[-1]).detach()

        rollouts_rewards.compute_returns(next_value_policy, args.use_gae,
                                         args.gamma, args.gae_lambda,
                                         args.use_proper_time_limits)
        rollouts_cost.compute_returns(next_value_instinct, args.use_gae,
                                      args.gamma, args.gae_lambda,
                                      args.use_proper_time_limits)

        if not is_instinct_training:
            print("training policy")
            # Policy training phase: the instinct network must stay frozen.
            p_before = deepcopy(agent_instinct.actor_critic)
            value_loss, action_loss, dist_entropy = agent_policy.update(
                rollouts_rewards)
            val_loss_i, action_loss_i, dist_entropy_i = 0, 0, 0
            p_after = deepcopy(agent_instinct.actor_critic)
            # Sanity check on the untouched network; the message used to
            # name the policy although the instinct is what is compared.
            assert compare_two_models(
                p_before, p_after), "instinct changed when it shouldn't"
        else:
            print("training instinct")
            # Instinct training phase: the policy network must stay frozen.
            value_loss, action_loss, dist_entropy = 0, 0, 0
            p_before = deepcopy(agent_policy.actor_critic)
            val_loss_i, action_loss_i, dist_entropy_i = agent_instinct.update(
                rollouts_cost)
            p_after = deepcopy(agent_policy.actor_critic)
            assert compare_two_models(
                p_before, p_after), "policy changed when it shouldn't"

        rollouts_rewards.after_update()
        rollouts_cost.after_update()

        ob_rms = utils.get_vec_normalize(envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms

        # NOTE(review): NUM_PROC is passed although eval_envs was created
        # with a single process -- confirm against evaluate()'s signature.
        fits, info = evaluate(EvalActorCritic(actor_critic_policy,
                                              actor_critic_instinct),
                              ob_rms,
                              eval_envs,
                              NUM_PROC,
                              reward_cost_combinator,
                              device,
                              instinct_on=inst_on,
                              visualise=visualize)
        instinct_reward = info['instinct_reward']
        eval_hazard_collisions = info['hazard_collisions']
        print(
            f"Step {j}, Fitness {fits.item()}, value_loss = {value_loss}, action_loss = {action_loss}, "
            f"dist_entropy = {dist_entropy}")
        print(
            f"Step {j}, Instinct reward {instinct_reward}, value_loss instinct = {val_loss_i}, action_loss instinct= {action_loss_i}, "
            f"dist_entropy instinct = {dist_entropy_i} hazard_collisions = {eval_hazard_collisions}"
        )
        print(
            "-----------------------------------------------------------------"
        )

        # Tensorboard logging
        log_writer.add_scalar("fitness", fits.item(), j)
        log_writer.add_scalar("value loss", value_loss, j)
        log_writer.add_scalar("action loss", action_loss, j)
        log_writer.add_scalar("dist entropy", dist_entropy, j)

        log_writer.add_scalar("cost/instinct_reward", instinct_reward, j)
        log_writer.add_scalar("cost/hazard_collisions", eval_hazard_collisions,
                              j)
        log_writer.add_scalar("value loss instinct", val_loss_i, j)
        log_writer.add_scalar("action loss instinct", action_loss_i, j)
        log_writer.add_scalar("dist entropy instinct", dist_entropy_i, j)

        fitnesses.append(fits)
        # Keep the best-scoring pair of models.
        if fits.item() > best_fitness_so_far:
            best_fitness_so_far = fits.item()
            torch.save(actor_critic_policy, join(save_dir,
                                                 "model_rl_policy.pt"))
            torch.save(actor_critic_instinct,
                       join(save_dir, "model_rl_instinct.pt"))
        # Snapshot both models whenever the training phase switches.
        if is_instinct_training != is_instinct_training_old:
            torch.save(actor_critic_policy,
                       join(save_dir, f"model_rl_policy_update_{j}.pt"))
            torch.save(actor_critic_instinct,
                       join(save_dir, f"model_rl_instinct_update_{j}.pt"))
        torch.save(actor_critic_policy,
                   join(save_dir, "model_rl_policy_latest.pt"))
        torch.save(actor_critic_instinct,
                   join(save_dir, "model_rl_instinct_latest.pt"))

    # Release the event-file handle and the environments; they were
    # previously left open when the function returned.
    log_writer.close()
    envs.close()
    eval_envs.close()
    return (fitnesses[-1]), 0, 0
示例#7
0
def main():
    """Train a policy, optionally on rewards from a pretrained intrinsic module.

    Configuration comes from ``get_args()``.  The run's arguments are dumped
    as JSON to ``command.txt`` in the args directory.  When
    ``args.use_intrinsic`` is set, a pretrained intrinsic-reward model
    (``icm`` or ``vae``, selected by ``args.intrinsic_module``) is loaded and,
    after every rollout, the stored rewards are overwritten with the module's
    intrinsic rewards (optionally divided by a running standard deviation).
    The agent is then updated, and the policy is periodically saved, logged
    and evaluated.

    Raises:
        NotImplementedError: if ``args.algo`` is not ``a2c``, ``ppo`` or
            ``acktr``, or if ``args.use_intrinsic`` is set with an
            ``args.intrinsic_module`` other than ``icm`` / ``vae``.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args_dir, logs_dir, models_dir, samples_dir = get_all_save_paths(
        args, 'pretrain', combine_action=args.combine_action)
    eval_log_dir = logs_dir + "_eval"
    utils.cleanup_log_dir(logs_dir)
    utils.cleanup_log_dir(eval_log_dir)

    _, _, intrinsic_models_dir, _ = get_all_save_paths(args,
                                                       'learn_reward',
                                                       load_only=True)
    # The final checkpoint has no iteration suffix.
    if args.load_iter != 'final':
        intrinsic_model_file_name = os.path.join(
            intrinsic_models_dir,
            args.env_name + '_{}.pt'.format(args.load_iter))
    else:
        intrinsic_model_file_name = os.path.join(intrinsic_models_dir,
                                                 args.env_name + '.pt')
    intrinsic_arg_file_name = os.path.join(args_dir, 'command.txt')

    # save args to arg_file
    with open(intrinsic_arg_file_name, 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, logs_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        raise NotImplementedError

    if args.use_intrinsic:
        rff = RewardForwardFilter(args.gamma)
        intrinsic_rms = RunningMeanStd(shape=())

        # NOTE(review): torch.load unpickles whole model objects; only load
        # checkpoints from a trusted location.
        if args.intrinsic_module == 'icm':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            inverse_model, forward_dynamics_model, encoder = torch.load(
                intrinsic_model_file_name)
            icm = IntrinsicCuriosityModule(envs,
                                           device,
                                           inverse_model,
                                           forward_dynamics_model,
                                           inverse_lr=args.intrinsic_lr,
                                           forward_lr=args.intrinsic_lr)
        elif args.intrinsic_module == 'vae':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            vae = torch.load(intrinsic_model_file_name)
            icm = GenerativeIntrinsicRewardModule(envs,
                                                  device,
                                                  vae,
                                                  lr=args.intrinsic_lr)
        else:
            # Fail here instead of with a NameError on `icm` after the
            # first rollout has been collected.
            raise NotImplementedError

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Only the most recent 10 finished episodes feed the logged statistics.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)
            next_obs = obs

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, next_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.use_intrinsic:
            # Overwrite the stored rewards with intrinsic rewards.
            for step in range(args.num_steps):
                state = rollouts.obs[step]
                action = rollouts.actions[step]
                next_state = rollouts.next_obs[step]
                # The encoder output is only used to compute a reward, so
                # run it under no_grad as well to avoid building an unused
                # autograd graph.
                with torch.no_grad():
                    if args.intrinsic_module == 'icm':
                        state = encoder(state)
                        next_state = encoder(next_state)
                    rollouts.rewards[
                        step], _ = icm.calculate_intrinsic_reward(
                            state, action, next_state, args.lambda_true_action)
            # `standardize` is compared as a string, exactly as before.
            if args.standardize == 'True':
                buf_rews = rollouts.rewards.cpu().numpy()
                intrinsic_rffs = np.array(
                    [rff.update(rew) for rew in buf_rews.T])
                rffs_mean, rffs_std, rffs_count = mpi_moments(
                    intrinsic_rffs.ravel())
                intrinsic_rms.update_from_moments(rffs_mean, rffs_std**2,
                                                  rffs_count)
                # Rewards are scaled by the running std only (no centring).
                std = np.asarray(np.sqrt(intrinsic_rms.var))
                rollouts.rewards = rollouts.rewards / torch.from_numpy(std).to(
                    device)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(models_dir, args.algo)
            policy_file_name = os.path.join(save_path, args.env_name + '.pt')

            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], policy_file_name)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE: the template has fewer placeholders than arguments, so
            # the trailing loss/entropy values are ignored by str.format;
            # the message is kept unchanged.
            print(
                "{} Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(args.env_name, j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # get_vec_normalize may return None (the save path above already
            # guards for that), so do not dereference it directly.
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Train an actor-critic agent, optionally with Comet logging and GAIL.

    Hyper-parameters come from ``get_args()``. The function builds the
    vectorised environments and the policy, selects the algorithm named by
    ``args.algo`` (``a2c``, ``ppo``, ``random`` or ``acktr``) and then
    repeatedly collects ``args.num_steps`` transitions per process before
    calling ``agent.update``. Checkpoints are written to
    ``args.save_dir/args.algo`` and, when a Comet experiment is configured,
    reward / episode-length statistics are logged every
    ``args.log_interval`` updates.

    Raises:
        NotImplementedError: if ``args.algo`` is not a supported name.
    """
    args = get_args()

    # ``args.comet`` is split on "/" into workspace / project name / API key.
    if comet_loaded and len(args.comet) > 0:
        comet_credentials = args.comet.split("/")
        experiment = Experiment(api_key=comet_credentials[2],
                                project_name=comet_credentials[1],
                                workspace=comet_credentials[0])
        for key, value in vars(args).items():
            experiment.log_parameter(key, value)
    else:
        experiment = None

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         args.custom_gym, args.navi)
    base = NaviBase if args.navi else None
    obs_shape = envs.observation_space.shape

    actor_critic = Policy(
        obs_shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy},
        navi=args.navi,
        base=base,
    )
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'random':
        agent = algo.RANDOM_AGENT(actor_critic,
                                  args.value_loss_coef,
                                  args.entropy_coef,
                                  acktr=True)

        # NOTE(review): the policy is replaced *after* the agent was built
        # and after `.to(device)`, so the agent still holds the previous
        # policy and the new one is never moved to `device` -- confirm that
        # this is intended.
        actor_critic = RandomPolicy(
            obs_shape,
            envs.action_space,
            base_kwargs={'recurrent': args.recurrent_policy},
            navi=args.navi,
            base=base,
        )
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here with a clear message instead of a NameError on `agent`
        # once the training loop starts.
        raise NotImplementedError(
            "Unsupported algorithm: {}".format(args.algo))

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding windows over the most recently finished episodes.
    episode_rewards = deque(maxlen=10)
    episode_length = deque(maxlen=10)
    episode_success_rate = deque(maxlen=100)
    # Running count of all finished episodes across processes.
    episode_total = 0

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        print("args.num_steps: " + str(args.num_steps))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                # An 'episode' entry marks an episode that just finished.
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    if "Pacman" not in args.env_name:
                        episode_success_rate.append(
                            info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the environment rewards by the discriminator's rewards.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if experiment is not None:
                experiment.log_metric("Reward Mean",
                                      np.mean(episode_rewards),
                                      step=total_num_steps)
                experiment.log_metric("Reward Min",
                                      np.min(episode_rewards),
                                      step=total_num_steps)
                experiment.log_metric("Reward Max",
                                      np.max(episode_rewards),
                                      step=total_num_steps)
                experiment.log_metric("Episode Length Mean ",
                                      np.mean(episode_length),
                                      step=total_num_steps)
                experiment.log_metric("Episode Length Min",
                                      np.min(episode_length),
                                      step=total_num_steps)
                experiment.log_metric("Episode Length Max",
                                      np.max(episode_length),
                                      step=total_num_steps)
                # Report the number of finished episodes, not the update
                # index.
                experiment.log_metric("# Trajectories (Total)",
                                      episode_total,
                                      step=total_num_steps)
                if "Pacman" not in args.env_name:
                    experiment.log_metric("Episodic Success Rate",
                                          np.mean(episode_success_rate),
                                          step=total_num_steps)
            # The last three arguments have no placeholder in the template;
            # str.format ignores surplus positional arguments.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # getattr keeps this consistent with the checkpoint code above
            # when the environments are not wrapped in a normaliser.
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
示例#9
0
def main():
    """Train an agent and report training metrics to Weights & Biases.

    Settings are read from the module-level ``config`` object (it is not
    defined inside this function). Each update collects
    ``config.num_steps`` transitions per process, optionally replaces the
    rewards with GAIL discriminator rewards, and calls ``agent.update``.
    Every ``config.log_interval`` updates the statistics accumulated since
    the previous log are sent to wandb and the accumulators are reset.
    After the last update ``evaluate`` is called once with ``gif=True``.

    Raises:
        NotImplementedError: if ``config.algo`` is not a supported name.
    """
    wandb.init(settings=wandb.Settings(start_method="fork"),
               project='growspaceenv_baselines',
               entity='growspace')
    # NOTE(review): torch RNG seeding is not performed in this script (the
    # seeding calls were commented out); only the environments receive
    # config.seed.

    if config.cuda and torch.cuda.is_available() and config.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(config.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if config.cuda else "cpu")

    envs = make_vec_envs(config.env_name, config.seed, config.num_processes,
                         config.gamma, config.log_dir, device, False,
                         config.custom_gym)

    base = 'Mnist' if "Mnist" in config.env_name else None

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base,
                          base_kwargs={'recurrent': config.recurrent_policy})
    actor_critic.to(device)

    if config.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               config.value_loss_coef,
                               config.entropy_coef,
                               lr=config.lr,
                               eps=config.eps,
                               alpha=config.alpha,
                               max_grad_norm=config.max_grad_norm)
    elif config.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         config.clip_param,
                         config.ppo_epoch,
                         config.num_mini_batch,
                         config.value_loss_coef,
                         config.entropy_coef,
                         lr=config.lr,
                         eps=config.eps,
                         max_grad_norm=config.max_grad_norm,
                         optimizer=config.optimizer,
                         momentum=config.momentum)
    elif config.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               config.value_loss_coef,
                               config.entropy_coef,
                               acktr=True)
    else:
        # Fail here with a clear message instead of a NameError on `agent`
        # once the training loop starts.
        raise NotImplementedError(
            "Unsupported algorithm: {}".format(config.algo))

    if config.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            config.gail_experts_dir,
            "trajs_{}.pt".format(config.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        # Only drop the last partial batch when the dataset is larger than
        # one batch; otherwise the loader would yield nothing.
        drop_last = len(expert_dataset) > config.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=config.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(config.num_steps, config.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Accumulators; all of them are reset after every wandb log.
    episode_rewards = []
    episode_length = []
    episode_branches = []
    episode_branch1 = []
    episode_branch2 = []
    episode_light_width = []
    episode_light_move = []
    episode_success = []
    episode_plantpixel = []

    start = time.time()
    num_updates = int(
        config.num_env_steps) // config.num_steps // config.num_processes
    # Update index at which the next rendered frame may be written to disk.
    next_image_update = 0
    is_discrete = isinstance(envs.action_space, Discrete)

    for j in range(num_updates):

        if is_discrete:
            # Per-update count of how often each discrete action was taken.
            action_dist = np.zeros(envs.action_space.n)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if config.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if config.algo == "acktr" else config.lr)

        for step in range(config.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            if is_discrete:
                # np.add.at accumulates repeated indices. A plain
                # `action_dist[action] += 1` is buffered and would count an
                # action once even when several processes picked it.
                # Assumes `action` is a torch tensor of integer action ids.
                np.add.at(action_dist, action.cpu().numpy().ravel(), 1)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    wandb.log({"Episode_Reward": info['episode']['r']},
                              step=total_num_steps)

                if 'new_branches' in info.keys():
                    episode_branches.append(info['new_branches'])

                if 'new_b1' in info.keys():
                    episode_branch1.append(info['new_b1'])

                if 'new_b2' in info.keys():
                    episode_branch2.append(info['new_b2'])

                if 'light_width' in info.keys():
                    episode_light_width.append(info['light_width'])

                if 'light_move' in info.keys():
                    episode_light_move.append(info['light_move'])

                if 'success' in info.keys():
                    episode_success.append(info['success'])

                if 'plant_pixel' in info.keys():
                    episode_plantpixel.append(info['plant_pixel'])

                # NOTE(review): the threshold is advanced on the first info
                # dict seen at the matching update, so at most one frame
                # (first process, first step) is written every 1000 updates
                # -- confirm that this is intended.
                if j == next_image_update:
                    if 'img' in info.keys():
                        img = info['img']
                        path = './hittiyas/growspaceenv_braselines/scripts/imgs/'
                        cv2.imwrite(
                            os.path.join(path, 'step' + str(step) + '.png'),
                            img)
                    next_image_update += 1000

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if config.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = config.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the environment rewards by the discriminator's rewards.
            for step in range(config.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], config.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.gae_lambda,
                                 config.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % config.save_interval == 0
                or j == num_updates - 1) and config.save_dir != "":
            save_path = os.path.join(config.save_dir, config.algo)
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, config.env_name + ".pt"))

        # Evaluate the "enough finished episodes" condition once, *before*
        # the accumulators are cleared below. Testing it again after the
        # clear would always be False, so periodic evaluation could never
        # run on an update that also logged.
        have_enough_episodes = len(episode_rewards) > 1

        if j % config.log_interval == 0 and have_enough_episodes:
            metrics = {}
            if is_discrete:
                np_hist = np.histogram(np.arange(action_dist.shape[0]),
                                       weights=action_dist)
                metrics["Discrete Actions"] = wandb.Histogram(
                    np_histogram=np_hist)

            metrics.update({
                "Reward Min": np.min(episode_rewards),
                "Summed Reward": np.sum(episode_rewards),
                "Reward Mean": np.mean(episode_rewards),
                "Reward Max": np.max(episode_rewards),
                "Number of Mean New Branches": np.mean(episode_branches),
                "Number of Max New Branches": np.max(episode_branches),
                "Number of Min New Branches": np.min(episode_branches),
                "Number of Mean New Branches of Plant 1":
                np.mean(episode_branch1),
                "Number of Mean New Branches of Plant 2":
                np.mean(episode_branch2),
                "Number of Total Displacement of Light":
                np.sum(episode_light_move),
                # Log the mean, as the metric names say, rather than the
                # raw lists (which are cleared right after this call).
                "Mean Light Displacement": np.mean(episode_light_move),
                "Mean Light Width": np.mean(episode_light_width),
                "Number of Steps in Episode with Tree is as close as possible":
                np.sum(episode_success),
                "Entropy": dist_entropy,
                "Displacement of Light Position":
                wandb.Histogram(episode_light_move),
                "Displacement of Beam Width":
                wandb.Histogram(episode_light_width),
                "Mean Plant Pixel": np.mean(episode_plantpixel),
                "Summed Plant Pixel": np.sum(episode_plantpixel),
                "Plant Pixel Histogram": wandb.Histogram(episode_plantpixel),
            })
            # One call for the whole step; all keys share the same step.
            wandb.log(metrics, step=total_num_steps)

            for accumulator in (episode_rewards, episode_length,
                                episode_branches, episode_branch2,
                                episode_branch1, episode_light_move,
                                episode_light_width, episode_success,
                                episode_plantpixel):
                accumulator.clear()

        if (config.eval_interval is not None and have_enough_episodes
                and j % config.eval_interval == 0):
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, config.env_name, config.seed,
                     config.num_processes, eval_log_dir, device,
                     config.custom_gym)

    # Final evaluation after training, requesting a GIF from `evaluate`.
    ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
    evaluate(actor_critic,
             ob_rms,
             config.env_name,
             config.seed,
             config.num_processes,
             eval_log_dir,
             device,
             config.custom_gym,
             gif=True)
示例#10
0
def main():
    """Train a PPO policy for "Torcs-v1" from rollouts gathered by a process pool.

    The values returned by ``get_args()`` are overridden with a hard-coded
    experiment configuration. For every update the function kills any
    running ``torcs`` process, maps ``job`` over 12 work items in a fresh
    pool, and performs one ``agent.update`` per returned rollout storage.
    The policy is checkpointed under ``args.save_dir/args.algo``.
    """
    args = get_args()

    # Hard-coded experiment configuration (overrides the parsed arguments).
    args.env_name = "Torcs-v1"

    args.algo = 'ppo'
    args.use_gae = True
    args.log_interval = 1
    args.num_steps = 2048
    args.num_processes = 1
    args.lr = 3e-4
    args.entropy_coef = 0
    args.value_loss_coef = 0.5
    args.ppo_epoch = 10
    args.num_mini_batch = 32
    args.gamma = 0.99
    args.gae_lambda = 0.95
    args.num_env_steps = 1000000
    args.use_linear_lr_decay = True
    args.use_proper_time_limits = True
    args.save_dir = "saved"
    args.seed = 0
    args.cuda = False

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # No environment is created in this process; rollouts come from `job`.
    # NOTE(review): 24 and 3 are presumably the observation and action
    # sizes expected by Policy -- confirm against the Policy definition.
    actor_critic = Policy(24,
                          3,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    # NOTE(review): nothing in this function appends to episode_rewards, so
    # the logging branch at the end of the loop is never entered as written.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        print(j, "update")
        # Kill any torcs process that is still running before new workers
        # start.
        os.system("pkill torcs")
        p_job = partial(job,
                        args=args,
                        device=device,
                        shared_model=actor_critic)
        pool = mp.Pool()
        try:
            res = pool.map(p_job, range(12))
        finally:
            # Always release the worker processes, even if a job raised.
            pool.close()
            pool.join()

        # One PPO update per rollout storage returned by the workers.
        for rollouts in res:
            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.gae_lambda,
                                     args.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)
            # Only the policy is stored (no observation-normalisation stats).
            torch.save([actor_critic],
                       os.path.join(save_path, args.env_name + "_new_mp.pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            s = "{},{:.2f},{:.2f}\n".format(j, np.mean(episode_rewards),
                                            np.median(episode_rewards))
            with open("logs/{}_new_mp.csv".format(args.env_name), 'a') as fl:
                fl.write(s)
            # The last three arguments have no placeholder in the template;
            # str.format ignores surplus positional arguments.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
示例#11
0
def train(args):
    """Train a PPO agent on ``args.env_name`` and log progress to disk.

    Creates ``args.save_dir`` (and an empty ``logs.txt`` inside it), builds
    the vectorised training environments plus a separate environment used
    for rendering, and runs the collect / update loop. One line of
    statistics is appended to ``logs.txt`` after every update, checkpoints
    go to ``args.save_dir/models``, and ``evaluate`` / ``render`` are called
    at their configured intervals. Both environments are closed on exit,
    including when an error occurs.

    Args:
        args: parsed options; the attributes read here include ``seed``,
            ``save_dir``, ``env_name``, ``algo``, ``num_steps``,
            ``num_processes`` and the various ``*_interval`` settings.

    Raises:
        NotImplementedError: if ``args.algo`` is not ``'ppo'``.
    """
    torch.manual_seed(args.seed)
    torch.set_num_threads(1)
    device = torch.device('cpu')

    os.makedirs(args.save_dir, exist_ok=True)

    training_log_path = os.path.join(args.save_dir, 'logs.txt')
    # Create/truncate the log so each run starts from an empty file.
    with open(training_log_path, 'w'):
        pass

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         None,
                         device,
                         False,
                         args=args)
    render_env = None

    try:
        render_env = gym.make(args.env_name, args=args)
        render_env.seed(args.seed)

        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)

        if args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        else:
            raise NotImplementedError

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)

        # Sliding windows over the ten most recently finished episodes.
        episode_rewards = deque(maxlen=10)
        episode_lens = deque(maxlen=10)

        start = time.time()
        num_updates = int(
            args.num_env_steps) // args.num_steps // args.num_processes
        for j in range(num_updates):

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

                for info in infos:
                    # An 'episode' entry marks an episode that just finished.
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])
                        episode_lens.append(info['episode']['l'])

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks,
                                bad_masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.gae_lambda,
                                     args.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                model_save_dir = os.path.join(args.save_dir, 'models')
                os.makedirs(model_save_dir, exist_ok=True)
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ],
                           os.path.join(
                               model_save_dir,
                               args.env_name + '_iter{}'.format(j) + ".pt"))

            # Append one line of statistics per update.
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            if episode_rewards:
                len_mean = np.mean(episode_lens)
                len_min = np.min(episode_lens)
                len_max = np.max(episode_lens)
                reward_mean = np.mean(episode_rewards)
                reward_min = np.min(episode_rewards)
                reward_max = np.max(episode_rewards)
            else:
                # No episode has finished yet: np.min / np.max raise
                # ValueError on an empty sequence, so record NaN and keep
                # one log line per update.
                len_mean = len_min = len_max = float('nan')
                reward_mean = reward_min = reward_max = float('nan')
            with open(training_log_path, 'a') as fp_log:
                fp_log.write(
                    'iterations: {}, mean(len): {:.1f}, min(len): {}, max(len): {}, mean(reward): {:.3f}, min(reward): {:.3f}, max(reward): {:.3f}, value_loss: {:.3f}, action_loss: {:.3f}\n'
                    .format(total_num_steps, len_mean, len_min, len_max,
                            reward_mean, reward_min, reward_max, value_loss,
                            action_loss))

            # logging to console
            if j % args.log_interval == 0 and len(episode_rewards) > 1:
                end = time.time()
                # The last three arguments have no placeholder in the
                # template; str.format ignores surplus positional arguments.
                print(
                    "Updates {}, num timesteps {}, FPS {}, time {} minutes \n Last {} training episodes: mean/median length {:.1f}/{}, min/max length {}/{} mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            (end - start) / 60., len(episode_rewards),
                            np.mean(episode_lens), np.median(episode_lens),
                            np.min(episode_lens), np.max(episode_lens),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss))

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                # getattr matches the checkpoint code above and tolerates
                # environments without a normalisation wrapper.
                ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms',
                                 None)
                evaluate(args, actor_critic, ob_rms, args.env_name, args.seed,
                         args.num_processes, device)

            if (args.render_interval is not None and args.render_interval > 0
                    and j % args.render_interval == 0):
                ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms',
                                 None)
                render(render_env, actor_critic, ob_rms, deterministic=True)
    finally:
        # Release both environments even when training aborts with an error.
        if render_env is not None:
            render_env.close()
        envs.close()
示例#12
0
def main():
    """Train an actor-critic agent with A2C, PPO or ACKTR, optionally with GAIL.

    Configuration comes from ``get_args()``.  The function builds the
    vectorized environments, creates the policy (or loads it together with
    ``ob_rms`` from ``args.load_dir``), and then alternates between collecting
    ``args.num_steps`` transitions per process and updating the agent.

    A checkpoint is written only on a save step whose mean recent episode
    reward is higher than the best mean seen so far.

    Raises:
        ValueError: if ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    # Checkpoints and the log file share the "<env>_<algo>[_<postfix>]" name.
    save_name = '%s_%s' % (args.env_name, args.algo)
    if args.postfix != '':
        save_name += ('_' + args.postfix)

    logger_filename = os.path.join(log_dir, save_name)
    logger = utils.create_logger(logger_filename)

    torch.set_num_threads(1)
    device = torch.device("cuda:%d" % args.gpu if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         4,
                         obs_type="grid" if args.grid else "image",
                         skip_frames=args.num_skip_frames)

    if args.load_dir is not None:
        # NOTE(review): torch.load unpickles arbitrary objects; only load
        # checkpoints that come from a trusted source.
        actor_critic, ob_rms = torch.load(
            os.path.join(args.load_dir),
            map_location=lambda storage, loc: storage)
        vec_norm = utils.get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.ob_rms = ob_rms
        print("load pretrained...")
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base="grid" if args.grid else None,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail early instead of hitting a NameError on `agent` further down.
        raise ValueError("unsupported algo: %r" % (args.algo,))

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding windows over the 10 most recent values reported by the envs.
    episode_rewards = deque(maxlen=10)
    lines = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    learning_start = 0
    best_reward = -100
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        # NOTE: the epsilon-greedy action override that consumed `explore` is
        # currently disabled; the call is kept so behavior is unchanged.
        explore = exploration_rate(j - learning_start, 'exp')

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                if 'sent' in info.keys():
                    lines.append(info['sent'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the environment rewards with the discriminator's.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Save on every interval-th update or on the last one, but only when
        # the mean recent reward beats the best seen so far.  The emptiness
        # check avoids taking the mean of an empty window (nan + warning).
        is_save_step = (j % args.save_interval == 0 or j == num_updates - 1)
        if is_save_step and args.save_dir != "" and len(episode_rewards) > 0:
            mean_reward = np.mean(episode_rewards)
            if mean_reward > best_reward:
                save_path = os.path.join(args.save_dir, args.algo)
                # exist_ok tolerates a pre-existing directory while still
                # surfacing real failures such as permission errors.
                os.makedirs(save_path, exist_ok=True)
                best_reward = mean_reward
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, save_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            if j < learning_start:
                logger.info("random action")
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            logger.info(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))

            # np.min / np.max raise ValueError on an empty sequence, so skip
            # the statistics until at least one 'sent' value was reported.
            if len(lines) > 0:
                logger.info(
                    ' lines sent: mean/median lines {:.1f}/{:.1f}, min/max lines {:.1f}/{:.1f}\n'
                    .format(np.mean(lines), np.median(lines), np.min(lines),
                            np.max(lines)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
示例#13
0
def main():
    """Train an actor-critic agent, optionally with network splitting and GAIL.

    Configuration comes from ``get_args()``.  Besides the usual collect /
    update loop this variant:

    * calls ``agent.split`` every ``args.split_every`` updates (only while
      ``j < 200``) when ``args.split`` is set,
    * writes scalars to a TensorBoard ``SummaryWriter`` under ``./runs/``,
    * periodically saves a ``stats`` dict with ``np.save`` and, at the end,
      pickles the per-update mean reward list next to it.

    Raises:
        ValueError: if ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy},
                          dimh=args.dimh)
    actor_critic.to(device)

    # The experiment name encodes the run configuration and is reused for the
    # TensorBoard directory and the saved statistics.
    exp_name = "%s_%s_seed%d_dimh%d_" % (args.env_name, args.algo, args.seed,
                                         args.dimh)
    if args.gail:
        exp_name += '_gail_'

    if args.split:
        exp_name += 'splitevery' + str(args.split_every)
        if args.random_split:
            exp_name += '_rsplit'
    else:
        exp_name += 'baseline'

    writer = SummaryWriter('./runs/' + exp_name)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail early instead of hitting a NameError on `agent` further down.
        raise ValueError("unsupported algo: %r" % (args.algo,))

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print(num_updates)
    stats = {
        'seed': args.seed,
        'experiment': exp_name,
        'env': args.env_name,
        'dimh': args.dimh,
        'split every': args.split_every,
        'random split': args.random_split,
        'steps': [],
        'mean reward': [],
        'actor neurons': [],
        'critic neurons': [],
    }
    save_dir = './experiment_results/%s/' % args.env_name
    stats_save_path = save_dir + exp_name
    check_path(save_dir)
    print('start')
    count = -1
    # NOTE(review): the value derived from args.num_env_steps (printed above)
    # is deliberately overridden by this fixed budget; confirm this is wanted.
    restart_period = 488
    num_updates = restart_period * 2
    meanreward = []
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly; with splitting enabled the
            # schedule position restarts from 0 every `restart_period` updates
            count += 1
            if j % restart_period == 0:
                count = 0
            total = restart_period * 2
            if args.split:
                utils.update_linear_schedule(
                    agent.optimizer, count, total,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)
            else:
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the environment rewards with the discriminator's.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # splitting
        if args.split and (j + 1) % args.split_every == 0 and j < 200:
            print("[INFO] split on iteration %d..." % j)
            agent.split(rollouts, args.random_split)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        # Recorded on every update; nan until the first episode finishes.
        meanreward.append(np.mean(episode_rewards))
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards)))
            stats['mean reward'].append(np.mean(episode_rewards))
            stats['steps'].append(j)
            if args.split:
                a, c = agent.actor_critic.get_num_params()
                stats['actor neurons'].append(a)
                stats['critic neurons'].append(c)

        if j % 10 == 0:
            print("[INFO] saving to ", stats_save_path)
            np.save(stats_save_path, stats)

        if j % 5 == 0:
            # x-axis for TensorBoard: environment steps consumed so far
            s = (j + 1) * args.num_processes * args.num_steps
            if args.split:
                a, c = agent.actor_critic.get_num_params()
                writer.add_scalar('A neurons', a, s)
                writer.add_scalar('C neurons', c, s)
            writer.add_scalar('mean reward', np.mean(episode_rewards), s)
            writer.add_scalar('entropy loss', dist_entropy, s)
            writer.add_scalar('value loss', value_loss, s)
            writer.add_scalar('action loss', action_loss, s)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    writer.close()
    import pickle
    # Context manager guarantees the file is flushed and closed.
    with open(stats_save_path + '.pkl', 'wb') as fp:
        pickle.dump(meanreward, fp)
示例#14
0
def main():
    """Train an actor-critic agent (A2C, PPO or ACKTR), optionally with GAIL.

    Reads the configuration from ``get_args()``, builds the vectorized
    environments, policy and agent, then repeatedly collects
    ``args.num_steps`` transitions per process and updates the agent.
    Checkpoints, console logs and evaluation runs happen at the intervals
    given by ``args.save_interval``, ``args.log_interval`` and
    ``args.eval_interval``.
    """
    # Parse the command-line arguments
    args = get_args()

    # Set the random seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # CUDA settings (when true, behavior is deterministic, i.e. the loss
    # values no longer vary between runs).
    # Note, however, that arguments.py describes the flag as follows:
    # Sets flags for determinism when using CUDA (potentially slow!)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Decide where the log files are written
    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    # Set the number of threads
    torch.set_num_threads(1)

    # Select the device
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Initialize the environments
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    # Set up the policy
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    # to(device): move the model to the selected device (GPU or CPU)
    actor_critic.to(device)

    # Choose the algorithm
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        # Only drop the last incomplete batch when the dataset holds more
        # than one batch worth of samples.
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    # Initialize the rollout storage (used for reward evaluation)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    # Reset the environments and store the first observation
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Episode rewards (keeps the 10 most recent)
    episode_rewards = deque(maxlen=10)

    # Record the start time and set the number of parameter updates (?)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        # When a linear learning-rate schedule is used
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        # Collect args.num_steps transitions from the environments
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Overwrite the stored rewards with the discriminator's output
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # Any OSError from makedirs is ignored here (e.g. the directory
            # already exists).
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): the format string has 8 placeholders, so the last
            # three arguments (dist_entropy, value_loss, action_loss) are
            # passed but never printed.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main(base=IAMBase, num_frame_stack=None):
    """Train a PPO agent on the "Warehouse-v0" environment.

    All hyper-parameters are fixed inside the function.  The policy is saved
    under ``./trained_models/PPO`` every ``save_interval`` updates and after
    the last update; progress is printed every ``log_interval`` updates.

    Args:
        base: network base class handed to ``Policy``.  When it is
            ``IAMBase`` the ``dset`` index list is forwarded through
            ``base_kwargs``; otherwise no extra kwargs are passed.
        num_frame_stack: forwarded unchanged to ``make_vec_envs``.
    """
    seed = 1
    env_name = "Warehouse-v0"
    num_processes = 32
    log_dir = './logs/'
    eval_interval = None
    log_interval = 10
    use_linear_lr_decay = False
    use_proper_time_limits = False
    save_dir = './trained_models/'
    use_cuda = True

    # PPO
    gamma = 0.99  # reward discount factor
    clip_param = 0.1  #0.2
    ppo_epoch = 3  #4
    num_mini_batch = 32
    value_loss_coef = 1  #0.5
    entropy_coef = 0.01
    lr = 2.5e-4  #7e-4
    eps = 1e-5
    max_grad_norm = float('inf')
    use_gae = True
    gae_lambda = 0.95
    num_steps = 8  #5

    # Store
    num_env_steps = 4e6
    save_interval = 100

    # IAM
    dset = [
        0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
        66, 67, 68, 69, 70, 71, 72
    ]

    #gym.envs.register(env_name, entry_point="environments.warehouse.warehouse:Warehouse",
    #                    kwargs={'seed': seed, 'parameters': {"num_frames": 1}})

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    log_dir = os.path.expanduser(log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    # use_cuda is hard-coded to True, so fall back to the CPU when no CUDA
    # device is present instead of failing when tensors are moved.
    device = torch.device(
        "cuda:0" if use_cuda and torch.cuda.is_available() else "cpu")

    envs = make_vec_envs(env_name,
                         seed,
                         num_processes,
                         gamma,
                         log_dir,
                         device,
                         False,
                         num_frame_stack=num_frame_stack)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base=base,
                          base_kwargs=({
                              'dset': dset
                          } if base == IAMBase else {}))
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     num_mini_batch,
                     value_loss_coef,
                     entropy_coef,
                     lr=lr,
                     eps=eps,
                     max_grad_norm=max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Rewards of the 10 most recently finished episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(num_env_steps) // num_steps // num_processes
    for j in range(num_updates):

        if use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates, lr)

        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda,
                                 use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % save_interval == 0 or j == num_updates - 1) and save_dir != "":
            save_path = os.path.join(save_dir, 'PPO')
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path, env_name + ".pt"))

        if j % log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * num_processes * num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))

        if (eval_interval is not None and len(episode_rewards) > 1
                and j % eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, env_name, seed, num_processes,
                     eval_log_dir, device)
示例#16
0
def main():
    """Run the training loop with an optional curiosity bonus and evaluation.

    Configuration comes from module-level names defined outside this
    function (``args``, ``use_curiosity``, ``num_updates``, ``save_folder``,
    ``train_file``, ``eval_file``, ``norm_pos``).

    For each of ``num_updates`` iterations the function:

    * collects ``args.num_steps`` transitions from the vectorized
      environments, adding a curiosity term to the reward when
      ``use_curiosity`` is set;
    * appends one row per finished episode to a per-process CSV file whose
      name is derived from ``train_file``;
    * updates the agent from the collected rollouts;
    * periodically saves the policy to ``save_folder``;
    * every ``args.eval_interval`` updates, plays one deterministic
      evaluation episode and appends its result to ``eval_file``.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    print(device)
    print(save_folder)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False, args.reward_type)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)

    # The curiosity module is only built when requested; PPO receives None
    # otherwise.
    curiosity = None
    if use_curiosity:
        curiosity = ICM(envs.observation_space.shape[0], envs.action_space.n)
        curiosity.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         curiosity,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_curiosity=use_curiosity)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    # Running sum of rewards for the episode in progress in each process.
    cum_rew = [0] * args.num_processes
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=args.num_processes * 2)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            envs.render()

            # cur_reward aliases reward; the curiosity bonus below is added
            # in place, after the raw reward has been accumulated in cum_rew.
            cur_reward = reward

            to_write = reward.cpu().numpy()
            for i in range(args.num_processes):
                cum_rew[i] += to_write[i][0]

            if use_curiosity:
                # NOTE(review): 14 is hard-coded; presumably the number of
                # discrete actions -- confirm it matches envs.action_space.n.
                # The tensor is moved to the selected device instead of
                # unconditionally to CUDA so CPU runs stay on one device.
                action_one_hot = (torch.eye(14)[action]).view(-1,
                                                              14).to(device)
                _, pred_phi, actual_phi = curiosity(
                    (rollouts.obs[step], obs, action_one_hot))
                cur_reward += 0.2 * ((pred_phi - actual_phi).pow(2)).sum(
                    -1, keepdim=True).cpu() / 2

            for i, finished in enumerate(done):
                if finished:
                    # Final x position relative to norm_pos.
                    percentile = infos[i]['x_pos'] / norm_pos
                    episode_rewards.append(percentile)
                    print(cum_rew[i])
                    # One CSV per process: the process index is inserted
                    # before the last four characters of train_file.
                    with open(train_file[:-4] + str(i) + train_file[-4:],
                              'a',
                              newline='') as sfile:
                        writer = csv.writer(sfile)
                        writer.writerows([[cum_rew[i], percentile]])
                    cum_rew[i] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, cur_reward.detach(), masks)

        with torch.no_grad():
            next_value = agent.actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = agent.actor_critic
            if args.cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            # The file name must not start with '/': os.path.join discards
            # every earlier component when a later one is absolute, which
            # previously sent the checkpoint to the filesystem root.
            torch.save(save_model,
                       os.path.join(save_folder, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(
                episode_rewards) > args.num_processes:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, cumulative reward {:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), np.mean(cum_rew)))

        # Evaluation time
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):

            # Evaluation runs a single environment, seeded differently from
            # the training environments.
            num_proc = 1
            eval_envs = make_vec_envs(args.env_name, args.seed + num_proc,
                                      num_proc, args.gamma, args.log_dir,
                                      args.add_timestep, device, True,
                                      args.reward_type)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []
            test_rew = 0
            finish_this = False

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                num_proc,
                actor_critic.recurrent_hidden_state_size,
                device=device)

            eval_masks = torch.zeros(num_proc, 1, device=device)
            # Sliding window of recent x positions, used to detect a stuck
            # agent.
            positions = deque(maxlen=400)

            while not finish_this:
                with torch.no_grad():

                    _, action, _, eval_recurrent_hidden_states = agent.actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_envs.render()

                # Keep the masks on the same device as the initial masks and
                # the hidden state instead of forcing CUDA.
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)

                test_rew += reward.cpu().numpy()[0, 0]

                for i, finished in enumerate(done):
                    if finished:
                        print('he died')
                        percentile = infos[i]['x_pos'] / norm_pos
                        eval_episode_rewards.append(percentile)
                        with open(eval_file, 'a', newline='') as sfile:
                            writer = csv.writer(sfile)
                            writer.writerows([[test_rew, percentile]])
                        finish_this = True

                # To prevent the agent from getting stuck: stop once at least
                # 200 recorded positions all lie strictly within 20 units of
                # the most recent one.
                positions.append(infos[0]['x_pos'])
                pos_ar = np.array(positions)
                stuck = (len(positions) >= 200
                         and (pos_ar < pos_ar[-1] + 20).all()
                         and (pos_ar > pos_ar[-1] - 20).all())
                if stuck:
                    print("he's stuck")
                    percentile = infos[0]['x_pos'] / norm_pos
                    eval_episode_rewards.append(percentile)
                    with open(eval_file, 'a', newline='') as sfile:
                        writer = csv.writer(sfile)
                        writer.writerows([[test_rew, percentile]])
                    finish_this = True

            eval_envs.close()
            positions.clear()

            print(
                " Evaluation using {} episodes:  reward {:.3f}, distance {:.3f}\n"
                .format(len(eval_episode_rewards), test_rew,
                        np.mean(eval_episode_rewards)))
            test_rew = 0
            finish_this = False

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
示例#17
0
def main():
    """Train an agent while optionally attacking the collected rollouts.

    Arguments are read with ``get_args()``. The function builds the
    vectorized environments, the policy and the agent selected by
    ``args.algo``, optionally a GAIL discriminator (``args.gail``) and an
    attacker selected by ``args.type``.

    Result files are opened under ``args.resdir``: a reward log, plus a
    radius log when ``args.compute`` is set and a target-action log when
    ``args.type`` is ``"targ"`` or ``"fgsm"``. They are registered on an
    ``ExitStack`` so they are closed when the function exits, including
    when training raises.

    Relies on ``current_time`` and on the attacker/agent classes being
    defined at module level.
    """
    import contextlib  # function-scope import; closes the result files

    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:" + str(args.cuda_id) if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    ########## file related
    # The output file name encodes the environment, algorithm and attack
    # settings of this run.
    filename = args.env_name + "_" + args.algo + "_n" + str(args.max_episodes)
    if args.attack:
        filename += "_" + args.type + "_" + args.aim
        filename += "_s" + str(args.stepsize) + "_m" + str(
            args.maxiter) + "_r" + str(args.radius) + "_f" + str(args.frac)
    if args.run >= 0:
        filename += "_run" + str(args.run)

    logger = get_log(args.logdir + filename + "_" + current_time)
    logger.info(args)

    # Every result file is registered on the stack so that it is closed on
    # any exit path, not only when training runs to completion.
    with contextlib.ExitStack() as stack:
        rew_file = stack.enter_context(
            open(args.resdir + filename + ".txt", "w"))

        if args.compute:
            radius_file = stack.enter_context(
                open(
                    args.resdir + filename + "_radius" + "_s" +
                    str(args.stepsize) + "_m" + str(args.maxiter) + "_th" +
                    str(args.dist_thres) + ".txt", "w"))
        if args.type == "targ" or args.type == "fgsm":
            targ_file = stack.enter_context(
                open(args.resdir + filename + "_targ.txt", "w"))

        num_updates = int(
            args.num_env_steps) // args.num_steps // args.num_processes

        # Build the attacker matching args.type; no attacker is created for
        # any other value.
        if args.type == "wb":
            attack_net = WbAttacker(agent,
                                    envs,
                                    int(args.frac * num_updates),
                                    num_updates,
                                    args,
                                    device=device)
        elif args.type == "bb":
            attack_net = BbAttacker(agent,
                                    envs,
                                    int(args.frac * num_updates),
                                    num_updates,
                                    args,
                                    device=device)
        elif args.type == "rand":
            attack_net = RandAttacker(envs,
                                      radius=args.radius,
                                      frac=args.frac,
                                      maxat=int(args.frac * num_updates),
                                      device=device)
        elif args.type == "semirand":
            attack_net = WbAttacker(agent,
                                    envs,
                                    int(args.frac * num_updates),
                                    num_updates,
                                    args,
                                    device,
                                    rand_select=True)
        elif args.type == "targ":
            # Target is the last action index for discrete spaces and the
            # all-zero action for Box spaces.
            if isinstance(envs.action_space, Discrete):
                action_dim = envs.action_space.n
                target_policy = action_dim - 1
            elif isinstance(envs.action_space, Box):
                action_dim = envs.action_space.shape[0]
                target_policy = torch.zeros(action_dim)
                # target_policy[-1] = 1
            print("target policy is", target_policy)
            attack_net = TargAttacker(agent,
                                      envs,
                                      int(args.frac * num_updates),
                                      num_updates,
                                      target_policy,
                                      args,
                                      device=device)
        elif args.type == "fgsm":
            if isinstance(envs.action_space, Discrete):
                action_dim = envs.action_space.n
                target_policy = action_dim - 1
            elif isinstance(envs.action_space, Box):
                action_dim = envs.action_space.shape[0]
                target_policy = torch.zeros(action_dim)

            def targ_policy(obs):
                # Constant target: the observation is ignored.
                return target_policy

            attack_net = FGSMAttacker(envs,
                                      agent,
                                      targ_policy,
                                      radius=args.radius,
                                      frac=args.frac,
                                      maxat=int(args.frac * num_updates),
                                      device=device)
        # if args.aim == "obs" or aim == "hybrid":
        #     obs_space = gym.make(args.env_name).observation_space
        #     attack_net.set_obs_range(obs_space.low, obs_space.high)

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)

        episode_rewards = deque(maxlen=10)
        episode = 0

        start = time.time()

        for j in range(num_updates):

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)

            for step in range(args.num_steps):
                # FGSM perturbs the stored observation before the policy
                # acts on it.
                if args.type == "fgsm":
                    rollouts.obs[step] = attack_net.attack(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step]).clone()
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])
                if args.type == "targ" or args.type == "fgsm":
                    if isinstance(envs.action_space, Discrete):
                        # Fraction of processes that picked the target action.
                        num_target = (
                            action == target_policy).nonzero()[:, 0].size()[0]
                        targ_file.write(
                            str(num_target / args.num_processes) + "\n")
                        print("percentage of target:",
                              num_target / args.num_processes)
                    elif isinstance(envs.action_space, Box):
                        # Norm of the difference to the target action,
                        # divided by the number of processes.
                        target_action = target_policy.repeat(
                            action.size()[0], 1)
                        targ_file.write(
                            str(
                                torch.norm(action - target_action).item() /
                                args.num_processes) + "\n")
                        print("percentage of target:",
                              torch.sum(action).item() / args.num_processes)
                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action.cpu())
                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])
                        episode += 1

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks,
                                bad_masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            if args.gail:
                if j >= 10:
                    envs.venv.eval()

                gail_epoch = args.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    discr.update(gail_train_loader, rollouts,
                                 utils.get_vec_normalize(envs)._obfilt)

                # Replace the environment rewards with the discriminator's
                # predicted rewards.
                for step in range(args.num_steps):
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step],
                        args.gamma, rollouts.masks[step])

            # Rollout-level attacks; FGSM was already applied per step above.
            if args.attack and args.type != "fgsm":
                if args.aim == "reward":
                    logger.info(rollouts.rewards.flatten())
                    rollouts.rewards = attack_net.attack_r_general(
                        rollouts, next_value).clone().detach()
                    logger.info("after attack")
                    logger.info(rollouts.rewards.flatten())
                elif args.aim == "obs":
                    origin = rollouts.obs.clone()
                    rollouts.obs = attack_net.attack_s_general(
                        rollouts, next_value).clone().detach()
                    logger.info(origin)
                    logger.info("after")
                    logger.info(rollouts.obs)
                elif args.aim == "action":
                    origin = torch.flatten(rollouts.actions).clone()
                    rollouts.actions = attack_net.attack_a_general(
                        rollouts, next_value).clone().detach()
                    logger.info("attack value")
                    logger.info(torch.flatten(rollouts.actions) - origin)
                elif args.aim == "hybrid":
                    # The attacker reports which quantity it perturbed.
                    res_aim, attack = attack_net.attack_hybrid(
                        rollouts, next_value, args.radius_s, args.radius_a,
                        args.radius_r)
                    print("attack ", res_aim)
                    if res_aim == "obs":
                        origin = rollouts.obs.clone()
                        rollouts.obs = attack.clone().detach()
                        logger.info(origin)
                        logger.info("attack obs")
                        logger.info(rollouts.obs)
                    elif res_aim == "action":
                        origin = torch.flatten(rollouts.actions).clone()
                        rollouts.actions = attack.clone().detach()
                        logger.info("attack action")
                        logger.info(torch.flatten(rollouts.actions) - origin)
                    elif res_aim == "reward":
                        logger.info(rollouts.rewards.flatten())
                        rollouts.rewards = attack.clone().detach()
                        logger.info("attack reward")
                        logger.info(rollouts.rewards.flatten())
            if args.compute:
                stable_radius = attack_net.compute_radius(rollouts, next_value)
                print("stable radius:", stable_radius)
                radius_file.write("update: {}, radius: {}\n".format(
                    j, np.round(stable_radius, decimals=3)))
            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.gae_lambda,
                                     args.use_proper_time_limits)

            if args.attack and args.type == "bb":
                attack_net.learning(rollouts)
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + ".pt"))

            if j % args.log_interval == 0 and len(episode_rewards) >= 1:
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                end = time.time()
                # The format string has fewer fields than arguments;
                # str.format ignores the trailing ones.
                print(
                    "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss))
                rew_file.write("updates: {}, mean reward: {}\n".format(
                    j, np.mean(episode_rewards)))

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                ob_rms = utils.get_vec_normalize(envs).ob_rms
                evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                         args.num_processes, eval_log_dir, device)

            # if episode > args.max_episodes:
            #     print("reach episodes limit")
            #     break

        if args.attack:
            logger.info("total attacks: {}\n".format(attack_net.attack_num))
            print("total attacks: {}\n".format(attack_net.attack_num))
示例#18
0
def main():
    """Train an online policy while acting with a separate target policy.

    Configuration comes from module-level names defined outside this
    function (``args``, ``num_updates``, ``occ_obs_shape``,
    ``sign_obs_shape``, ``update_period``, ``save_path``).

    Actions are sampled from ``target_actor_critic`` while the agent updates
    ``online_actor_critic``. With ``args.penetration_type == "constant"``
    both names refer to the same network; with ``"linear"`` the target is
    re-synchronised from the online network every ``update_period`` updates.

    The target policy is saved periodically as ``model.pt`` and, whenever an
    evaluation run beats the best mean reward seen so far, as
    ``best_model.pt``.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    ## Make environments
    envs = make_vec_envs(args, device)

    ## Setup Policy / network architecture
    if args.load_path != '':
        # Prefer the best checkpoint when one exists in the load directory.
        if os.path.isfile(os.path.join(args.load_path, "best_model.pt")):
            import_name = "best_model.pt"
        else:
            import_name = "model.pt"
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        online_actor_critic = torch.load(
            os.path.join(args.load_path, import_name))
        target_actor_critic = torch.load(
            os.path.join(args.load_path, import_name))
        if args.cuda:
            target_actor_critic = target_actor_critic.cuda()
            online_actor_critic = online_actor_critic.cuda()
    else:
        online_actor_critic = Policy(occ_obs_shape, sign_obs_shape,
                                     args.state_rep, envs.action_space,
                                     args.recurrent_policy)
        online_actor_critic.to(device)
        target_actor_critic = Policy(occ_obs_shape, sign_obs_shape,
                                     args.state_rep, envs.action_space,
                                     args.recurrent_policy)
        target_actor_critic.to(device)
        # Start both networks from identical weights.
        target_actor_critic.load_state_dict(online_actor_critic.state_dict())

    if args.penetration_type == "constant":
        # Act with the very network that is being trained.
        target_actor_critic = online_actor_critic

    ## Choose algorithm to use
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(online_actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(online_actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(online_actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    ## Initiate memory buffer
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              occ_obs_shape, sign_obs_shape, envs.action_space,
                              target_actor_critic.recurrent_hidden_state_size)

    ## Start env with first observation
    occ_obs, sign_obs = envs.reset()
    # The occupancy observation is only stored for the 'full' state
    # representation; the sign observation is always stored.
    if args.state_rep == 'full':
        rollouts.occ_obs[0].copy_(occ_obs)
    rollouts.sign_obs[0].copy_(sign_obs)
    rollouts.to(device)

    # Most recent step rewards; the window holds args.num_steps entries.
    episode_rewards = deque(maxlen=args.num_steps)
    reward_track = []
    best_eval_rewards = 0
    start = time.time()

    ## Loop over every policy update
    for j in range(num_updates):

        ## Setup parameter decays
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        ## Loop over num_steps environment updates to form trajectory
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                # Pass observation through network and get outputs
                value, action, action_log_prob, recurrent_hidden_states = target_actor_critic.act(
                    rollouts.occ_obs[step], rollouts.sign_obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Do action in environment and save reward
            occ_obs, sign_obs, reward, done, _ = envs.step(action)
            episode_rewards.append(reward.numpy())

            # Masks the processes which are done
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            # Insert step information in buffer
            rollouts.insert(occ_obs, sign_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        ## Get state value of current env state
        with torch.no_grad():
            next_value = target_actor_critic.get_value(
                rollouts.occ_obs[-1], rollouts.sign_obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        ## Computes the num_step return (next_value approximates reward after num_step) see Supp Material of https://arxiv.org/pdf/1804.02717.pdf
        ## Can use Generalized Advantage Estimation
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # Update the policy with the rollouts
        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        # Clean the rollout by cycling last elements to first ones
        rollouts.after_update()

        # Periodically copy the online weights into the target network.
        if (args.penetration_type == "linear") and (j % update_period == 0):
            target_actor_critic.load_state_dict(
                online_actor_critic.state_dict())

        ## Save model
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":

            # A really ugly way to save a model to CPU
            save_model = target_actor_critic
            if args.cuda:
                save_model = copy.deepcopy(target_actor_critic).cpu()

            torch.save(save_model, os.path.join(save_path, "model.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if args.vis:
            # Add the average reward of update to reward tracker
            reward_track.append(np.mean(episode_rewards))

        ## Log progress
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # dist_entropy is passed but has no matching field; str.format
            # ignores the surplus argument.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy))

        ## Evaluate model on a fresh single-process environment until 3000
        ## step rewards have been collected
        percentage = 100 * total_num_steps // args.num_env_steps
        if (args.eval_interval is not None and percentage > 1
                and (j % args.eval_interval == 0 or j == num_updates - 1)):
            print("###### EVALUATING #######")
            args_eval = copy.deepcopy(args)
            args_eval.num_processes = 1
            eval_envs = make_vec_envs(args_eval, device, no_logging=True)

            eval_episode_rewards = []

            occ_obs, sign_obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args_eval.num_processes,
                target_actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args_eval.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 3000:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = target_actor_critic.act(
                        occ_obs,
                        sign_obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                occ_obs, sign_obs, reward, done, infos = eval_envs.step(action)

                # Rebuild the masks on the same device as the initial masks
                # and the hidden state; they were previously left on the CPU.
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)

                eval_episode_rewards.append(reward)

            eval_envs.close()

            # Keep a separate checkpoint of the best-evaluating policy.
            if np.mean(eval_episode_rewards) > best_eval_rewards:
                best_eval_rewards = np.mean(eval_episode_rewards)
                save_model = target_actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(target_actor_critic).cpu()
                torch.save(save_model, os.path.join(save_path,
                                                    'best_model.pt'))

    ## Visualize tracked rewards(over num_steps) over time
    if args.vis:
        visualize(reward_track, args.algo, save_path)
示例#19
0
def learn(env, max_timesteps, timesteps_per_batch, clip_param):
    """Train a non-recurrent PPO policy on ``env`` and checkpoint it to disk.

    Eight vectorised environments are rolled out for ``timesteps_per_batch``
    steps per update.  After each rollout the policy is optimised with PPO
    (learning rate decayed linearly from 0.00025), and the policy plus the
    environment's ``ob_rms`` attribute (if any) is written to
    ``./trained_models/ppo/UniversalPolicy.pt`` every 100 updates and after
    the final update.

    Args:
        env: Environment identifier forwarded to ``make_vec_envs``.
        max_timesteps: Total step budget; the number of updates is
            ``int(max_timesteps) // timesteps_per_batch // 8``.
        timesteps_per_batch: Number of steps collected per environment
            before each PPO update.
        clip_param: PPO clipping parameter forwarded to ``algo.PPO``.

    Returns:
        None.
    """
    ppo_epoch = 5
    num_step = timesteps_per_batch
    num_processes = 8
    learning_rate = 0.00025
    save_interval = 100
    seed = 1000
    batch_size = 64
    save_path = os.path.join("./trained_models/", 'ppo')

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    log_dir = os.path.expanduser('/tmp/gym/')
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    # Fall back to the CPU instead of crashing on hosts without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    envs = make_vec_envs(env, seed, num_processes, 0.95, log_dir, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    # Positional 0.5 / 0.01 are presumably the value-loss and entropy
    # coefficients -- TODO confirm against algo.PPO's signature.
    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     batch_size,
                     0.5,
                     0.01,
                     lr=learning_rate,
                     eps=1e-05,
                     max_grad_norm=0.5)

    rollouts = RolloutStorage(num_step, num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    # as_tensor accepts both arrays and tensors without an extra copy;
    # copy_ performs the one copy that is actually needed.
    rollouts.obs[0].copy_(torch.as_tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(max_timesteps) // num_step // num_processes
    for j in range(num_updates):

        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                     learning_rate)

        for step in range(num_step):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # A 0.0 mask marks an environment whose episode just ended.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, True, 0.99, 0.95, False)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th update or for the last update
        if j % save_interval == 0 or j == num_updates - 1:
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, 'UniversalPolicy' + ".pt"))

        if len(episode_rewards) > 1:
            total_num_steps = (j + 1) * num_processes * num_step
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))
        # NOTE(review): `args` is not defined inside this function; this
        # branch only works if a module-level `args` exists -- confirm.
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    '''
示例#20
0
def main():
    """Run multi-task PPO training with a validation-driven second agent.

    The function seeds all RNGs, builds training, validation and evaluation
    vectorised environments, and creates one attention-based ``Policy`` that
    is shared by two PPO agents: ``agent`` (``attention_policy=False``,
    trained on ``envs``) and ``val_agent`` (``attention_policy=True``,
    trained on ``val_envs``).  Each update performs one training rollout and
    PPO step followed by ``args.val_agent_steps`` validation rollouts/steps.

    Every ``args.eval_interval`` updates the policy is evaluated on every
    entry of ``EVAL_ENVS``.  If, for the ``'many_arms'`` entry, the number of
    improved results minus the number of worsened results (relative to the
    last accepted evaluation) is below ``args.val_improvement_threshold``,
    the model weights and both optimizer states are rolled back to the
    snapshot taken before the first update after the previous evaluation.

    Checkpoints are written every ``args.save_interval`` updates and after
    the final update; the collected ``log_dict`` is handed to ``save_obj``
    at the end.

    Raises:
        ValueError: If ``args.algo`` is anything other than ``'ppo'``.
    """
    args = get_args()
    import random
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    logdir = args.env_name + '_' + args.algo + '_num_arms_' + str(
        args.num_processes) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    if args.use_privacy:
        logdir = logdir + '_privacy'
    elif args.use_noisygrad:
        logdir = logdir + '_noisygrad'
    elif args.use_pcgrad:
        logdir = logdir + '_pcgrad'
    elif args.use_testgrad:
        logdir = logdir + '_testgrad'
    elif args.use_median_grad:
        logdir = logdir + '_mediangrad'
    logdir = os.path.join('runs', logdir)
    logdir = os.path.join(os.path.expanduser(args.log_dir), logdir)
    utils.cleanup_log_dir(logdir)

    # Hyper-parameters recorded both in the pickled log and in TensorBoard.
    hparams = {
        'task_steps': args.task_steps,
        'grad_noise_ratio': args.grad_noise_ratio,
        'max_task_grad_norm': args.max_task_grad_norm,
        'use_noisygrad': args.use_noisygrad,
        'use_pcgrad': args.use_pcgrad,
        'use_testgrad': args.use_testgrad,
        'use_testgrad_median': args.use_testgrad_median,
        'testgrad_quantile': args.testgrad_quantile,
        'median_grad': args.use_median_grad,
        'use_meanvargrad': args.use_meanvargrad,
        'meanvar_beta': args.meanvar_beta,
        'no_special_grad_for_critic': args.no_special_grad_for_critic,
        'use_privacy': args.use_privacy,
        'seed': args.seed,
        'recurrent': args.recurrent_policy,
        'obs_recurrent': args.obs_recurrent,
        'cmd': ' '.join(sys.argv[1:]),
    }

    # Ugly but simple logging: hyper-parameters plus one result list per
    # evaluation environment.
    log_dict = dict(hparams)
    for eval_disp_name in EVAL_ENVS:
        log_dict[eval_disp_name] = []

    summary_writer = SummaryWriter()
    summary_writer.add_hparams(dict(hparams), {})

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    print('making envs...')
    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         steps=args.task_steps,
                         free_exploration=args.free_exploration,
                         recurrent=args.recurrent_policy,
                         obs_recurrent=args.obs_recurrent,
                         multi_task=True)

    val_envs = make_vec_envs(args.val_env_name,
                             args.seed,
                             args.num_processes,
                             args.gamma,
                             args.log_dir,
                             device,
                             False,
                             steps=args.task_steps,
                             free_exploration=args.free_exploration,
                             recurrent=args.recurrent_policy,
                             obs_recurrent=args.obs_recurrent,
                             multi_task=True)

    eval_envs_dic = {}
    for eval_disp_name, eval_env_name in EVAL_ENVS.items():
        eval_envs_dic[eval_disp_name] = make_vec_envs(
            eval_env_name[0],
            args.seed,
            args.num_processes,
            None,
            logdir,
            device,
            True,
            steps=args.task_steps,
            recurrent=args.recurrent_policy,
            obs_recurrent=args.obs_recurrent,
            multi_task=True,
            free_exploration=args.free_exploration)
    prev_eval_r = {}
    print('done')

    # Both variants take identical arguments; only the base class differs.
    policy_base = MLPHardAttnBase if args.hard_attn else MLPAttnBase
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base=policy_base,
                          base_kwargs={
                              'recurrent':
                              args.recurrent_policy or args.obs_recurrent
                          })
    actor_critic.to(device)

    if (args.continue_from_epoch > 0) and args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        # NOTE(review): the loaded normalisation statistics are not applied
        # to `envs` anywhere in this function -- confirm that is intended.
        actor_critic_, loaded_obs_rms_ = torch.load(
            os.path.join(
                save_path, args.env_name +
                "-epoch-{}.pt".format(args.continue_from_epoch)))
        actor_critic.load_state_dict(actor_critic_.state_dict())

    if args.algo != 'ppo':
        # Raising a bare string is itself a TypeError in Python 3.
        raise ValueError("only PPO is supported")
    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     num_tasks=args.num_processes,
                     attention_policy=False,
                     max_grad_norm=args.max_grad_norm,
                     weight_decay=args.weight_decay)
    val_agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.val_lr,
                         eps=args.eps,
                         num_tasks=args.num_processes,
                         attention_policy=True,
                         max_grad_norm=args.max_grad_norm,
                         weight_decay=args.weight_decay)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    val_rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  val_envs.observation_space.shape,
                                  val_envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    val_obs = val_envs.reset()
    val_rollouts.obs[0].copy_(val_obs)
    val_rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    # The loop index is offset by continue_from_epoch, so the index of the
    # final update must carry the same offset.
    last_update = args.continue_from_epoch + num_updates - 1

    save_copy = True
    for j in range(args.continue_from_epoch,
                   args.continue_from_epoch + num_updates):

        # policy rollouts
        for step in range(args.num_steps):
            # Sample actions
            actor_critic.eval()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            actor_critic.train()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])
                    for k, v in info['episode'].items():
                        summary_writer.add_scalar(
                            f'training/{k}', v,
                            j * args.num_processes * args.num_steps +
                            args.num_processes * step)

            # A 0.0 mask marks an environment whose episode just ended.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        actor_critic.eval()
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        actor_critic.train()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # Snapshot taken once after each evaluation (and before the very
        # first update) so a failed evaluation can roll everything back.
        if save_copy:
            prev_weights = copy.deepcopy(actor_critic.state_dict())
            prev_opt_state = copy.deepcopy(agent.optimizer.state_dict())
            prev_val_opt_state = copy.deepcopy(
                val_agent.optimizer.state_dict())
            save_copy = False

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # validation rollouts
        for val_iter in range(args.val_agent_steps):
            for step in range(args.num_steps):
                # Sample actions
                actor_critic.eval()
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        val_rollouts.obs[step],
                        val_rollouts.recurrent_hidden_states[step],
                        val_rollouts.masks[step])
                actor_critic.train()

                # Observe reward and next obs
                obs, reward, done, infos = val_envs.step(action)

                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info else [1.0]
                     for info in infos])
                val_rollouts.insert(obs, recurrent_hidden_states, action,
                                    action_log_prob, value, reward, masks,
                                    bad_masks)

            actor_critic.eval()
            with torch.no_grad():
                next_value = actor_critic.get_value(
                    val_rollouts.obs[-1],
                    val_rollouts.recurrent_hidden_states[-1],
                    val_rollouts.masks[-1]).detach()
            actor_critic.train()

            val_rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                         args.gae_lambda,
                                         args.use_proper_time_limits)

            val_value_loss, val_action_loss, val_dist_entropy = val_agent.update(
                val_rollouts)
            val_rollouts.after_update()

        # save for every interval-th update or for the last update
        if (j % args.save_interval == 0
                or j == last_update) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path,
                            args.env_name + "-epoch-{}.pt".format(j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))
        revert = False
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            actor_critic.eval()
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            eval_r = {}
            printout = f'Seed {args.seed} Iter {j} '
            for eval_disp_name, eval_env_name in EVAL_ENVS.items():
                eval_r[eval_disp_name] = evaluate(
                    actor_critic,
                    obs_rms,
                    eval_envs_dic,
                    eval_disp_name,
                    args.seed,
                    args.num_processes,
                    eval_env_name[1],
                    logdir,
                    device,
                    steps=args.task_steps,
                    recurrent=args.recurrent_policy,
                    obs_recurrent=args.obs_recurrent,
                    multi_task=True,
                    free_exploration=args.free_exploration)
                if eval_disp_name in prev_eval_r:
                    diff = np.array(eval_r[eval_disp_name]) - np.array(
                        prev_eval_r[eval_disp_name])
                    # Only 'many_arms' decides whether the update is kept:
                    # (#improved - #worsened) must reach the threshold.
                    if eval_disp_name == 'many_arms':
                        if np.sum(diff > 0) - np.sum(
                                diff < 0) < args.val_improvement_threshold:
                            print('no update')
                            revert = True

                summary_writer.add_scalar(f'eval/{eval_disp_name}',
                                          np.mean(eval_r[eval_disp_name]),
                                          (j + 1) * args.num_processes *
                                          args.num_steps)
                log_dict[eval_disp_name].append([
                    (j + 1) * args.num_processes * args.num_steps,
                    eval_r[eval_disp_name]
                ])
                printout += eval_disp_name + ' ' + str(
                    np.mean(eval_r[eval_disp_name])) + ' '
            if revert:
                actor_critic.load_state_dict(prev_weights)
                agent.optimizer.load_state_dict(prev_opt_state)
                val_agent.optimizer.load_state_dict(prev_val_opt_state)
            else:
                print(printout)
                prev_eval_r = eval_r.copy()
            save_copy = True
            actor_critic.train()

    save_obj(log_dict, os.path.join(logdir, 'log_dict.pkl'))
    envs.close()
    val_envs.close()
    for eval_disp_name in EVAL_ENVS:
        eval_envs_dic[eval_disp_name].close()
    # Flush and release the TensorBoard event file.
    summary_writer.close()
示例#21
0
def main():
    """Train an A2C / PPO / ACKTR agent, optionally with a learned reward.

    Configuration comes from the module-level ``args_iko``; the function
    also relies on the module-level names ``num_updates``, ``writer`` and
    ``eval_log_dir``.  Per update it collects ``args_iko.num_steps`` steps
    from the vectorised environments, logs the mean of the environment
    reward and of several auxiliary reward terms (read from the first
    environment's ``info`` dict) to ``writer``, runs one agent update and
    periodically saves the policy.

    When ``args_iko.use_singh`` is set, the output of ``RewardModel`` scaled
    by ``args_iko.singh_coef`` is added to the environment reward before it
    is stored in the rollout buffer.

    NOTE(review): ``episode_rewards`` is never appended to in this function,
    so the ``len(episode_rewards) > 1`` guards on the console-log and
    evaluation branches are never satisfied as written.

    Raises:
        ValueError: If ``args_iko.algo`` is not ``'a2c'``, ``'ppo'`` or
            ``'acktr'``.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args_iko.cuda else "cpu")

    if args_iko.vis:
        from visdom import Visdom
        viz = Visdom(port=args_iko.port)
        win = None

    envs = make_vec_envs(args_iko.env_name, args_iko.seed,
                         args_iko.num_processes, args_iko.gamma,
                         args_iko.log_dir, args_iko.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args_iko.recurrent_policy})
    actor_critic.to(device)

    reward_model = RewardModel(11 * 11 * 6, 1, 64, 64)
    reward_model.to(device)

    if args_iko.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               lr=args_iko.lr,
                               eps=args_iko.eps,
                               alpha=args_iko.alpha,
                               max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args_iko.clip_param,
                         args_iko.ppo_epoch,
                         args_iko.num_mini_batch,
                         args_iko.value_loss_coef,
                         args_iko.entropy_coef,
                         args_iko.use_singh,
                         reward_model,
                         lr=args_iko.lr,
                         eps=args_iko.eps,
                         max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               acktr=True)
    else:
        # Fail here rather than with an unbound `agent` further down.
        raise ValueError("unsupported algo: {}".format(args_iko.algo))

    rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    # Auxiliary reward terms read from infos[0]; each one is logged under
    # the scalar tag 'mean_<key>'.
    aux_reward_keys = (
        'reward_block_penalty',
        'reward_bel_gt',
        'reward_bel_gt_nonlog',
        'reward_infogain',
        'reward_bel_ent',
        'reward_hit',
        'reward_dist',
        'reward_inv_dist',
    )

    for j in range(num_updates):

        if args_iko.use_linear_lr_decay:
            # decrease learning rate linearly
            if args_iko.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args_iko.lr)

        if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay:
            agent.clip_param = args_iko.clip_param * (1 -
                                                      j / float(num_updates))

        reward_train = []
        aux_rewards = {key: [] for key in aux_reward_keys}

        for step in range(args_iko.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            reward_train.append(reward)
            # Only the first environment's auxiliary terms are tracked.
            for key in aux_reward_keys:
                aux_rewards[key].append(infos[0][key])

            if args_iko.use_singh:
                # Add the learned reward; the result of .type() is a CPU
                # float tensor, matching the original arithmetic.
                my_reward = reward_model(obs.clone().to(device),
                                         action.clone().float()).detach()
                reward = reward + args_iko.singh_coef * my_reward.type(
                    torch.FloatTensor)

            # A 0.0 mask marks an environment whose episode just ended.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps
        writer.add_scalar('mean_reward_train', np.mean(reward_train),
                          total_num_steps)
        for key in aux_reward_keys:
            writer.add_scalar('mean_' + key, np.mean(aux_rewards[key]),
                              total_num_steps)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma,
                                 args_iko.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th update or for the last update
        if (j % args_iko.save_interval == 0
                or j == num_updates - 1) and args_iko.save_dir != "":
            save_path = os.path.join(args_iko.save_dir, args_iko.algo)
            # exist_ok tolerates a pre-existing directory while still
            # surfacing real failures such as permission errors.
            os.makedirs(save_path, exist_ok=True)

            # Save a CPU copy so the checkpoint loads without CUDA.
            save_model = actor_critic
            if args_iko.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            # The file name encodes the reward-shaping configuration.
            checkpoint_name = (
                'ugl' + str(args_iko.use_gt_likelihood) +
                'block-pen-' + str(args_iko.penalty_for_block) + '_' +
                'explore-' + str(args_iko.rew_explore) + '_' +
                'bel-new-' + str(args_iko.rew_bel_new) + '_' +
                'bel-ent-' + str(args_iko.rew_bel_ent) + '_' +
                'infogain-' + str(args_iko.rew_infogain) + '_' +
                'bel-gt-nolog-' + str(args_iko.rew_bel_gt_nonlog) + '_' +
                'bel-gt-' + str(args_iko.rew_bel_gt) + '_' +
                'dist-' + str(args_iko.rew_dist) + '_' +
                'hit-' + str(args_iko.rew_hit) + '_' +
                'inv-dist-' + str(args_iko.rew_inv_dist) +
                args_iko.algo + ".pt")

            torch.save(save_model, os.path.join(save_path, checkpoint_name))

        if j % args_iko.log_interval == 0 and len(episode_rewards) > 1:
            # The original referenced undefined `reward_a` / `reward_b`.
            # `reward_train` is the list this label was printed from in
            # earlier revisions; the second list no longer exists.
            print("mean reward_a", np.mean(reward_train))

        if (args_iko.eval_interval is not None and len(episode_rewards) > 1
                and j % args_iko.eval_interval == 0):
            eval_envs = make_vec_envs(args_iko.env_name,
                                      args_iko.seed + args_iko.num_processes,
                                      args_iko.num_processes, args_iko.gamma,
                                      eval_log_dir, args_iko.add_timestep,
                                      device, True)

            # Reuse the training observation statistics for evaluation.
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args_iko.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args_iko.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                # Keep the masks on the same device as the hidden states.
                eval_masks = torch.FloatTensor(
                    [[0.0] if done_ else [1.0]
                     for done_ in done]).to(device)
                for info in infos:
                    if 'episode' in info:
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args_iko.vis and j % args_iko.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args_iko.log_dir,
                                  args_iko.env_name, args_iko.algo,
                                  args_iko.num_env_steps)
            except IOError:
                pass
    writer.close()
def main(env, scene_path):
    """Train a policy on vectorised environments and return the step count.

    Configuration is read from the ``args`` and ``use_metric`` names defined
    outside this function. When ``use_metric`` is true the loop continues
    until the evaluated success rate reaches ``args.trg_succ_rate`` or until
    ``patience`` consecutive evaluations bring no improvement; otherwise it
    runs for a fixed number of updates derived from ``args.num_env_steps``.

    Args:
        env: Environment identifier forwarded to ``make_vec_envs``.
        scene_path: Scene description forwarded to ``make_vec_envs``.

    Returns:
        ``updates_done * args.num_processes * args.num_steps`` for the
        training rollouts that were executed (0 if none completed).

    Raises:
        ValueError: If ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    # Number of consecutive evaluations without improvement after which the
    # policy is treated as converged. Used both for the stop condition and for
    # choosing the "_final" checkpoint suffix so the two cannot drift apart.
    patience = 10

    try:
        os.makedirs(args.log_dir)
    except OSError:
        # Typically the directory already exists: drop monitor files left in
        # it so they do not mix with this run.
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)
    save_path = os.path.join(args.save_dir, args.algo)

    # (total_num_steps, success percentage) pairs collected at evaluations.
    eval_x = []
    eval_y = []

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # NOTE(review): torch.load unpickles arbitrary objects; only point
    # --initial-policy / --pose-estimator at trusted checkpoints.
    initial_policies = torch.load(os.path.join(args.load_dir, args.algo,
                                               args.initial_policy + ".pt")) \
        if args.initial_policy else None

    if args.reuse_residual:
        # The checkpoint is expected to hold the triple written by the save
        # code further down in this function.
        residual, ob_rms, initial_policies = initial_policies
    else:
        residual = None
        ob_rms = None

    pose_estimator = torch.load(os.path.join(args.load_dir, "pe",
                                             args.pose_estimator + ".pt")) \
        if args.pose_estimator else None

    envs = make_vec_envs(env,
                         scene_path,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         initial_policies,
                         pose_estimator=pose_estimator,
                         init_control=not args.dense_ip)
    if args.reuse_residual:
        # Restore the observation statistics saved with the reused residual.
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms

    base_kwargs = {'recurrent': args.recurrent_policy}
    base = residual.base if args.reuse_residual else None
    dist = residual.dist if args.reuse_residual else None
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs=base_kwargs,
                          zero_last_layer=True,
                          base=base,
                          dist=dist)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         burn_in=initial_policies is not None
                         and not args.reuse_residual)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here with a clear message instead of a NameError on `agent`
        # once the first update is attempted.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=64)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    total_num_steps = 0
    j = 0
    max_succ = -1
    max_mean_rew = -math.inf
    mean_ep_rew = -math.inf
    evals_without_improv = 0

    start = time.time()
    start_update = start
    while (not use_metric
           and j < num_updates) or (use_metric
                                    and max_succ < args.trg_succ_rate):
        if args.eval_interval is not None and j % args.eval_interval == 0:
            # NOTE(review): evaluation steps the training envs in place and
            # rollouts.obs is not refreshed afterwards - confirm intended.
            print("Evaluating current policy...")
            i = 0
            total_successes = 0
            # At least one full batch of episodes must be evaluated, otherwise
            # the success percentage below would divide by zero when there are
            # more than 50 worker processes.
            max_trials = max(50, args.num_processes)
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)
            while i + args.num_processes <= max_trials:

                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                obs, _, dones, infos = envs.step(action)

                if np.all(dones):  # Rigid - assumes episodes are fixed length
                    rews = [info['rew_success'] for info in infos]
                    i += args.num_processes
                    total_successes += sum(int(rew > 0) for rew in rews)

            p_succ = (100 * total_successes / i)
            eval_x += [total_num_steps]
            eval_y += [p_succ]

            end = time.time()
            print(
                f"Evaluation: {total_successes} successful out of {i} episodes - "
                f"{p_succ:.2f}% successful. Eval length: {end - start_update}")
            torch.save([eval_x, eval_y],
                       os.path.join(args.save_as + "_eval.pt"))
            start_update = end

            if p_succ > max_succ:
                max_succ = p_succ
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            elif mean_ep_rew > max_mean_rew:
                print("Unimproved success rate, higher reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            else:
                evals_without_improv += 1

            if evals_without_improv == patience or max_succ >= args.trg_succ_rate:
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]
                # A converged-but-unimproved policy gets its own file name so
                # it does not overwrite the checkpoint written at the last
                # improvement.
                extra = "_final" if evals_without_improv == patience else ""
                # This branch can be reached before the periodic save below
                # has ever created the directory.
                os.makedirs(save_path, exist_ok=True)
                torch.save(
                    save_model,
                    os.path.join(save_path, args.save_as + f"{extra}.pt"))
                break

        # save for every interval-th episode or for the last epoch
        if ((not use_metric and
             (j % args.save_interval == 0 or j == num_updates - 1)) or
            (use_metric
             and evals_without_improv == 0)) and args.save_dir != "":
            os.makedirs(save_path, exist_ok=True)

            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            if pose_estimator is not None:
                save_model = [save_model, pose_estimator, initial_policies]
            else:
                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]

            torch.save(save_model, os.path.join(save_path,
                                                args.save_as + ".pt"))
            # torch.save(save_model, os.path.join(save_path, args.save_as + f"{j * args.num_processes * args.num_steps}.pt"))

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            mean_ep_rew = np.mean(episode_rewards)
            if mean_ep_rew > max_mean_rew:
                print("Improved max mean reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            end = time.time()
            # The trailing three arguments have no placeholder and are
            # ignored by str.format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), mean_ep_rew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            print("Update length: ", end - start_update)
            start_update = end

        if args.vis and (j % args.vis_interval == 0 or
                         (not use_metric and j == num_updates - 1)):
            try:
                # Sometimes monitor doesn't properly flush the outputs
                visdom_plot(args.log_dir, args.save_as, args.algo,
                            total_num_steps)
            except IOError:
                pass

        j += 1

    if use_metric:
        if max_succ >= args.trg_succ_rate:
            print(
                f"Achieved greater than {args.trg_succ_rate}% success, advancing curriculum."
            )
        else:
            print(
                f"Policy converged with max success rate < {args.trg_succ_rate}%"
            )
    # Copy logs to permanent location so new graphs can be drawn.
    copy_tree(args.log_dir, os.path.join('logs', args.save_as))
    envs.close()
    return total_num_steps
示例#23
0
def main():
    """Run the training loop, keeping a bounded window of checkpoints.

    Configuration comes from the ``args`` name defined outside this function;
    ``num_updates`` and ``eval_log_dir`` are likewise read from the enclosing
    scope. Progress lines are printed and mirrored to ``log_info.txt`` inside
    ``<args.save_dir>/<args.algo>``. A checkpoint is written every
    ``args.save_interval`` updates and only the most recent 100 written by
    this run are kept on disk.

    Raises:
        ValueError: If ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    # os.path.isdir(None) raises TypeError, and None is a valid value for
    # load_policy (it is tested for below), so guard the directory lookup.
    if args.load_policy is not None and os.path.isdir(args.load_policy):
        args.load_policy = find_policy(args.load_policy)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)
    if args.load_policy is not None:
        # NOTE(review): torch.load unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        actor_critic, ob_rms = torch.load(args.load_policy)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here with a clear message instead of a NameError on `agent`
        # once the first update is attempted.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(
        maxlen=(args.num_processes if args.num_processes > 10 else 10))

    start = time.time()
    # Paths of checkpoints written by this run, oldest first. Tracking the
    # real paths keeps rotation correct for any save_interval.
    max_snapshots = 100
    saved_snapshots = deque()
    save_path = os.path.join(args.save_dir, args.algo)
    os.makedirs(save_path, exist_ok=True)

    # The context manager guarantees the log file is closed even if training
    # aborts with an exception.
    with open(os.path.join(save_path, 'log_info.txt'), 'w') as log_out_file:
        for j in range(num_updates):

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                if args.algo == "acktr":
                    # use optimizer's learning rate since it's hard-coded in kfac.py
                    update_linear_schedule(agent.optimizer, j, num_updates,
                                           agent.optimizer.lr)
                else:
                    update_linear_schedule(agent.optimizer, j, num_updates,
                                           args.lr)

            if args.algo == 'ppo' and args.use_linear_clip_decay:
                agent.clip_param = args.clip_param * (1 -
                                                      j / float(num_updates))

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.tau)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                os.makedirs(save_path, exist_ok=True)

                # A really ugly way to save a model to CPU
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None)
                ]

                snapshot_path = os.path.join(
                    save_path, args.env_name + "epoch_{:07d}.pt".format(j))
                torch.save(save_model, snapshot_path)
                saved_snapshots.append(snapshot_path)

                # Drop the oldest checkpoint once the window is exceeded.
                if len(saved_snapshots) > max_snapshots:
                    stale_snapshot = saved_snapshots.popleft()
                    try:
                        os.remove(stale_snapshot)
                    except FileNotFoundError:
                        # Already removed externally; nothing left to do.
                        pass

            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            if j % args.log_interval == 0 and len(episode_rewards) > 1:
                end = time.time()
                # The trailing three arguments have no placeholder and are
                # ignored by str.format.
                log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".\
                    format(j, total_num_steps,
                           int(total_num_steps / (end - start)),
                           len(episode_rewards),
                           np.mean(episode_rewards),
                           np.median(episode_rewards),
                           np.min(episode_rewards),
                           np.max(episode_rewards), dist_entropy,
                           value_loss, action_loss)
                print(log_info)
                sys.stdout.flush()
                log_out_file.write(log_info)
                log_out_file.flush()

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                eval_envs = make_vec_envs(args.env_name,
                                          args.seed + args.num_processes,
                                          args.num_processes, args.gamma,
                                          eval_log_dir, args.add_timestep,
                                          device, True)

                vec_norm = get_vec_normalize(eval_envs)
                if vec_norm is not None:
                    # Evaluate with the training statistics, frozen.
                    vec_norm.eval()
                    vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

                eval_episode_rewards = []

                obs = eval_envs.reset()
                eval_recurrent_hidden_states = torch.zeros(
                    args.num_processes,
                    actor_critic.recurrent_hidden_state_size,
                    device=device)
                eval_masks = torch.zeros(args.num_processes, 1, device=device)

                while len(eval_episode_rewards) < 10:
                    with torch.no_grad():
                        _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                            obs,
                            eval_recurrent_hidden_states,
                            eval_masks,
                            deterministic=True)

                    # Observe reward and next obs
                    obs, reward, done, infos = eval_envs.step(action)

                    # Keep the masks on the same device as the hidden states
                    # they are combined with inside the policy.
                    eval_masks = torch.FloatTensor(
                        [[0.0] if done_ else [1.0]
                         for done_ in done]).to(device)
                    for info in infos:
                        if 'episode' in info.keys():
                            eval_episode_rewards.append(info['episode']['r'])

                eval_envs.close()

                eval_info = " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards))
                print(eval_info)
                log_out_file.write(eval_info)
                log_out_file.flush()
                sys.stdout.flush()

            if args.vis and j % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                      args.algo, args.num_env_steps)
                except IOError:
                    pass
示例#24
0
def main():
    """Train a graph-based policy on MiniWoB tasks with TensorBoard logging.

    Parses arguments with ``get_args()``, builds the vectorised MiniWoB
    environments, the ``GNNBase`` policy and the selected agent (optionally
    with a GAIL discriminator), then runs the update loop. The model is saved
    every ``args.save_interval`` updates, statistics are written through a
    ``tensorboardX.SummaryWriter`` (closed on exit, including on error), and
    ``evaluate`` is called every ``args.eval_interval`` updates.

    Raises:
        ValueError: If ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    # Function-local imports, as in the original; hoisted to the top so they
    # are resolved once rather than inside the training loop.
    import datetime
    from pprint import pprint

    from tensorboardX import SummaryWriter

    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    receipts = StorageReceipt()

    def make_env(tasks):
        # Build one MiniWoB environment restricted to the given task list.
        return MiniWoBGraphEnvironment(
            base_url=os.environ.get("BASE_URL", f"file://{MINIWOB_HTML}/"),
            levels=tasks,
            level_tracker=LevelTracker(tasks),
            wait_ms=500,
        )

    task = args.env_name
    if args.env_name == "PongNoFrameskip-v4":
        # Presumably the argument parser's default env name; it is remapped
        # to the click-button task here.
        args.env_name = "clickbutton"
        task = "miniwob/click-button.html"
    if task == "levels":
        tasks = MINIWOB_CHALLENGES
    else:
        tasks = [[task]]
    print("Selected tasks:", tasks)
    NUM_ACTIONS = 1
    # Task lists are assigned to the worker processes round-robin.
    envs = make_vec_envs(
        [make_env(tasks[i % len(tasks)]) for i in range(args.num_processes)],
        receipts)

    if os.path.exists("./datadir/autoencoder.pt"):
        # NOTE(review): torch.load unpickles arbitrary objects; the file must
        # come from a trusted source.
        dom_autoencoder = torch.load("./datadir/autoencoder.pt")
        dom_encoder = dom_autoencoder.encoder
        # The pretrained encoder is used frozen.
        for param in dom_encoder.parameters():
            param.requires_grad = False
    else:
        print("No dom encoder")
        dom_encoder = None
    actor_critic = Policy(
        envs.observation_space.shape,
        gym.spaces.Discrete(NUM_ACTIONS),  # envs.action_space,
        base=GNNBase,
        base_kwargs={
            "dom_encoder": dom_encoder,
            "recurrent": args.recurrent_policy
        },
    )
    actor_critic.dist = NodeObjective()
    actor_critic.to(device)

    if args.algo == "a2c":
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "ppo":
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "acktr":
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here with a clear message instead of a NameError on `agent`
        # once the first update is attempted.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(envs.observation_space.shape[0], 100,
                                   device)

        rr = ReplayRepository("/code/miniwob-plusplus-demos/*turk/*")
        ds = rr.get_dataset()
        print("GAIL Replay Dataset", ds)
        gail_train_loader = torch_geometric.data.DataLoader(
            ds, batch_size=args.gail_batch_size, shuffle=True, drop_last=True)

    def obsfilt(x, update):
        # Identity observation filter handed to the GAIL discriminator
        # (in place of utils.get_vec_normalize(envs)._obfilt).
        return x

    rollouts = ReceiptRolloutStorage(
        args.num_steps,
        args.num_processes,
        (1, ),  # envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
        receipts,
    )

    # resume from last save
    if args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        os.makedirs(save_path, exist_ok=True)

        model_path = os.path.join(save_path, args.env_name + ".pt")
        # Resuming is deliberately disabled by the leading `False`.
        if False and os.path.exists(model_path):
            print("Loadng previous model:", model_path)
            actor_critic = torch.load(model_path)
            actor_critic.train()

    obs = envs.reset()
    rollouts.obs[0].copy_(torch.tensor(obs))
    rollouts.to(device)

    ts_str = datetime.datetime.fromtimestamp(
        time.time()).strftime("%Y-%m-%d_%H-%M-%S")
    tensorboard_writer = SummaryWriter(
        log_dir=os.path.join("/tmp/log", ts_str))

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print("Iterations:", num_updates, args.num_steps)
    try:
        for j in range(num_updates):
            # Reward statistics are collected per update, not across updates.
            episode_rewards = deque(maxlen=args.num_steps *
                                    args.num_processes)
            # `j and` skips the first update, where last_action_time is not
            # yet defined.
            if j and last_action_time + 5 < time.time():
                # task likely timed out
                print("Reseting tasks")
                obs = envs.reset()
                rollouts.obs[0].copy_(torch.tensor(obs))
                rollouts.recurrent_hidden_states[0].copy_(
                    torch.zeros_like(rollouts.recurrent_hidden_states[0]))
                rollouts.masks[0].copy_(torch.zeros_like(rollouts.masks[0]))

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer,
                    j,
                    num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr,
                )

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        receipts.redeem(rollouts.obs[step]),
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step],
                    )

                # Observe reward and next obs
                last_action_time = time.time()
                obs, reward, done, infos = envs.step(action)

                # Record the action the environment reports as executed, and
                # zero the action on transitions flagged as bad.
                for e, i in enumerate(infos):
                    if i.get("real_action") is not None:
                        action[e] = i["real_action"]
                    if i.get("bad_transition"):
                        action[e] = torch.zeros_like(action[e])

                for info in infos:
                    if "episode" in info.keys():
                        episode_rewards.append(info["episode"]["r"])

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if "bad_transition" in info.keys() else [1.0]
                     for info in infos])
                rollouts.insert(
                    torch.tensor(obs),
                    recurrent_hidden_states,
                    action,
                    action_log_prob,
                    value,
                    torch.tensor(reward).unsqueeze(1),
                    masks,
                    bad_masks,
                )

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    receipts.redeem(rollouts.obs[-1]),
                    rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1],
                ).detach()

            if args.gail:
                # if j >= 10:
                #    envs.venv.eval()

                gail_epoch = args.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    gl = discr.update(gail_train_loader, rollouts, obsfilt)
                print("Gail loss:", gl)

                # Replace environment rewards with discriminator rewards.
                for step in range(args.num_steps):
                    rollouts.rewards[step] = discr.predict_reward(
                        receipts.redeem(rollouts.obs[step]),
                        rollouts.actions[step],
                        args.gamma,
                        rollouts.masks[step],
                    )

            rollouts.compute_returns(
                next_value,
                args.use_gae,
                args.gamma,
                args.gae_lambda,
                args.use_proper_time_limits,
            )

            value_loss, action_loss, dist_entropy = agent.update(rollouts)
            # NOTE(review): this sampled `obs` is never read (it is
            # reassigned before its next use). It is kept because
            # torch.randint advances the global RNG, so removing it would
            # change seeded runs.
            obs_shape = rollouts.obs.size()[2:]
            obs = rollouts.obs[:-1].view(-1, *obs_shape)
            obs = obs[torch.randint(0, obs.size(0), (1, 32))]

            rollouts.after_update()

            receipts.prune(rollouts.obs)

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                save_path = os.path.join(args.save_dir, args.algo)
                os.makedirs(save_path, exist_ok=True)

                model_path = os.path.join(save_path, args.env_name + ".pt")
                torch.save(actor_critic, model_path)
                print("Saved model:", model_path)

            if j % args.log_interval == 0 and len(episode_rewards) > 1:
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                end = time.time()
                # The trailing three arguments have no placeholder and are
                # ignored by str.format.
                print(
                    "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(
                        j,
                        total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards),
                        np.mean(episode_rewards),
                        np.median(episode_rewards),
                        np.min(episode_rewards),
                        np.max(episode_rewards),
                        dist_entropy,
                        value_loss,
                        action_loss,
                    ))

                pprint(LevelTracker.global_scoreboard)

                # tensorboard_writer.add_histogram(
                #    "task_ranks", torch.tensor(predictor._difficulty_rank), total_num_steps
                # )
                tensorboard_writer.add_histogram("value", value,
                                                 total_num_steps)
                tensorboard_writer.add_histogram("x",
                                                 actor_critic.base.last_x,
                                                 total_num_steps)
                tensorboard_writer.add_histogram(
                    "query", actor_critic.base.last_query, total_num_steps)
                tensorboard_writer.add_histogram(
                    "inputs_at", actor_critic.base.last_inputs_at,
                    total_num_steps)

                tensorboard_writer.add_scalar("mean_reward",
                                              np.mean(episode_rewards),
                                              total_num_steps)
                tensorboard_writer.add_scalar("median_reward",
                                              np.median(episode_rewards),
                                              total_num_steps)
                tensorboard_writer.add_scalar("min_reward",
                                              np.min(episode_rewards),
                                              total_num_steps)
                tensorboard_writer.add_scalar("max_reward",
                                              np.max(episode_rewards),
                                              total_num_steps)
                tensorboard_writer.add_scalar("dist_entropy", dist_entropy,
                                              total_num_steps)
                tensorboard_writer.add_scalar("value_loss", value_loss,
                                              total_num_steps)
                tensorboard_writer.add_scalar("action_loss", action_loss,
                                              total_num_steps)

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                # Tolerate envs without a normalisation wrapper (assumes
                # get_vec_normalize returns None in that case - TODO confirm).
                ob_rms = getattr(utils.get_vec_normalize(envs), "ob_rms", None)
                evaluate(
                    actor_critic,
                    ob_rms,
                    args.env_name,
                    args.seed,
                    args.num_processes,
                    eval_log_dir,
                    device,
                )
    finally:
        # Flush and release the event file even if training aborts.
        tensorboard_writer.close()
示例#25
0
def main():

    realEval = True  #False

    gettrace = getattr(sys, 'gettrace', None)

    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument('--action-type',
                        type=int,
                        default=-1,
                        help='action type to play (default: -1)')

    parser.add_argument('--tasks-difficulty-from',
                        type=int,
                        default=0,
                        help='tasks_difficulty_from')

    parser.add_argument('--tasks-difficulty-to',
                        type=int,
                        default=100000,
                        help='tasks-difficulty-to')

    parser.add_argument('--verboseLevel',
                        type=int,
                        default=5,
                        help='verboseLevel')

    parser.add_argument('--filesNamesSuffix',
                        default="",
                        help='filesNamesSuffix')

    parser.add_argument('--nobest-exit',
                        type=int,
                        default=10000,
                        help='nobest_exit')

    args = get_args(parser)

    args.algo = 'ppo'
    args.env_name = 'QuadruppedWalk-v1'  #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1'
    args.use_gae = True
    args.num_steps = 2048
    #args.num_processes = 4
    args.num_processes = 4
    if gettrace():
        args.num_processes = 1
    args.lr = 0.0001
    args.entropy_coef = 0.0
    args.value_loss_coef = 0.5
    args.ppo_epoch = 4
    args.num_mini_batch = 256
    args.gamma = 0.99
    args.gae_lambda = 0.95
    args.clip_param = 0.2
    args.use_linear_lr_decay = True  #True #True #True #True
    args.use_proper_time_limits = True
    args.save_dir = "./trained_models/" + args.env_name + "/"
    args.load_dir = "./trained_models/" + args.env_name + "/"
    args.log_dir = "./logs/robot"
    if gettrace():
        args.save_dir = "./trained_models/" + args.env_name + "debug/"
        args.load_dir = "./trained_models/" + args.env_name + "debug/"
        args.log_dir = "./logs/robot_d"
    args.log_interval = 30
    args.hidden_size = 64
    args.last_hidden_size = args.hidden_size
    args.recurrent_policy = False  #True
    args.save_interval = 20
    #args.seed = 1
    reward_shaping = 0.01
    allowMutate = False

    if args.seed == -1:
        args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME)

    quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from
    quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to

    # 0 is a walk
    # 1 is a balance
    # 2 multitasks
    # 3 multitask experiments
    trainType = 14
    filesNamesSuffix = ""
    if args.action_type >= 0:
        trainType = args.action_type

    makeEnvFunction = makeEnv.make_env_with_best_settings
    if trainType == 1:
        filesNamesSuffix = "balance_"
        makeEnvFunction = makeEnv.make_env_for_balance

    if trainType == 2:
        filesNamesSuffix = "analytical_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical

    if trainType == 3:
        filesNamesSuffix = "analytical2_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2

    if trainType == 4:
        filesNamesSuffix = "frontback_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back

    if trainType == 5:
        filesNamesSuffix = "leftright_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right

    if trainType == 6:
        filesNamesSuffix = "all_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 7:
        filesNamesSuffix = "rotate_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate

    if trainType == 8:
        filesNamesSuffix = "compound_"
        makeEnvFunction = make_env_multinetwork

    if trainType == 9:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "test_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test

    if trainType == 10:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "zoo_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo

    if trainType == 11:
        args.hidden_size = 128  #64 #128
        args.last_hidden_size = args.hidden_size

        import pickle
        if gettrace():
            args.num_processes = 1
        else:
            args.num_processes = 8
        realEval = False
        allowMutate = False
        args.lr = 0.00001
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 10000000
        filesNamesSuffix = "zigote2_updown_"
        print("Samples preload")
        global samplesEnvData
        samplesEnvData = pickle.load(
            open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb"))
        # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) )
        makeEnvFunction = makeSamplesEnv

    if trainType == 12:
        import pickle
        args.lr = 0.00001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = "zigote2_front_back_"
        args.clip_param = 0.9
        args.value_loss_coef = 0.9
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train
        #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record
        #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) )

    if trainType == 13:
        filesNamesSuffix = "all_bytasks_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 14:
        #args.lr = 0.00001
        #args.num_env_steps = 000000
        #args.clip_param = 0.5
        #args.value_loss_coef  =0.8
        #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME))
        #args.num_steps = random.choice([256,512,1024,2048,4096])
        #args.num_mini_batch = random.choice([32,64,256,512])
        #args.ppo_epoch  = random.choice([2,4,8,10])
        #args.clip_param = random.choice([0.2,0.4,0.6,0.8])
        #args.value_loss_coef  =random.choice([0.4,0.5,0.6,0.8])
        #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005])

        args.num_steps = 2048
        args.num_mini_batch = 64
        args.ppo_epoch = 8
        args.lr = 0.0001

        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        #
        filesNamesSuffix = args.filesNamesSuffix
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
        '''
        num_steps: 1024 num_mini_batch 64 ppo_epoch 2
        clip_param: 0.2 value_loss_coef 0.6 lr 0.0001
        '''

    if trainType == 15:
        args.num_env_steps = 5000000
        filesNamesSuffix = "zigote_updown_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic

    if trainType == 16:
        args.lr = 0.00001
        filesNamesSuffix = "compound_tasks_"
        makeEnvFunction = make_env_multinetwork

    reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping)

    print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed,
          "num env steps:", args.num_env_steps, " tasks_dif",
          args.tasks_difficulty_from, args.tasks_difficulty_to)

    print("Num processes:", args.num_processes)

    print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch,
          "ppo_epoch", args.ppo_epoch)
    print("clip_param:", args.clip_param, "value_loss_coef",
          args.value_loss_coef, "lr", args.lr)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args.log_dir = "/tmp/tensorboard/"
    #TensorboardX
    writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        "ppo"))

    writer.add_scalar('options/num_steps', args.num_steps, 0)
    writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0)
    writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0)
    writer.add_scalar('options/clip_param', args.clip_param, 0)
    writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0)
    writer.add_scalar('options/lr', args.lr, 0)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.set_num_threads(1)

    load_dir = os.path.join(args.load_dir, args.algo)

    multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"]
    if trainType == 8:
        for net in multiNetworkName:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    multiNetworkName2 = [
        "all_bytasks_0_",
        "all_bytasks_1_",
        "all_bytasks_2_",
        "all_bytasks_3_",
        "all_bytasks_4_",
        "all_bytasks_5_",
        "all_bytasks_6_",
        "all_bytasks_7_",
        "all_bytasks_8_",
        "all_bytasks_9_",
        "all_bytasks_10_",
        "all_bytasks_11_",
        "all_bytasks_12_",
    ]
    if trainType == 16:
        for net in multiNetworkName2:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         None,
                         device,
                         False,
                         normalizeOb=False,
                         normalizeReturns=False,
                         max_episode_steps=args.num_steps,
                         makeEnvFunc=makeEnvFunction,
                         num_frame_stack=1,
                         info_keywords=(
                             'episode_steps',
                             'episode_reward',
                             'progress',
                             'servo',
                             'distToTarget',
                         ))
    #print(envs.observation_space.shape,envs.action_space)
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size,
                              'last_hidden_size': args.last_hidden_size,
                              'activation_layers_type': "Tanh"
                          })
    '''
#    if args.load_dir not None:
    load_path = os.path.join(args.load_dir, args.algo)
    actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    '''
    load_path = os.path.join(
        load_dir, "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix,
                                           args.hidden_size))
    #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
    preptrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth"
    loadPretrained = False
    if loadPretrained and os.path.isfile(preptrained_path):
        print("Load preptrained")
        abj = torch.load(preptrained_path)
        print(abj)
        print(actor_critic.base)
        actor_critic.base.load_state_dict()
        actor_critic.base.eval()
    if os.path.isfile(load_path) and not loadPretrained:
        actor_critic, ob_rms = torch.load(load_path)
        actor_critic.eval()
        print("----NN loaded: ", load_path, " -----")
    else:
        bestFilename = os.path.join(
            load_dir,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        if os.path.isfile(bestFilename):
            actor_critic, ob_rms = torch.load(bestFilename)
            actor_critic.eval()
            print("----NN loaded: ", bestFilename, " -----")

    maxReward = -10000.0
    maxSteps = 0
    minDistance = 50000.0

    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    deque_maxLen = 10

    episode_rewards = deque(maxlen=deque_maxLen)
    episode_steps = deque(maxlen=deque_maxLen)
    episode_rewards_alive = deque(maxlen=deque_maxLen)
    episode_rewards_progress = deque(maxlen=deque_maxLen)
    episode_rewards_servo = deque(maxlen=deque_maxLen)
    episode_dist_to_target = deque(maxlen=deque_maxLen)
    '''
    load_path = os.path.join(args.load_dir, args.algo)
    load_path = os.path.join(load_path, args.env_name + ".pt")
    actor_critic, ob_rms = torch.load(load_path)

    actor_critic.to(device)
    actor_critic.eval()
    #ob_rms.eval()
    '''
    '''
    args.use_gym_monitor = 1
    args.monitor_dir = "./results/"
    monitor_path = os.path.join(args.monitor_dir, args.algo)
    monitor_path = os.path.join(monitor_path, args.env_name)

    args.
    if args.use_gym_monitor:
        env = wrappers.Monitor(
            env, monitor_path, video_callable=False, force=True)
    '''
    i_episode = 0

    save_path = os.path.join(args.save_dir, args.algo)
    try:
        os.makedirs(save_path)
    except OSError:
        pass

    trainOnSamplesAndExit = False  #False
    if trainOnSamplesAndExit:
        import pickle
        print("---------------------------------------")
        print("Samples preload")
        data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb"))
        #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) )

        learning_rate = 0.0001
        max_episodes = 100
        max_timesteps = 4000
        betas = (0.9, 0.999)
        log_interval = 1

        envSamples = SamplesEnv(data)
        envSamples.numSteps = max_timesteps

        # create a stochastic gradient descent optimizer
        optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(),
                                     lr=learning_rate,
                                     betas=betas)
        #optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
        # create a loss function
        criterion = nn.MSELoss(reduction="sum")

        # run the main training loop
        for epoch in range(max_episodes):
            state = envSamples.reset()
            time_step = 0
            testReward = 0
            testSteps = 0
            loss_sum = 0
            loss_max = 0

            for t in range(max_timesteps):
                time_step += 1

                nn_state = torch.FloatTensor((state).reshape(1, -1)).to(device)

                optimizer.zero_grad()
                net_out = actor_critic.base.forwardActor(nn_state)
                net_out = actor_critic.dist.fc_mean(net_out)

                state, reward, done, info = envSamples.step(
                    net_out.detach().numpy())
                sim_action = envSamples.recordedActions

                sim_action_t = torch.FloatTensor([sim_action]).to(device)

                loss = criterion(net_out, sim_action_t)
                loss.backward()
                optimizer.step()
                loss_sum += loss.mean()
                loss_max = max(loss_max, loss.max())

                testReward += reward
                testSteps += 1

                if done:
                    if epoch % log_interval == 0:
                        #print(best_action_t*scaleActions-net_out*scaleActions)
                        if args.verboseLevel > 0:
                            print(
                                'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}'
                                .format(epoch, t, testReward, loss_sum / t,
                                        loss_max))
                            print(info)
                        reward = 0
                    break
        bestFilename = os.path.join(
            save_path,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
        ], bestFilename)
        exit(0)

    skipWriteBest = True

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)

    lock(actor_critic, first=False, last=False)
    #if trainType==9:
    #allowMutate = False
    #lock(actor_critic,first=True,last=False)
    #mutate(actor_critic,power=0.00,powerLast=0.3)

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    #from torchsummary import summary

    #summary(actor_critic.base.actor, (1, 48, 64))

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    episodeBucketIndex = 0

    maxReward = -10000000000
    numEval = 10
    if realEval:
        envEval = makeEnvFunction(args.env_name)
        if hasattr(envEval.env, "tasks") and len(envEval.env.tasks):
            numEval = max(numEval, len(envEval.env.tasks))
        maxReward = evaluate_policy(envEval,
                                    actor_critic,
                                    numEval * 2,
                                    render=False,
                                    device=device,
                                    verbose=args.verboseLevel)
        print("MaxReward on start", maxReward)

    noMaxRewardCount = 0

    updateIndex = 0

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        episode_r = 0.0
        stepsDone = 0

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            #envs.venv.venv.venv.envs[0].render()

            if args.verboseLevel > 0:
                index = 0
                for d in done:
                    if d:
                        print(infos[index], flush=True)
                    index += 1

            episodeDone = False
            '''
            index = 0
            for d in done:
                if d:
                    print("")
                    print(infos[index])
                index+=1
            '''

            for info in infos:
                if 'reward' in info.keys():
                    episodeDone = True
                    i_episode += 1
                    episode_rewards.append(info['reward'])
                    writer.add_scalar('reward/episode', info['reward'],
                                      i_episode)
                    #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget'])
                if 'steps' in info.keys():
                    episode_steps.append(info['steps'])
                    writer.add_scalar('reward/steps', info['steps'], i_episode)
                if 'alive' in info.keys():
                    episode_rewards_alive.append(info['alive'])
                    writer.add_scalar('reward/alive', info['alive'], i_episode)
                if 'prog' in info.keys():
                    episode_rewards_progress.append(info['prog'])
                    writer.add_scalar('reward/progress', info['prog'],
                                      i_episode)
                if 'servo' in info.keys():
                    episode_rewards_servo.append(info['servo'])
                    writer.add_scalar('reward/servo', info['servo'], i_episode)
                if 'd2T' in info.keys():
                    episode_dist_to_target.append(info['d2T'])
                    writer.add_scalar('reward/distToTarget', info['d2T'],
                                      i_episode)

                for val in info.keys():
                    if val not in [
                            "reward", "steps", "alive", "prog", "servo", "d2T",
                            'epos', 't'
                    ]:
                        writer.add_scalar('reward/' + val, info[val],
                                          i_episode)

            #if episodeDone and i_episode%10==0:
            #    print(i_episode,"({:.1f}/{}/{:.2f}) ".format(episode_rewards[-1],episode_steps[-1],episode_dist_to_target[-1]),end='',flush=True)

            if episodeDone:
                episodeBucketIndex += 1
                if args.verboseLevel > 0:
                    print("Mean:", Fore.WHITE, np.mean(episode_rewards),
                          Style.RESET_ALL, " Median:", Fore.WHITE,
                          np.median(episode_rewards), Style.RESET_ALL,
                          " max reward:", maxReward)

                #'''len(episode_rewards) and np.mean(episode_rewards)>maxReward and'''
                if realEval:
                    if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval:
                        print("Step:",
                              (j + 1) * args.num_processes * args.num_steps)
                        if skipWriteBest == False:
                            evalReward = evaluate_policy(
                                envEval,
                                actor_critic,
                                numEval,
                                device=device,
                                verbose=args.verboseLevel)

                            writer.add_scalar('reward/eval', evalReward,
                                              i_episode)

                            if evalReward > maxReward:
                                maxReward = evalReward
                                #maxReward = np.mean(episode_rewards)

                                bestFilename = os.path.join(
                                    save_path, "{}_{}{}_best.pt".format(
                                        args.env_name, filesNamesSuffix,
                                        args.hidden_size))
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        maxReward, np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                                torch.save([
                                    actor_critic,
                                    getattr(utils.get_vec_normalize(envs),
                                            'ob_rms', None)
                                ], bestFilename)
                                noMaxRewardCount = 0
                            else:
                                noMaxRewardCount += 1
                                if allowMutate:
                                    if noMaxRewardCount == 5:
                                        print("Mutation low last layer")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.00,
                                               powerLast=0.01)
                                    if noMaxRewardCount == 8:
                                        print("Mutation low non last")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.01,
                                               powerLast=0.0)
                                    if noMaxRewardCount == 11:
                                        print("Mutation low all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.02,
                                               powerLast=0.2)
                                    if noMaxRewardCount == 14:
                                        print("Mutation hi all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.03,
                                               powerLast=0.03)
                                        noMaxRewardCount = 0
                                if noMaxRewardCount == args.nobest_exit:
                                    exit(0)
                        else:
                            skipWriteBest = False
                else:
                    if len(episode_rewards) and np.mean(
                            episode_rewards
                    ) > maxReward and j > args.log_interval:
                        if skipWriteBest == False:
                            maxReward = np.mean(episode_rewards)
                            writer.add_scalar('reward/maxReward', maxReward,
                                              i_episode)

                            bestFilename = os.path.join(
                                save_path, "{}_{}{}_best.pt".format(
                                    args.env_name, filesNamesSuffix,
                                    args.hidden_size))
                            if len(episode_dist_to_target):
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                            else:
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps)),
                                    Style.RESET_ALL, bestFilename)

                            torch.save([
                                actor_critic,
                                getattr(utils.get_vec_normalize(envs),
                                        'ob_rms', None)
                            ], bestFilename)
                        else:
                            skipWriteBest = False
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            shaped_reward = reward_shaper(reward)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, shaped_reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        writer.add_scalar('reward/value_loss', value_loss, updateIndex)
        writer.add_scalar('reward/action_loss', action_loss, updateIndex)
        writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex)

        updateIndex += 1

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            '''
            fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], fileName)
            print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards))

            fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save(actor_critic.state_dict, fileName)
            print("Saved:",fileName)
            '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if args.verboseLevel > 0:
                print("")
                print("Updates {}, num timesteps {}, FPS {}".format(
                    j, total_num_steps, int(total_num_steps / (end - start))))
                print(" Last {} training episodes:".format(
                    len(episode_rewards)))

                print(
                    " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                    format(np.mean(episode_rewards),
                           np.median(episode_rewards), np.min(episode_rewards),
                           np.max(episode_rewards)))

                print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                      format(np.mean(episode_steps), np.median(episode_steps),
                             np.min(episode_steps), np.max(episode_steps)))

                if len(episode_rewards_alive):
                    print(
                        " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_alive),
                                np.median(episode_rewards_alive),
                                np.min(episode_rewards_alive),
                                np.max(episode_rewards_alive)))

                if len(episode_rewards_progress):
                    print(
                        " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_progress),
                                np.median(episode_rewards_progress),
                                np.min(episode_rewards_progress),
                                np.max(episode_rewards_progress)))

                if len(episode_rewards_servo):
                    print(
                        " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_servo),
                                np.median(episode_rewards_servo),
                                np.min(episode_rewards_servo),
                                np.max(episode_rewards_servo)))

                if len(episode_dist_to_target):
                    print(
                        " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}"
                        .format(np.mean(episode_dist_to_target),
                                np.median(episode_dist_to_target),
                                np.min(episode_dist_to_target),
                                np.max(episode_dist_to_target)))

                print(
                    " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n"
                    .format(
                        np.mean(episode_rewards) / np.mean(episode_steps),
                        (0 if len(episode_rewards_progress) == 0 else
                         np.mean(episode_rewards_progress) /
                         np.mean(episode_steps)), dist_entropy, value_loss,
                        action_loss))
示例#26
0
    def run(self):
        """Train the policy configured by ``self.args`` and log its progress.

        Seeds torch, builds the vectorised environments and the IAM
        actor-critic, then alternates between collecting ``args.num_steps``
        transitions per process and updating the agent selected by
        ``args.algo`` ('a2c', 'ppo' or 'acktr').  When ``args.gail`` is set,
        the rewards stored in the rollout are overwritten with the
        discriminator's predicted rewards before returns are computed.

        Side effects: the model is saved every ``args.save_interval`` updates
        (and on the last update) unless ``args.save_dir`` is empty, a
        comma-terminated statistics row is appended to ``self.data_saver``
        every ``args.log_interval`` updates, and ``evaluate`` is called every
        ``args.eval_interval`` updates.

        Raises:
            ValueError: if ``args.algo`` is not a supported algorithm name.
        """
        args = self.args
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        print("CUDA is available: ", torch.cuda.is_available())
        if args.cuda:
            print("CUDA enabled")
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        elif args.cuda_deterministic:
            print("Warning CUDA is requested but is not available")
        else:
            print("CUDA disabled")

        log_dir = os.path.expanduser(args.log_dir)
        eval_log_dir = log_dir + "_eval"
        utils.cleanup_log_dir(log_dir)
        utils.cleanup_log_dir(eval_log_dir)
        print("get_num_thread", torch.get_num_threads())

        device = torch.device("cuda:0" if args.cuda else "cpu")

        envs = make_vec_envs(args.env_name, self.config_parameters, args.seed,
                             args.num_processes, args.gamma, args.log_dir,
                             device, False)

        actor_critic = create_IAM_model(envs, args, self.config_parameters)
        actor_critic.to(device)

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   lr=args.lr,
                                   eps=args.eps,
                                   alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm)
        # This algorithm should be used for the reproduction project.
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   acktr=True)
        else:
            # Fail here rather than with a NameError on ``agent`` later on.
            raise ValueError(f"Unsupported algo: {args.algo!r}")

        if args.gail:
            # The discriminator input is a flat observation/action pair.
            assert len(envs.observation_space.shape) == 1
            discr = gail.Discriminator(
                envs.observation_space.shape[0] + envs.action_space.shape[0],
                100, device)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=4,
                                                subsample_frequency=20)
            # Only drop the last partial batch when there is more than one.
            drop_last = len(expert_dataset) > args.gail_batch_size
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)
        # Statistics are computed over the most recent 100 finished episodes.
        episode_rewards = deque(maxlen=100)

        start = time.time()
        num_updates = int(
            args.num_env_steps) // args.num_steps // args.num_processes
        for j in range(num_updates):

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks,
                                bad_masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            if args.gail:
                if j >= 10:
                    envs.venv.eval()

                # The first 10 updates train the discriminator for longer.
                gail_epoch = args.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    discr.update(gail_train_loader, rollouts,
                                 utils.get_vec_normalize(envs)._obfilt)

                # Replace the stored rewards with the discriminator's.
                for step in range(args.num_steps):
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step])

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.gae_lambda,
                                     args.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                save_path = os.path.join(args.save_dir, args.algo)
                # An existing directory is fine; any other OS error (e.g.
                # permissions) should surface instead of being swallowed.
                os.makedirs(save_path, exist_ok=True)

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, self.model_file_name))

            if j % args.log_interval == 0 and len(episode_rewards) > 1:
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                elapsed_time = time.time() - start
                fps = int(total_num_steps / elapsed_time)
                num_episodes = len(episode_rewards)
                mean_reward = np.mean(episode_rewards)
                median_reward = np.median(episode_rewards)
                min_reward = np.min(episode_rewards)
                max_reward = np.max(episode_rewards)

                # Column order of the row stored in self.data_saver.
                data = [
                    j,  # Updates
                    total_num_steps,  # timesteps
                    fps,  # FPS
                    num_episodes,  # Only useful for print statement
                    mean_reward,  # mean of rewards
                    median_reward,  # median of rewards
                    min_reward,  # min rewards
                    max_reward,  # max rewards
                    dist_entropy,
                    value_loss,
                    action_loss,
                    elapsed_time
                ]
                # Every field, including the last, is followed by a comma.
                output = ''.join([str(x) + ',' for x in data])
                self.data_saver.append(output)
                # end="\r" makes successive log lines overwrite each other.
                print(
                    f"Updates {j}, num timesteps {total_num_steps}, FPS {fps}, elapsed time {int(elapsed_time)} sec. Last {num_episodes} training episodes: mean/median reward {mean_reward:.2f}/{median_reward:.2f}, min/max reward {min_reward:.1f}/{max_reward:.1f}",
                    end="\r")

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                obs_rms = utils.get_vec_normalize(envs).obs_rms
                evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                         args.num_processes, eval_log_dir, device)
示例#27
0
def main():
    """Train an actor-critic policy that also observes token traces.

    Besides the regular environment observation, every step converts the
    ``info['trace']`` entry of each environment into a fixed-length
    (``trace_size``) tensor of token indices that is stored in the rollout
    and passed to the policy.  ``trace_size`` and ``prepare_sequence`` are
    not defined in this function and must be provided by the module.

    The model is saved every ``args.save_interval`` updates (and on the last
    update), statistics are printed every ``args.log_interval`` updates and
    ``evaluate`` is called every ``args.eval_interval`` updates.

    Raises:
        ValueError: if ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    args = get_args()
    toke = tokenizer()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here rather than with a NameError on ``agent`` later on.
        raise ValueError("Unsupported algo: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    # No trace exists before the first step, so start from all-zero tokens.
    tobs = torch.zeros((args.num_processes, trace_size), dtype=torch.long)
    rollouts.obs[0].copy_(obs)
    rollouts.tobs[0].copy_(tobs)

    rollouts.to(device)

    # Statistics are computed over the most recent 10 finished episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):

            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.tobs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            envs.render()

            # Build one token-index sequence per environment.
            tobs = []
            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])
                trace = [x.inst for x in info['trace']][:trace_size]
                word_to_ix = toke.tokenize(trace)
                seq = prepare_sequence(trace, word_to_ix)
                # NOTE(review): a trace shorter than trace_size is replaced
                # by all zeros rather than padded, so its tokens are
                # discarded -- confirm this is intended.
                if len(seq) < trace_size:
                    seq = torch.zeros((trace_size), dtype=torch.long)
                seq = seq[:trace_size]
                tobs.append(seq)
            tobs = torch.stack(tobs)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, tobs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.tobs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # An existing directory is fine; any other OS error (e.g.
            # permissions) should surface instead of being swallowed.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # The message has eight fields; the losses are not printed.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
示例#28
0
def onpolicy_main():
    """Run on-policy (A2C or PPO) training, with TensorBoard logging.

    Reads its configuration from names that are not defined in this
    function (``args``, ``env_kwargs``, ``pretrained_policy_load``,
    ``knob_noisy``, ``eval_log_dir``); these are presumably module-level
    globals -- verify against the rest of the file.

    For 'doorenv' environments the vision settings are read from the wrapped
    environment and, when ``visionnet_input`` is truthy, raw observations
    are converted with ``actor_critic.obs2inputs`` before being stored.
    Otherwise observations are optionally passed through ``add_noise``.

    Losses are written to a ``SummaryWriter`` after every update, the model
    is saved every ``args.save_interval`` updates (and on the last update),
    statistics are printed every ``args.log_interval`` updates and
    ``evaluate`` is called every ``args.eval_interval`` updates.

    Raises:
        ValueError: if ``args.algo`` is neither 'a2c' nor 'ppo'.
    """
    print("onpolicy main")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    is_doorenv = 'doorenv' in args.env_name

    # Ugly way to reach the environment attributes through the wrappers.
    if is_doorenv:
        if args.num_processes > 1:
            door_env = envs.venv.venv
        else:
            door_env = envs.venv.venv.envs[0].env.env.env
        visionnet_input = door_env.visionnet_input
        nn = door_env.nn
        env_name = door_env.xml_path
        # Only the shape of this placeholder is used below.
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    def policy_obs(raw_obs, idx):
        """Convert a raw environment observation into the stored one."""
        if is_doorenv and visionnet_input:
            return actor_critic.obs2inputs(raw_obs, idx)
        if knob_noisy:
            return add_noise(raw_obs, idx)
        return raw_obs

    # disable normalizer
    vec_norm = get_vec_normalize(envs)
    vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    else:
        # Fail here rather than with a NameError on ``agent`` later on.
        raise ValueError("Unsupported algo: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    # The leading action-sized slice of the observation is used as the
    # starting state for position control.
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    obs = policy_obs(full_obs, 0)

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Statistics are computed over the most recent 10 finished episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        # NOTE(review): pos_control is hard-coded to False, so the position
        # control branch below never runs.
        pos_control = False
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                next_action = action

            if pos_control:
                frame_skip = 2
                if step % (512 / frame_skip - 1) == 0:
                    current_state = initial_state
                # The action is applied as an offset to the current state.
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)

                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # convert img to obs if door_env and using visionnet
            obs = policy_obs(full_obs, j)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        # The mean of an empty window is NaN (with a RuntimeWarning), so the
        # reward curve only starts once an episode has finished.
        if episode_rewards:
            writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # An existing directory is fine; any other OS error (e.g.
            # permissions) should surface instead of being swallowed.
            os.makedirs(save_path, exist_ok=True)
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ],
                       os.path.join(
                           save_path, args.env_name +
                           "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # The message has eight fields; the losses are not printed.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = False  # True #Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            print("changing world")

            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )

            full_obs = envs.reset()
            # NOTE(review): the observation computed here is never copied
            # into ``rollouts`` and knob noise is not applied -- confirm
            # before enabling DR.
            if is_doorenv and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
示例#29
0
def train_maml_like_ppo_(
    init_model,
    args,
    learning_rate,
    num_episodes=20,
    num_updates=1,
    vis=False,
    run_idx=0,
    use_linear_lr_decay=False,
):
    """Adapt a copy of ``init_model`` to one sampled task with PPO.

    A fresh vectorised environment is created, a task is sampled with
    ``run_idx`` and set on the underlying navigation environment, and a deep
    copy of ``init_model`` is trained for ``num_updates`` PPO updates of
    ``num_episodes * 100`` steps each.  After every update the policy is
    evaluated, the fitness is printed and the visited paths are plotted with
    ``vis_path``.

    Args:
        init_model: policy to start from; it is deep-copied, not modified.
        args: namespace providing the PPO and return-computation settings.
        learning_rate: learning rate passed to the PPO optimiser.
        num_episodes: rollout length in units of 100 environment steps.
        num_updates: number of PPO updates to run (must be at least 1).
        vis: accepted for interface compatibility; currently not used.
        run_idx: passed to ``raw_env.sample_tasks`` to pick the task.
        use_linear_lr_decay: accepted for interface compatibility; currently
            not used.

    Returns:
        A tuple ``(fitness, reached, None)`` where ``fitness`` is the result
        of the last evaluation and ``reached`` is ``info[0]['reached']`` of
        that evaluation.

    Raises:
        ValueError: if ``num_updates`` is smaller than 1, since there would
            be no evaluation result to return.
    """
    if num_updates < 1:
        raise ValueError("num_updates must be at least 1")

    num_steps = num_episodes * 100

    torch.set_num_threads(1)
    device = torch.device("cpu")

    envs = make_vec_envs(ENV_NAME, seeding.create_seed(None), NUM_PROC,
                         args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors)
    raw_env = navigation_2d.unpeele_navigation_env(envs, 0)

    new_task = raw_env.sample_tasks(run_idx)
    raw_env.reset_task(new_task[0])

    # Train a private copy so the caller's model is left untouched.
    actor_critic = copy.deepcopy(init_model)
    actor_critic.to(device)

    agent = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=learning_rate,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(num_steps, NUM_PROC,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    fitnesses = []

    for j in range(num_updates):
        # (path, goal) pairs and offending entries reported by the first
        # environment for episodes that finished during this update.
        episode_paths = []
        offending = []
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            if done[0]:
                first_info = infos[0]
                episode_paths.append((first_info['path'], first_info['goal']))
                offending.extend(first_info['offending'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Observation statistics exist only when a normalising wrapper does.
        vec_norm = utils.get_vec_normalize(envs)
        ob_rms = vec_norm.ob_rms if vec_norm is not None else None

        fits, eval_info = evaluate(actor_critic, ob_rms, envs, NUM_PROC,
                                   device)
        print(f"fitness {fits} update {j+1}")
        # NOTE(review): eval_info is indexed with 'path' here and with 0 in
        # the return statement -- confirm what ``evaluate`` returns.
        vis_path(episode_paths, eval_path_rec=eval_info['path'],
                 offending=offending)
        fitnesses.append(fits)

    return fitnesses[-1], eval_info[0]['reached'], None
示例#30
0
def inner_loop_ppo(
    weights,
    args,
    learning_rate,
    num_steps,
    num_updates,
    run_idx,
    input_envs,
):
    """Train a PPO policy initialised from ``weights`` and return its last fitness.

    A new actor-critic is created for ``input_envs`` via ``init_ppo``, loaded
    with ``weights`` through ``apply_from_list``, and then trained for
    ``num_updates`` PPO updates of ``num_steps`` environment steps each.
    After every update the policy is evaluated with ``evaluate`` and the
    resulting fitness is recorded.

    During rollout collection the ``'cost'`` entry reported in each
    environment's info dict is subtracted from that environment's reward
    before the transition is stored.

    Args:
        weights: Parameter list handed to ``apply_from_list`` to initialise
            the actor-critic.
        args: Hyper-parameter namespace; this function reads ``init_sigma``,
            ``clip_param``, ``ppo_epoch``, ``num_mini_batch``,
            ``value_loss_coef``, ``entropy_coef``, ``eps``, ``max_grad_norm``,
            ``use_gae``, ``gamma``, ``gae_lambda`` and
            ``use_proper_time_limits``.
        learning_rate: Learning rate passed to ``algo.PPO`` as ``lr``.
        num_steps: Number of environment steps collected per update.
        num_updates: Number of PPO updates (and evaluations) to run.
        run_idx: Not used by this function; kept so the signature stays
            compatible with existing callers.
        input_envs: Environment object providing ``reset()``, ``step()``,
            ``observation_space`` and ``action_space``.  ``step()`` must
            return per-process ``done`` and ``infos`` sequences, and every
            info dict must contain a ``'cost'`` entry.

    Returns:
        A 3-tuple ``(fitness, 0, 0)`` where ``fitness`` is the value returned
        by the final call to ``evaluate``; the last two elements are always
        the constant ``0``.

    Raises:
        IndexError: If ``num_updates`` is 0, because no evaluation has been
            recorded.
    """
    torch.set_num_threads(1)
    device = torch.device("cpu")

    actor_critic = init_ppo(input_envs, log(args.init_sigma))
    actor_critic.to(device)

    # apply the weights to the model
    apply_from_list(weights, actor_critic)

    agent = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=learning_rate,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(num_steps, NUM_PROC,
                              input_envs.observation_space.shape, input_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = input_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    fitnesses = []

    for _update in range(num_updates):
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, (final_action, _unused) = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs.  The environment is stepped with
            # `final_action`, while `action` is what gets stored in the rollout.
            obs, reward, done, infos = input_envs.step(final_action)

            # Charge each environment only for its own cost.  Subtracting the
            # scalar from the whole reward batch would broadcast it onto every
            # process, so each one would pay the sum of all costs.
            # NOTE(review): assumes `reward` is indexed by process along its
            # first axis, like `done` and `infos` -- confirm against the env
            # wrapper.
            total_reward = reward
            for env_idx, env_info in enumerate(infos):
                total_reward[env_idx] -= env_info['cost']

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in env_info else [1.0]
                 for env_info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, total_reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # The loss statistics returned by the update are not used here.
        agent.update(rollouts)

        rollouts.after_update()

        ob_rms = utils.get_vec_normalize(input_envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms

        fits, _eval_info = evaluate(actor_critic, ob_rms, input_envs, NUM_PROC, device)
        fitnesses.append(fits)

    return (fitnesses[-1]), 0, 0