Example #1
def eval_fn(load_path, args, env_name='fruitbot', distribution_mode='easy', num_levels=500, start_level=500, log_dir='./tmp/procgen', comm=None, num_trials=3, gui=False):

    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True
    vf_coef = 0.5
    max_grad_norm = 0.5

    mpi_rank_weight = 1
    log_interval = 1
    seed=None

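    # comm must be a live MPI communicator (e.g. MPI.COMM_WORLD); the default of None would
    # fail on the Split below. Split(0, 0) puts every rank into a single logging group, and
    # only rank 0 of that group writes csv/stdout output.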
    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=1, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    logger.info(f"evaluating")

    set_global_seeds(seed)

    policy = build_policy(venv, conv_fn)

    # Get the number of envs
    nenvs = venv.num_envs
    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    from .alternate_ppo2.model import Model
    model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)

    if os.path.isfile(load_path):
        alt_ppo2.eval(
            network=conv_fn,
            nsteps=nsteps,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            gamma=gamma,
            lam=lam,
            log_interval=log_interval,
            nminibatches=nminibatches,
            noptepochs=ppo_epochs,
            load_path=load_path,
            mpi_rank_weight=mpi_rank_weight,
            comm=comm,
            clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            policy=policy,
            nenvs=nenvs,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch=nbatch,
            nbatch_train=nbatch_train,
            model_fn=model_fn,
            model=model,
            num_trials=num_trials,
            num_levels=num_levels,
            start_level=start_level,
            gui=gui,
            args=args
        )
    elif os.path.isdir(load_path):
        for file in os.listdir(load_path):
            log_comm = comm.Split(0, 0)
            format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
            logger.configure(comm=log_comm, dir=log_dir+'/'+file, format_strs=format_strs)
            alt_ppo2.eval(
                network=conv_fn,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                gamma=gamma,
                lam=lam,
                log_interval=log_interval,
                nminibatches=nminibatches,
                noptepochs=ppo_epochs,
                load_path=load_path+'/'+file,
                mpi_rank_weight=mpi_rank_weight,
                comm=comm,
                clip_vf=use_vf_clipping,
                lr=learning_rate,
                cliprange=clip_range,
                policy=policy,
                nenvs=nenvs,
                ob_space=ob_space,
                ac_space=ac_space,
                nbatch=nbatch,
                nbatch_train=nbatch_train,
                model_fn=model_fn,
                model=model,
                num_trials=num_trials,
                num_levels=num_levels,
                start_level=start_level,
                gui=gui,
                args=args
            )
    else:
        print('Model path does not exist.')
    return
Example #2
def train(comm=None, *, save_dir=None, **kwargs):
    """
    Train a model using Baselines' PPO2, and save a checkpoint file in the
    required format.

    There is one required kwarg: either env_name (for env_kind="procgen") or
    env_id (for env_kind="atari").

    Models for the paper were trained with 16 parallel MPI workers.

    Note: this code has not been well-tested.
    """
    kwargs.setdefault("env_kind", "procgen")
    kwargs.setdefault("num_envs", 64)
    kwargs.setdefault("learning_rate", 5e-4)
    kwargs.setdefault("entropy_coeff", 0.01)
    kwargs.setdefault("gamma", 0.999)
    kwargs.setdefault("lambda", 0.95)
    kwargs.setdefault("num_steps", 256)
    kwargs.setdefault("num_minibatches", 8)
    kwargs.setdefault("library", "baselines")
    kwargs.setdefault("save_all", False)
    kwargs.setdefault("ppo_epochs", 3)
    kwargs.setdefault("clip_range", 0.2)
    kwargs.setdefault("timesteps_per_proc", 1_000_000_000)
    kwargs.setdefault("cnn", "clear")
    kwargs.setdefault("use_lstm", 0)
    kwargs.setdefault("stack_channels", "16_32_32")
    kwargs.setdefault("emb_size", 256)
    kwargs.setdefault("epsilon_greedy", 0.0)
    kwargs.setdefault("reward_scale", 1.0)
    kwargs.setdefault("frame_stack", 1)
    kwargs.setdefault("use_sticky_actions", 0)
    kwargs.setdefault("clip_vf", 1)
    kwargs.setdefault("reward_processing", "none")
    kwargs.setdefault("save_interval", 10)

    if comm is None:
        comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    setup_mpi_gpus()

    if save_dir is None:
        save_dir = tempfile.mkdtemp(prefix="rl_clarity_train_")

    create_env_kwargs = kwargs.copy()
    num_envs = create_env_kwargs.pop("num_envs")
    venv = create_env(num_envs, **create_env_kwargs)

    library = kwargs["library"]
    if library == "baselines":
        reward_processing = kwargs["reward_processing"]
        if reward_processing == "none":
            pass
        elif reward_processing == "clip":
            venv = VecClipReward(venv=venv)
        elif reward_processing == "normalize":
            venv = VecNormalize(venv=venv, ob=False, per_env=False)
        else:
            raise ValueError(f"Unsupported reward processing: {reward_processing}")

        scope = "ppo2_model"

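        # Checkpoint callback: baselines' ppo2.learn calls update_fn(update) after every
        # optimization update; only rank 0 writes parameters to disk.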
        def update_fn(update, params=None):
            if rank == 0:
                save_interval = kwargs["save_interval"]
                if save_interval > 0 and update % save_interval == 0:
                    print("Saving...")
                    params = get_tf_params(scope)
                    save_path = save_data(
                        save_dir=save_dir,
                        args_dict=kwargs,
                        params=params,
                        step=(update if kwargs["save_all"] else None),
                    )
                    print(f"Saved to: {save_path}")

        sess = create_tf_session()
        sess.__enter__()

        if kwargs["use_lstm"]:
            raise ValueError("Recurrent networks not yet supported.")
        arch = get_arch(**kwargs)

        from baselines.ppo2 import ppo2

        ppo2.learn(
            env=venv,
            network=arch,
            total_timesteps=kwargs["timesteps_per_proc"],
            save_interval=0,
            nsteps=kwargs["num_steps"],
            nminibatches=kwargs["num_minibatches"],
            lam=kwargs["lambda"],
            gamma=kwargs["gamma"],
            noptepochs=kwargs["ppo_epochs"],
            log_interval=1,
            ent_coef=kwargs["entropy_coeff"],
            mpi_rank_weight=1.0,
            clip_vf=bool(kwargs["clip_vf"]),
            comm=comm,
            lr=kwargs["learning_rate"],
            cliprange=kwargs["clip_range"],
            update_fn=update_fn,
            init_fn=None,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
    else:
        raise ValueError(f"Unsupported library: {library}")

    return save_dir
Example #3
def main():

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=99)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=5)
    parser.add_argument('--load_id', type=int, default=int(-1))
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    parser.add_argument('--test', default=False, action="store_true")
    parser.add_argument('--use_model',
                        type=int,
                        default=1,
                        help="either model #1 or #2")
    parser.add_argument('--train_level', type=int, default=50)

    args = parser.parse_args()
    #timesteps_per_proc
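    # num_envs, nsteps, TIMESTEPS_PER_PROC and SAVE_PATH are presumably module-level
    # constants defined outside this snippet.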
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = TIMESTEPS_PER_PROC  ## use global 20_000_000 if not specified in args!
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)

    run_ID = 'run_' + str(args.run_id).zfill(2)
    if args.test:
        args.log_interval = 1
        args.total_tsteps = 1_000_000
        run_ID += '_test{}_model{}'.format(args.load_id, args.use_model)

    load_path = None
    if args.load_id > -1:
        load_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble_v{}.tar'.format(args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

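    # Test workers train on the full (unrestricted) level distribution and are given zero
    # weight when gradients are averaged across MPI ranks.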
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    if args.test:
        logpath = join('log2/ensemble', args.env_name, 'test', run_ID)
    else:
        logpath = join('log2/ensemble', args.env_name, 'train', run_ID)
        save_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble_v{}.tar'.format(args.run_id))
        logger.info("\n Model will be saved to file {}".format(save_path))

    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("creating tf session")
    setup_mpi_gpus()

    if not args.test:
        config = tf.compat.v1.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True)  # device_count={'GPU':0}
        config.gpu_options.allow_growth = True  #pylint: disable=E1101
        sess = tf.compat.v1.Session(config=config)
        logger.info("creating environment")
        venv = ProcgenEnv(num_envs=num_envs,
                          env_name=args.env_name,
                          num_levels=num_levels,
                          start_level=args.start_level,
                          distribution_mode=args.distribution_mode)
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(
            venv=venv,
            filename=None,
            keep_buf=100,
        )
        venv = VecNormalize(venv=venv, ob=False)
        train(run_ID, save_path, load_path, venv, sess, logger, args)
    else:
        use_model = args.use_model  ## 1 or 2
        alt_flag = use_model - 1
        test_all(alt_flag, load_path, logger, args)
Example #4
def main():
    num_envs = 64  # 16?
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=10)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=int(-1))
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--arch', type=str, default="impala")
    parser.add_argument('--no_bn', dest='use_batch_norm', action='store_false')
    parser.add_argument('--dropout', type=float, default=0)
    parser.add_argument('--netrand', type=float, default=0)
    parser.set_defaults(use_batch_norm=True)

    args = parser.parse_args()
    arch = args.arch
    dropout = args.dropout
    use_batch_norm = args.use_batch_norm
    netrand = args.netrand

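    # One PPO update consumes num_envs * nsteps transitions per process, so --nupdates maps
    # directly to a per-process timestep budget.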
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  ## use global 20_000_000 if not specified in args!

    run_ID = 'run_' + str(args.run_id).zfill(2)
    agent_str = args.use
    LOG_DIR = join("log", agent_str, "train")
    save_model = join("log", agent_str,
                      "saved_{}_v{}.tar".format(agent_str, args.run_id))
    load_path = None
    if args.load_id > -1:
        load_path = join("log", agent_str,
                         "saved_{}_v{}.tar".format(agent_str, args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    logger.info("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)

    logger.info(venv.observation_space)
    logger.info("training")
    with sess.as_default():
        model = learn(
            agent_str=agent_str,
            use_netrand=netrand,
            sess=sess,
            env=venv,
            network=None,
            total_timesteps=args.total_tsteps,
            save_interval=1000,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=args.log_interval,
            ent_coef=ent_coef,
            lr=learning_rate,
            arch=arch,
            use_batch_norm=use_batch_norm,
            dropout=dropout,
            cliprange=clip_range,
            save_path=save_model,
            load_path=load_path,
            vf_coef=0.5,
            max_grad_norm=0.5,
            clip_vf=use_vf_clipping,
            update_fn=None,
            init_fn=None,
            comm=comm,
        )
        model.save(save_model)
Example #5
def train(args):
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name, args.seed)

    venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \
        num_levels=args.num_levels, start_level=args.start_level, \
        distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    envs = VecPyTorchProcgen(venv, device)
    
    obs_shape = envs.observation_space.shape
    actor_critic = Policy(
        obs_shape,
        envs.action_space.n,
        base_kwargs={'recurrent': False, 'hidden_size': args.hidden_size})        
    actor_critic.to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                envs.observation_space.shape, envs.action_space,
                                actor_critic.recurrent_hidden_state_size,
                                aug_type=args.aug_type, split_ratio=args.split_ratio)
        
    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

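    # Select the agent variant: UCB-DrAC picks an augmentation each update with a UCB bandit,
    # Meta-DrAC meta-learns an augmentation network (AugCNN), RL2-DrAC selects augmentations
    # with a recurrent policy, and plain DrAC applies the single fixed --aug_type.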
    if args.use_ucb:
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) 
            for t in list(aug_to_func.keys())]

        agent = algo.UCBDrAC(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_list=aug_list,
            aug_id=aug_id,
            aug_coef=args.aug_coef,
            num_aug_types=len(list(aug_to_func.keys())),
            ucb_exploration_coef=args.ucb_exploration_coef,
            ucb_window_length=args.ucb_window_length)

    elif args.use_meta_learning: 
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) \
            for t in list(aug_to_func.keys())]

        aug_model = AugCNN()
        aug_model.to(device) 

        agent = algo.MetaDrAC(
            actor_critic,
            aug_model,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            meta_grad_clip=args.meta_grad_clip,
            meta_num_train_steps=args.meta_num_train_steps,
            meta_num_test_steps=args.meta_num_test_steps,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_id=aug_id,
            aug_coef=args.aug_coef)

    elif args.use_rl2: 
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) 
            for t in list(aug_to_func.keys())]

        rl2_obs_shape = [envs.action_space.n + 1]
        rl2_learner = Policy(
            rl2_obs_shape,
            len(list(aug_to_func.keys())),
            base_kwargs={'recurrent': True, 'hidden_size': args.rl2_hidden_size})    
        rl2_learner.to(device)
 
        agent = algo.RL2DrAC(
                actor_critic,
                rl2_learner,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                args.rl2_entropy_coef,
                lr=args.lr,
                eps=args.eps,
                rl2_lr=args.rl2_lr,
                rl2_eps=args.rl2_eps,
                max_grad_norm=args.max_grad_norm,
                aug_list=aug_list,
                aug_id=aug_id,
                aug_coef=args.aug_coef,
                num_aug_types=len(list(aug_to_func.keys())), 
                recurrent_hidden_size=args.rl2_hidden_size, 
                num_actions=envs.action_space.n, 
                device=device)

    else:
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        agent = algo.DrAC(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_id=aug_id,
            aug_func=aug_func,
            aug_coef=args.aug_coef,
            env_name=args.env_name)

    checkpoint_path = os.path.join(args.save_dir, "agent" + log_file + ".pt")
    if os.path.exists(checkpoint_path) and args.preempt:
        checkpoint = torch.load(checkpoint_path)
        agent.actor_critic.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        init_epoch = checkpoint['epoch'] + 1
        logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout'], log_suffix=log_file + "-e%s" % init_epoch)
    else:
        init_epoch = 0
        logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout'], log_suffix=log_file)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

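    # Each update collects num_processes * num_steps transitions, so this is the total
    # number of PPO updates for the requested env-step budget.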
    for j in range(init_epoch, num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = aug_id(rollouts.obs[step])
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            obs_id = aug_id(rollouts.obs[-1])
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
            
        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        if args.use_ucb and j > 0:
            agent.update_ucb_values(rollouts)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)    
        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        # Log training statistics every log_interval updates (the checkpoint itself is saved further below)
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print("\nUpdate {}, step {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}"
                .format(j, total_num_steps,
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards)))
            
            logger.logkv("train/nupdates", j)
            logger.logkv("train/total_num_steps", total_num_steps)            

            logger.logkv("losses/dist_entropy", dist_entropy)
            logger.logkv("losses/value_loss", value_loss)
            logger.logkv("losses/action_loss", action_loss)

            logger.logkv("train/mean_episode_reward", np.mean(episode_rewards))
            logger.logkv("train/median_episode_reward", np.median(episode_rewards))

            ### Eval on the Full Distribution of Levels ###
            eval_episode_rewards = evaluate(args, actor_critic, device, aug_id=aug_id)

            logger.logkv("test/mean_episode_reward", np.mean(eval_episode_rewards))
            logger.logkv("test/median_episode_reward", np.median(eval_episode_rewards))

            logger.dumpkvs()

        # Save Model
        if (j > 0 and j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass
            
            torch.save({
                    'epoch': j,
                    'model_state_dict': agent.actor_critic.state_dict(),
                    'optimizer_state_dict': agent.optimizer.state_dict(),
            }, os.path.join(args.save_dir, "agent" + log_file + ".pt")) 
Example #6
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=500)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width',
                        type=str,
                        default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--timesteps_per_proc', type=int, default=1_000_000)
    parser.add_argument('--level_sampler_strategy',
                        type=str,
                        default='value_l1')
    parser.add_argument('--score_transform', type=str, default='rank')
    parser.add_argument('--save_dir',
                        type=str,
                        default='gdrive/MyDrive/182 Project/')
    args = parser.parse_args()

    timesteps_per_proc = args.timesteps_per_proc
    log_dir = args.save_dir

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level
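    # num_levels=0 makes Procgen sample from the full level distribution, so the "oracle"
    # setup trains directly on the (unbounded) test distribution.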

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'
                   ] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=log_dir +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(x,
                                         depths=depths,
                                         use_bn=args.use_bn,
                                         randcnn=args.use_rand_conv and
                                         not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        num_levels=num_levels,
        start_level=start_level,
        eval_env=eval_env,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        level_sampler_strategy=args.level_sampler_strategy,
        score_transform=args.score_transform,
        model_fn=get_mixreg_model(mix_mode=args.mix_mode,
                                  mix_alpha=args.mix_alpha,
                                  use_l2reg=args.use_l2reg,
                                  l2reg_coeff=args.l2reg_coeff),
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
Example #7
def build_env(args, extra_args):
    if 'Lock-v0' in args.env:
        import Environments
        env = gym.make('Lock-v0')
        ep_dict = {
            'horizon': args.horizon,
            'dimension': 5,
            'switch': 0.1,
            'tabular': False
        }

        env.init(env_config=ep_dict)
        return env
    elif 'diabcombolock-v0' in args.env:
        env = build_env_homer(args, extra_args)
        return env

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id,
                           env_type,
                           args.num_env or 1,
                           seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations)

        if env_type == 'mujoco':
            env = VecNormalize(env, use_tf=True)

    return env
Example #8
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args)
    env_thunk = lambda x: x
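    # When --constraints are given, env_thunk wraps each env in a constraint.ConstraintEnv
    # built from (constraint, reward) pairs, plus a StepMonitor that logs to the logger directory.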
    if args.constraints is not None:
        assert len(args.constraints) == len(args.rewards)
        constraints = [
            constraint.CONSTRAINT_DICT[s](r)
            for (s, r) in zip(args.constraints, args.rewards)
        ]
        env_thunk = lambda env: constraint.StepMonitor(
            constraint.ConstraintEnv(env,
                                     constraints,
                                     augmentation_type=args.augmentation,
                                     log_dir=logger.get_dir()), logger.get_dir(
                                     ))

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
            env = env_thunk(env)
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
            env = env_thunk(env)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale,
                               constraint_env_thunk=env_thunk)
            env = VecFrameStack(env, frame_stack_size)

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id,
                           env_type,
                           args.num_env or 1,
                           seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations,
                           constraint_env_thunk=env_thunk)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
Example #9
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 100_000  ## now this counts steps in testing runs
    use_vf_clipping = True

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    ## default start_level set to 50 to test on unseen levels!
    parser.add_argument('--start_level', type=int, default=50)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch
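    # Each runner.run() call collects nenvs * nsteps transitions, so nrollouts is the number
    # of evaluation batches needed to cover total_timesteps.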

    policy = RandomCnnPolicy
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    model.load(LOAD_PATH)
    logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time() ## Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        clean_flag = 0  ## since we are testing, ENABLE randomization
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            clean_flag)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Example #10
def train_fn(env_name,
             num_envs,
             distribution_mode,
             num_levels,
             start_level,
             timesteps_per_proc,
             args,
             is_test_worker=False,
             log_dir='./tmp/procgen',
             comm=None,
             alternate_ppo=False,
             do_eval=False,
             eval_num_envs=None,
             eval_env_name=None,
             eval_num_levels=None,
             eval_start_level=None,
             eval_distribution_mode=None,
             do_test=False,
             test_num_envs=None,
             test_env_name=None,
             test_num_levels=None,
             test_start_level=None,
             test_distribution_mode=None):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    eval_env = None
    if do_eval:
        eval_env = ProcgenEnv(num_envs=eval_num_envs,
                              env_name=eval_env_name,
                              num_levels=eval_num_levels,
                              start_level=eval_start_level,
                              distribution_mode=eval_distribution_mode)
        eval_env = VecExtractDictObs(eval_env, "rgb")

        eval_env = VecMonitor(
            venv=eval_env,
            filename=None,
            keep_buf=100,
        )

        eval_env = VecNormalize(venv=eval_env, ob=False)

    test_env = None
    if do_test:
        test_env = ProcgenEnv(num_envs=test_num_envs,
                              env_name=test_env_name,
                              num_levels=test_num_levels,
                              start_level=test_start_level,
                              distribution_mode=test_distribution_mode)
        test_env = VecExtractDictObs(test_env, "rgb")

        test_env = VecMonitor(
            venv=test_env,
            filename=None,
            keep_buf=100,
        )

        test_env = VecNormalize(venv=test_env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    if alternate_ppo:
        alt_ppo2.learn(env=venv,
                       eval_env=eval_env,
                       test_env=test_env,
                       network=conv_fn,
                       total_timesteps=timesteps_per_proc,
                       save_interval=1,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       args=args,
                       load_path=args.resume_path)
    else:
        ppo2.learn(env=venv,
                   eval_env=eval_env,
                   network=conv_fn,
                   total_timesteps=timesteps_per_proc,
                   save_interval=1,
                   nsteps=nsteps,
                   nminibatches=nminibatches,
                   lam=lam,
                   gamma=gamma,
                   noptepochs=ppo_epochs,
                   log_interval=1,
                   ent_coef=ent_coef,
                   mpi_rank_weight=mpi_rank_weight,
                   clip_vf=use_vf_clipping,
                   comm=comm,
                   lr=learning_rate,
                   cliprange=clip_range,
                   update_fn=None,
                   init_fn=None,
                   vf_coef=0.5,
                   max_grad_norm=0.5,
                   args=args)
Example #11
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=99)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=20)
    parser.add_argument('--load_id', type=int, default=int(-1))

    args = parser.parse_args()
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  ## use the default 20_000_000 if not specified in args!

    run_ID = 'run_' + str(args.run_id).zfill(2)

    save_model = join(SAVE_PATH, "saved_recenter_v{}.tar".format(args.run_id))
    load_path = None
    if args.load_id > -1:
        load_path = 'log/recenter/recenter_v{}.tar'.format(args.load_id)
    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("\n Saving model to file {}".format(save_model))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto(
        log_device_placement=True)  # device_count={'GPU':0}
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    # sess.__enter__()

    logger.info(venv.observation_space)
    logger.info("training")
    with sess.as_default():
        model = recenter_ppo.learn(
            sess=sess,
            env=venv,
            network=None,
            total_timesteps=args.total_tsteps,
            save_interval=2,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=args.log_interval,
            ent_coef=ent_coef,
            # clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            # update_fn=None,
            # init_fn=None,
            save_path=save_model,
            load_path=load_path,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
        model.save(save_model)
Example #12
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=500)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
Example #13
def learn(*,
          network,
          total_timesteps,
          num_levels=50,
          start_level=500,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          num_processes=64,
          num_steps=256,
          level_replay_temperature=0.1,
          level_replay_rho=1.0,
          level_replay_nu=0.5,
          level_replay_alpha=1.0,
          staleness_coef=0.1,
          staleness_temperature=1.0,
          level_sampler_strategy='value_l1',
          score_transform='rank',
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from
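
    num_levels: int                   number of Procgen level seeds made available to the level sampler

    start_level: int                  first level seed; seeds start_level .. start_level + num_levels - 1 are used

    num_processes: int                number of parallel Procgen environments

    level_replay_* / staleness_* /
    level_sampler_strategy /
    score_transform:                  settings forwarded to the LevelSampler (see level_sampler_args in the body)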

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
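    # Illustrative note: lr and cliprange accept either a constant or a
    # schedule; e.g. a linearly decaying learning rate could be passed as
    #     lr=lambda frac: 5e-4 * frac
    # where frac anneals from 1.0 to 0.0 over training (see the update loop
    # below, which calls lr(frac) and cliprange(frac)).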

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    level_sampler_args = dict(num_actors=num_processes,
                              strategy=level_sampler_strategy,
                              replay_schedule='proportionate',
                              score_transform=score_transform,
                              temperature=level_replay_temperature,
                              rho=level_replay_rho,
                              nu=level_replay_nu,
                              alpha=level_replay_alpha,
                              staleness_coef=staleness_coef,
                              staleness_transform='power',
                              staleness_temperature=staleness_temperature)
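    # These options configure the LevelSampler constructed below: the level
    # scoring strategy ('value_l1'), the score transform ('rank'), sampling
    # temperatures, and staleness-related mixing coefficients.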

    env = ProcgenEnv(num_envs=num_processes, env_name='fruitbot',
                     num_levels=1, start_level=start_level,
                     distribution_mode='easy',
                     paint_vel_info=False)
    env = VecExtractDictObs(env, "rgb")
    env = VecMonitor(venv=env, filename=None, keep_buf=100)
    env = VecNormalize(venv=env, ob=False, ret=True)

    seeds = [start_level + i for i in range(num_levels)]

    level_sampler = LevelSampler(seeds, env.observation_space,
                                 env.action_space, **level_sampler_args)

    env = VecProcgen(env, level_sampler=level_sampler)

    rollouts = RolloutStorage(num_steps, num_processes,
                              env.observation_space.shape, env.action_space)

    level_seeds = np.zeros(num_processes)
    obs, level_seeds = env.reset()
    level_seeds = level_seeds.reshape(-1, 1)
    rollouts.obs[0] = obs
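    # level_seeds records the level each parallel env is currently playing;
    # it is passed to runner.run() below, and the sampler is later updated
    # from the collected rollouts.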

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    rollouts=rollouts)
    if eval_env is not None:
        eval_runner = EvalRunner(env=eval_env,
                                 model=model,
                                 nsteps=nsteps,
                                 gamma=gamma,
                                 lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            level_seeds=level_seeds)  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, \
                eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Update level sampler
        level_sampler.update_with_rollouts(rollouts)

        rollouts.after_update()
        level_sampler.after_update()

        # For each minibatch, compute the loss values and collect them.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
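            # flatinds[i] holds the flat batch indices of env i's trajectory,
            # so each recurrent minibatch keeps whole-environment sequences
            # together (timesteps of an env stay contiguous).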
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev close to 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if (save_interval and (update % save_interval == 0 or update == 1)
                and logger.get_dir() and is_mpi_root):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
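
# Minimal usage sketch (illustrative only, not part of the original snippet):
# it assumes the module-level imports used by learn() above (build_impala_cnn,
# ProcgenEnv, LevelSampler, ...) are available, and the hyperparameters simply
# mirror those used elsewhere in these examples.
if __name__ == '__main__':
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    learn(network=conv_fn,
          total_timesteps=5000000,
          num_levels=50,
          start_level=500,
          nsteps=256,
          nminibatches=8,
          noptepochs=3,
          lr=5e-4,
          cliprange=0.2,
          gamma=0.999,
          lam=0.95,
          num_processes=64)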
Example #14
def main():

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot', help='env to run on from procgen')
    parser.add_argument('--num_envs', type=int, default=64, help='number of environments run simultaneously')
    parser.add_argument('--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"], help='level difficulty')
    parser.add_argument('--num_levels', type=int, default=0, help='number of levels to train/test on')
    parser.add_argument('--start_level', type=int, default=0, help='start level (used to avoid testing on seen levels)')
    parser.add_argument('--num_timesteps', type=int, default=0, help='number of timesteps total to train/test on')
    parser.add_argument('--save_frequency', type=int, default=0, help='checkpoint frequency')
    parser.add_argument('--model_loc', type=str, default=None, help='location of pretrained model')
    parser.add_argument('--results_loc', type=str, default=None, help='location of where to save current model/logs')

    parser.add_argument('--eval', action='store_true', help='if set, does not update the model')
    parser.add_argument('--data_aug', type=str, default='normal', help='whether to apply data augmentation')
    parser.add_argument('--gray_p', type=float, default=0.8, help='p value for grayscale data augmentation')

    parser.add_argument('--value_fn', type=str, default='fc', choices=['fc', 'gmm', 'lbmdp'], help='value function for ppo2 critic')
    parser.add_argument('--cnn_fn', type=str, default='impala_cnn', choices=['impala_cnn', 'nature_cnn', 'impala_cnn_lstm', 'lstm'], help='cnn for featurization')
    parser.add_argument('--entropy_fn', type=str, default='constant', choices=['constant', 'scaled'], help='function for entropy loss coefficient')


    parser.add_argument('--ent_coef', type=float, default=0.01, help='coefficient applied to entropy loss')
    parser.add_argument('--ent_scalar', type=float, default=1, help='coefficient applied within sigmoid to scaled entropy coefficient')
    parser.add_argument('--seed', type=int, default=None, help='seed for tensorflow')
    parser.add_argument('--gamma', type=float, default=0.999, help='discount factor')
    parser.add_argument('--lam', type=float, default=0.95, help='advantage discount factor')
    parser.add_argument('--lr',  type=float, default=5e-4, help='learning rate for Adam')
    parser.add_argument('--imp_h1', type=int, default=16, help='impala cnn first layer depth')
    parser.add_argument('--imp_h2', type=int, default=64, help='impala cnn second layer depth')
    parser.add_argument('--imp_h3', type=int, default=64, help='impala cnn third layer depth')


    args = parser.parse_args()

    logger.configure(dir=args.results_loc, format_strs=['csv', 'stdout'])
    logger.info("Creating Environment")
    venv = ProcgenEnv(num_envs=args.num_envs, env_name=args.env_name, num_levels=args.num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, 'rgb')
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("Creating Tensorflow Session")
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    sess.__enter__()

    if args.cnn_fn == 'impala_cnn':
        conv_fn = lambda x: build_impala_cnn(x, depths=[args.imp_h1,args.imp_h2,args.imp_h3], emb_size=256)
    elif args.cnn_fn == 'nature_cnn':
        conv_fn = lambda x: nature_cnn(x)
    elif args.cnn_fn == 'impala_cnn_lstm':
        conv_fn = impala_cnn_lstm()
    elif args.cnn_fn == 'lstm':
        conv_fn = lstm()
    else:
        conv_fn = mlp()

    logger.info("Training")
    learn(
        network=conv_fn,
        env=venv,
        total_timesteps=args.num_timesteps,
        eval_env=None,
        seed=args.seed,
        nsteps=256,
        ent_coef=args.ent_coef,
        lr=args.lr,
        vf_coef=0.5,
        max_grad_norm=0.5,
        gamma=args.gamma,
        lam=args.lam,
        log_interval=args.save_frequency,
        nminibatches=4,
        noptepochs=3,
        cliprange=0.2,
        save_interval=0,
        load_path=args.model_loc,
        data_aug=args.data_aug,
        args=args,
    )
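
# Illustrative entry point (an assumption, not shown in this snippet): invoke
# main() when the file is executed as a script.
if __name__ == '__main__':
    main()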