Example #1
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=8,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be less than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
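    # Hedged usage sketch (not from the original source): assuming a Gym env wrapped
    # in baselines' DummyVecEnv, a minimal call might look like:
    #
    #   from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    #   import gym
    #   venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    #   model = learn(network='mlp', env=venv, total_timesteps=100000,
    #                 lr=lambda f: f * 3e-4,   # linear schedule, f goes 1 -> 0
    #                 nsteps=8, nminibatches=4)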
    print(
        "PPO2 is running ****************************************************************"
    )

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)
    print("Type of Policy in ppo2.py {}".format((policy)))

    # Number of environments (env.num_envs is bypassed and hard-coded to 1 here)
    #nenvs = env.num_envs
    nenvs = 1

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
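    # For example, with the hard-coded nenvs = 1 and the defaults nsteps = 8, nminibatches = 4:
    # nbatch = 8 transitions per update, split into 4 minibatches of nbatch_train = 2.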
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model
    print("Making model")
    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    print(
        "Initializing runner ===============================================")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    print("Runner initialized -----------------------------------------------")
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)
        print("Eval runner is called")

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()
    print("Number of timesteps {}".format(total_timesteps))
    print("Number of batches {}".format(nbatch))

    nupdates = total_timesteps // nbatch
    print("Number of updates {}".format(nupdates))
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
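        # Example: halfway through training frac = 0.5, so the default constant
        # lr = 3e-4 yields lrnow = 3e-4, while a schedule such as lr = lambda f: f * 3e-4
        # would yield lrnow = 1.5e-4 (cliprange is annealed the same way).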

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')
        print("Inside the for loop ----------------")

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                print("Number of batches is {} and number of nbatch train {}".
                      format(nbatch, nbatch_train))
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    print("Trainig the policy")
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
                    print("Policy Trained")
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
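            # flatinds[i] holds the indices of env i's nsteps consecutive transitions,
            # e.g. nenvs = 4, nsteps = 3 gives rows [0,1,2], [3,4,5], [6,7,8], [9,10,11],
            # so shuffling whole envs keeps each recurrent sequence contiguous.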
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
Example #2
def learn(*,
          network,
          env,
          reward_giver,
          expert_dataset,
          g_step,
          d_step,
          d_stepsize=3e-4,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          **network_kwargs):

    # from PPO learn
    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # nenvs = env.num_envs
    nenvs = 1

    ob_space = env.observation_space
    ac_space = env.action_space

    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    reward_giver=reward_giver)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch

    # from TRPO MPI
    nworkers = MPI.COMM_WORLD.Get_size()

    ob = model.act_model.X
    ac = model.A

    d_adam = MpiAdam(reward_giver.get_trainable_variables())

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out
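    # allmean averages an array across all MPI workers: e.g. with 4 workers each holding
    # a local discriminator gradient g, allmean(g) is their element-wise mean, so every
    # rank applies the same averaged update below.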

    # from PPO
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            if update % log_interval == 0 and is_mpi_root:
                logger.info('Stepping environment...')

            obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            )
            if eval_env is not None:
                eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
                )

            if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

            epinfobuf.extend(epinfos)
            if eval_env is not None:
                eval_epinfobuf.extend(eval_epinfos)

            mblossvals = []
            if states is None:
                inds = np.arange(nbatch)
                for _ in range(noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, nbatch, nbatch_train):
                        end = start + nbatch_train
                        mbinds = inds[start:end]
                        slices = (arr[mbinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mblossvals.append(
                            model.train(lrnow, cliprangenow, *slices))
            else:
                assert False  # the recurrent path is intentionally disabled so any bugs cannot come from it
                assert nenvs % nminibatches == 0
                envsperbatch = nenvs // nminibatches
                envinds = np.arange(nenvs)
                flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
                for _ in range(noptepochs):
                    np.random.shuffle(envinds)
                    for start in range(0, nenvs, envsperbatch):
                        end = start + envsperbatch
                        mbenvinds = envinds[start:end]
                        mbflatinds = flatinds[mbenvinds].ravel()
                        slices = (arr[mbflatinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mbstates = states[mbenvinds]
                        mblossvals.append(
                            model.train(lrnow, cliprangenow, *slices,
                                        mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.perf_counter()
        fps = int(nbatch / (tnow - tstart))

        # TRPO MPI
        logger.log("Optimizing Disciminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(obs))
        batch_size = len(obs) // d_step
        d_losses = []
        for ob_batch, ac_batch in dataset.iterbatches(
            (obs, actions),
                include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            if hasattr(reward_giver, "obs_rms"):
                reward_giver.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch,
                                                     ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)

        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv("eprewmean",
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv("eplenmean",
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    "eval_eprewmean",
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    "eval_eplenmean",
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv("misc/time_elapsed", tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv("loss/" + lossname, lossval)

            logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print("Saving to", savepath)
            model.save(savepath)

    return model
Example #3
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, save_path=None,load_path=None):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)

    time_string = int(time.time())
    if save_interval and save_path:
        import cloudpickle
        with open(osp.join(save_path, 'make_model_{}.pkl'.format(time_string)), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
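        # The pickled constructor can later be restored with cloudpickle, e.g.
        # (hypothetical path):
        #   with open('make_model_<time>.pkl', 'rb') as fh:
        #       make_model = cloudpickle.loads(fh.read())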
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None: # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and save_path:
            checkdir = osp.join(save_path, 'checkpoints_{}'.format(time_string))
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    if save_path:
        checkdir = osp.join(save_path, 'checkpoints_{}'.format(time_string))
        os.makedirs(checkdir, exist_ok=True)
        savepath = osp.join(checkdir, '%.5i' % update)
        print('Final save to', savepath)
        model.save(savepath)
    env.close()
Example #4
def learn(*,
          network,
          env,
          total_timesteps,
          per_mdp_optimal_policies='ppo2',
          # the parameters below are used in the body but were missing from the
          # original signature; defaults mirror the other PPO2 examples above
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          **network_kwargs):
    '''
    Learn policy using Posterior Sampling Reinforcement Learning algorithm (TODO: link)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
Example #5
    def train(self,
              num_steps,
              player,
              replay_buffer,
              optimize_op,
              train_interval=1,
              target_interval=8192,
              batch_size=32,
              min_buffer_size=20000,
              tf_schedules=(),
              handle_ep=lambda steps, rew: None,
              timeout=None):
        """
        Run an automated training loop.

        This is meant to provide a convenient way to run a
        standard training loop without any modifications.
        You may get more flexibility by writing your own
        training loop.

        Args:
          num_steps: the number of timesteps to run.
          player: the Player for gathering experience.
          replay_buffer: the ReplayBuffer for experience.
          optimize_op: a TF Op to optimize the model.
          train_interval: timesteps per training step.
          target_interval: number of timesteps between
            target network updates.
          batch_size: the size of experience mini-batches.
          min_buffer_size: minimum replay buffer size
            before training is performed.
          tf_schedules: a sequence of TFSchedules that are
            updated with the number of steps taken.
          handle_ep: called with information about every
            completed episode.
          timeout: if set, this is a number of seconds
            after which the training loop should exit.
        """
        sess = self.online_net.session
        sess.run(self.update_target)
        steps_taken = 0
        next_target_update = target_interval
        next_train_step = train_interval
        start_time = time.time()

        eprew_buf = deque(maxlen=100)
        eplen_buf = deque(maxlen=100)
        loss_buf = deque(maxlen=self.log_interval)
        n_updates = 0

        if self.data_aug != 'no_aug' and self.mpi_rank_weight > 0:
            if self.data_aug == "cutout_color":
                self.aug_func = Cutout_Color(batch_size=batch_size)
            elif self.data_aug == "crop":
                self.aug_func = Rand_Crop(batch_size=batch_size, sess=sess)
            else:
                raise ValueError("Invalid value for argument data_aug.")

        while steps_taken < num_steps:
            if timeout is not None and time.time() - start_time > timeout:
                return
            transitions = player.play()
            for trans in transitions:
                if trans['is_last']:
                    eprew_buf.append(trans['total_reward'])
                    eplen_buf.append(trans['episode_step'] + 1)
                    # handle_ep(trans['episode_step'] + 1, trans['total_reward'])
                replay_buffer.add_sample(trans)
                steps_taken += 1
                for sched in tf_schedules:
                    sched.add_time(sess, 1)
                if replay_buffer.size >= min_buffer_size and steps_taken >= next_train_step:
                    next_train_step = steps_taken + train_interval
                    batch = replay_buffer.sample(batch_size)
                    feed_dict = self.feed_dict(batch)
                    _, losses = sess.run((optimize_op, self.losses),
                                         feed_dict=feed_dict)
                    # gather batch
                    if self.mix_mode == 'mixreg':
                        batch = [batch[i] for i in feed_dict[self.indices_ph]]
                    replay_buffer.update_weights(batch, losses)
                    loss_buf.append(np.mean(losses))
                    n_updates += 1
                    # logging
                    if n_updates % self.log_interval == 0:
                        logger.logkv('misc/is_test_work', self.mpi_rank_weight == 0)
                        logger.logkv('eprewmean', np.mean(eprew_buf))
                        logger.logkv('eplenmean', np.mean(eplen_buf))
                        logger.logkv('loss', np.mean(loss_buf))
                        logger.logkv('misc/time_elapsed', time.time() - start_time)
                        logger.logkv('misc/steps_taken', steps_taken)
                        logger.dumpkvs()
                if steps_taken >= next_target_update:
                    next_target_update = steps_taken + target_interval
                    sess.run(self.update_target)
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
			vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
			log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
			save_interval=0, load_path=None, log_path = '', train=True):
	# logger.configure('/scratch/msy290/RL/aborg/retro_contest_agent/metalearner_for_expt/model/')
	logger.configure(log_path+'model/')

	if isinstance(lr, float): lr = constfn(lr)
	else: assert callable(lr)
	if isinstance(cliprange, float): cliprange = constfn(cliprange)
	else: assert callable(cliprange)
	total_timesteps = int(total_timesteps)
	nenvs = env.num_envs
	ob_space = env.observation_space
	ac_space = env.action_space
	nbatch = nenvs * nsteps
	nbatch_train = nbatch // nminibatches
	assert nbatch % nminibatches == 0

	make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
					nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
					max_grad_norm=max_grad_norm)
	model = make_model()
	if load_path is not None:
		model.load(load_path)
	runner = Runner(env=env, model=model, nsteps=nsteps, total_timesteps=total_timesteps, gamma=gamma, lam=lam)

	epinfobuf = deque(maxlen=100)
	tfirststart = time.time()

	# Experience replay a la PPO-ER with L=2: https://arxiv.org/abs/1710.04423
	use_experience_replay = False

	nupdates = total_timesteps//nbatch
	for update in range(1, nupdates+1):
		tstart = time.time()
		frac = 1.0 - (update - 1.0) / nupdates
		lrnow = lr(frac)
		cliprangenow = cliprange(frac)
		if not use_experience_replay or update % 2 == 1:
			obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(update) #pylint: disable=E0632
		else:
			obs2, returns2, masks2, actions2, values2, neglogpacs2, states, epinfos = runner.run(update) #pylint: disable=E0632

		epinfobuf.extend(epinfos)
		mblossvals = []


		if states is None: # nonrecurrent version
			if use_experience_replay and update != 1:
				inds = list(np.arange(nbatch * 2))
				for _ in range(noptepochs):
					random.sample(inds, nbatch)
					for start in range(0, nbatch, nbatch_train):
						end = start + nbatch_train
						mbinds = inds[start:end]
						slices = (arr[mbinds] for arr in (np.concatenate((obs, obs2)), np.concatenate((returns, returns2)), np.concatenate((masks, masks2)), np.concatenate((actions, actions2)), np.concatenate((values, values2)), np.concatenate((neglogpacs, neglogpacs2))))
						
						mblossvals.append(model.train(lrnow, cliprangenow, *slices))
			else:
				inds = np.arange(int(nbatch/obs.shape[1]))

				inds = np.tile(inds, ( obs.shape[1],1))
				inds = disarrange(inds)

				for _ in range(noptepochs):
					for start in range(0, nsteps, int(nbatch_train/nenvs)):
						end = start + int(nbatch_train/nenvs)

						n_env = obs.shape[1]

						for j in range(n_env):
							
							mbinds = inds[j][start:end]
							
							slices = (arr[mbinds] for arr in (obs[:,j,:,:,:], returns[:,j], masks[:,j], actions[:,j], values[:,j], neglogpacs[:,j]))
							if train:
								mblossvals.append(model.train(j,lrnow, cliprangenow, *slices))
							else:
								mblossvals.append(model.train(nenvs,lrnow, cliprangenow, *slices))

		else: # recurrent version
			assert nenvs % nminibatches == 0
			assert use_experience_replay == False
			envsperbatch = nenvs // nminibatches
			envinds = np.arange(nenvs)
			flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
			envsperbatch = nbatch_train // nsteps
			for _ in range(noptepochs):
				np.random.shuffle(envinds)
				for start in range(0, nenvs, envsperbatch):
					end = start + envsperbatch
					mbenvinds = envinds[start:end]
					mbflatinds = flatinds[mbenvinds].ravel()
					slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
					mbstates = states[mbenvinds]
					mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

		lossvals = np.mean(mblossvals, axis=0)
		tnow = time.time()
		fps = int(nbatch / (tnow - tstart))

		values = sf01(values)
		returns = sf01(returns)

		if update % log_interval == 0 or update == 1:
			ev = explained_variance(values, returns)
			logger.logkv("serial_timesteps", update*nsteps)
			logger.logkv("nupdates", update)
			logger.logkv("total_timesteps", update*nbatch)
			logger.logkv("fps", fps)
			logger.logkv("explained_variance", float(ev))
			logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
			logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
			logger.logkv('time_elapsed', tnow - tfirststart)
			for (lossval, lossname) in zip(lossvals, model.loss_names):
				logger.logkv(lossname, lossval)
			logger.dumpkvs()
		if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
			checkdir = osp.join(logger.get_dir(), 'checkpoints')
			os.makedirs(checkdir, exist_ok=True)
			savepath = osp.join(checkdir, '%.5i'%update)
			print('Saving to', savepath)
			model.save(savepath)
		model.filmObj.reinit()
	env.close()
def learn(
        *,
        network,
        env,
        eval_env,
        make_eval_env,
        env_id,
        total_timesteps,
        seed=None,
        nsteps=2048,
        ent_coef=0.0,
        lr=3e-4,
        vf_coef=0.5,
        max_grad_norm=0.5,
        gamma=0.99,
        lam=0.95,
        log_interval=10,
        nminibatches=4,
        noptepochs=4,
        cliprange=0.2,
        sil_update=10,
        sil_value=0.01,
        sil_alpha=0.6,
        sil_beta=0.1,
        sil_loss=0.1,

        # MBL
        # For train mbl
        mbl_train_freq=5,
        # For eval
        num_eval_episodes=5,
        eval_freq=5,
        vis_eval=False,
        eval_targs=('mbmf', ),
        #eval_targs=('mf',),
        quant=2,

        # For mbl.step
        #num_samples=(1500,),
        num_samples=(1, ),
        horizon=(2, ),
        #horizon=(2,1),
        #num_elites=(10,),
        num_elites=(1, ),
        mbl_lamb=(1.0, ),
        mbl_gamma=0.99,
        #mbl_sh=1, # Number of step for stochastic sampling
        mbl_sh=10000,
        #vf_lookahead=-1,
        #use_max_vf=False,
        reset_per_step=(0, ),

        # For get_model
        num_fc=2,
        num_fwd_hidden=500,
        use_layer_norm=False,

        # For MBL
        num_warm_start=int(1e4),
        init_epochs=10,
        update_epochs=5,
        batch_size=512,
        update_with_validation=False,
        use_mean_elites=1,
        use_ent_adjust=0,
        adj_std_scale=0.5,

        # For data loading
        validation_set_path=None,

        # For data collect
        collect_val_data=False,

        # For traj collect
        traj_collect='mf',

        # For profile
        measure_time=True,
        eval_val_err=False,
        measure_rew=True,
        save_interval=0,
        load_path=None,
        model_fn=None,
        update_fn=None,
        init_fn=None,
        mpi_rank_weight=1,
        comm=None,
        **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be less than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
    if not isinstance(num_samples, tuple): num_samples = (num_samples, )
    if not isinstance(horizon, tuple): horizon = (horizon, )
    if not isinstance(num_elites, tuple): num_elites = (num_elites, )
    if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, )
    if not isinstance(reset_per_step, tuple):
        reset_per_step = (reset_per_step, )
    if validation_set_path is None:
        if collect_val_data:
            validation_set_path = os.path.join(logger.get_dir(), 'val.pkl')
        else:
            validation_set_path = os.path.join('dataset',
                                               '{}-val.pkl'.format(env_id))
    if eval_val_err:
        eval_val_err_path = os.path.join('dataset',
                                         '{}-combine-val.pkl'.format(env_id))
    logger.log(locals())
    logger.log('MBL_SH', mbl_sh)
    logger.log('Traj_collect', traj_collect)

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0
    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)
    np.set_printoptions(precision=3)
    # Get the nb of env
    nenvs = env.num_envs
    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        model_fn = Model

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               sil_update=sil_update,
                               fn_reward=None,
                               fn_obs=None,
                               sil_value=sil_value,
                               sil_alpha=sil_alpha,
                               sil_beta=sil_beta,
                               sil_loss=sil_loss,
                               comm=comm,
                               mpi_rank_weight=mpi_rank_weight,
                               ppo=True,
                               prev_pi=None)
    model = make_model()
    pi = model.sil_model

    if load_path is not None:
        model.load(load_path)

    # MBL
    # ---------------------------------------
    #viz = Visdom(env=env_id)
    win = None
    eval_targs = list(eval_targs)
    logger.log(eval_targs)

    make_model_f = get_make_mlp_model(num_fc=num_fc,
                                      num_fwd_hidden=num_fwd_hidden,
                                      layer_norm=use_layer_norm)
    mbl = MBL(env=eval_env,
              env_id=env_id,
              make_model=make_model_f,
              num_warm_start=num_warm_start,
              init_epochs=init_epochs,
              update_epochs=update_epochs,
              batch_size=batch_size,
              **network_kwargs)

    val_dataset = {'ob': None, 'ac': None, 'ob_next': None}
    if update_with_validation:
        logger.log('Update with validation')
        val_dataset = load_val_data(validation_set_path)
    if eval_val_err:
        logger.log('Log val error')
        eval_val_dataset = load_val_data(eval_val_err_path)
    if collect_val_data:
        logger.log('Collect validation data')
        val_dataset_collect = []

    def _mf_pi(ob, t=None):
        stochastic = True
        ac, vpred, _, _ = pi.step(ob, stochastic=stochastic)
        return ac, vpred

    def _mf_det_pi(ob, t=None):
        #ac, vpred, _, _ = pi.step(ob, stochastic=False)
        ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob)
        return ac, vpred

    def _mf_ent_pi(ob, t=None):
        mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob)
        ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape)
        return ac, vpred
    # When use_ent_adjust is set, actions are sampled around the policy mean with the
    # std scaled by adj_std_scale (see _mf_ent_pi above)

    def _mbmf_inner_pi(ob, t=0):
        if use_ent_adjust:
            return _mf_ent_pi(ob)
        else:
            #return _mf_pi(ob)
            if t < mbl_sh: return _mf_pi(ob)
            else: return _mf_det_pi(ob)
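    # _mbmf_inner_pi is the policy the MBL planner rolls out: for the first mbl_sh steps
    # it samples stochastically from the model-free policy (_mf_pi), afterwards it acts
    # deterministically (_mf_det_pi); with the default mbl_sh=10000 the stochastic branch
    # is effectively always taken.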

# ---------------------------------------

# Evaluate multiple configurations in one run

    all_eval_descs = []

    def make_mbmf_pi(n, h, e, l):
        def _mbmf_pi(ob):
            ac, rew = mbl.step(ob=ob,
                               pi=_mbmf_inner_pi,
                               horizon=h,
                               num_samples=n,
                               num_elites=e,
                               gamma=mbl_gamma,
                               lamb=l,
                               use_mean_elites=use_mean_elites)
            return ac[None], rew

        return Policy(step=_mbmf_pi, reset=None)

    for n in num_samples:
        for h in horizon:
            for l in mbl_lamb:
                for e in num_elites:
                    if 'mbmf' in eval_targs:
                        all_eval_descs.append(('MeanRew', 'MBL_PPO_SIL',
                                               make_mbmf_pi(n, h, e, l)))
                    #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l)))
    if 'mf' in eval_targs:
        all_eval_descs.append(
            ('MeanRew', 'PPO_SIL', Policy(step=_mf_pi, reset=None)))

    logger.log('List of evaluation targets')
    for it in all_eval_descs:
        logger.log(it[0])

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    pool = Pool(mp.cpu_count())
    warm_start_done = False
    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=40)
    if init_fn is not None: init_fn()

    if traj_collect == 'mf':
        obs = runner.run()[0]

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        if hasattr(model.train_model, "ret_rms"):
            model.train_model.ret_rms.update(returns)
        if hasattr(model.train_model, "rms"):
            model.train_model.rms.update(obs)
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632

        # Val data collection
        if collect_val_data:
            for ob_, ac_, ob_next_ in zip(obs[:-1, 0, ...], actions[:-1, ...],
                                          obs[1:, 0, ...]):
                val_dataset_collect.append(
                    (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_)))
        # -----------------------------
        # MBL update
        else:
            ob_mbl, ac_mbl = obs.copy(), actions.copy()

            mbl.add_data_batch(ob_mbl[:-1, ...], ac_mbl[:-1, ...], ob_mbl[1:,
                                                                          ...])
            mbl.update_forward_dynamic(require_update=(update - 1) %
                                       mbl_train_freq == 0,
                                       ob_val=val_dataset['ob'],
                                       ac_val=val_dataset['ac'],
                                       ob_next_val=val_dataset['ob_next'])
        # -----------------------------

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)

        # For each minibatch, compute the loss and append it to mblossvals.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
            l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train(lrnow)

        else:  # recurrent version
            print("caole")
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv("AverageReturn",
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)
            if sil_update > 0:
                logger.logkv("sil_samples", sil_samples)

            if rank == 0:
                # MBL evaluation
                if not collect_val_data:
                    #set_global_seeds(seed)
                    default_sess = tf.get_default_session()

                    def multithread_eval_policy(env_, pi_, num_episodes_,
                                                vis_eval_, seed):
                        with default_sess.as_default():
                            if hasattr(env, 'ob_rms') and hasattr(
                                    env_, 'ob_rms'):
                                env_.ob_rms = env.ob_rms
                            res = eval_policy(env_, pi_, num_episodes_,
                                              vis_eval_, seed, measure_time,
                                              measure_rew)

                            try:
                                env_.close()
                            except Exception:
                                pass
                        return res

                    if mbl.is_warm_start_done() and update % eval_freq == 0:
                        warm_start_done = mbl.is_warm_start_done()
                        if num_eval_episodes > 0:
                            targs_names = {}
                            with timed('eval'):
                                num_descs = len(all_eval_descs)
                                list_field_names = [
                                    e[0] for e in all_eval_descs
                                ]
                                list_legend_names = [
                                    e[1] for e in all_eval_descs
                                ]
                                list_pis = [e[2] for e in all_eval_descs]
                                list_eval_envs = [
                                    make_eval_env() for _ in range(num_descs)
                                ]
                                list_seed = [seed for _ in range(num_descs)]
                                list_num_eval_episodes = [
                                    num_eval_episodes for _ in range(num_descs)
                                ]
                                print(list_field_names)
                                print(list_legend_names)

                                list_vis_eval = [
                                    vis_eval for _ in range(num_descs)
                                ]

                                for i in range(num_descs):
                                    field_name = list_field_names[i]
                                    legend_name = list_legend_names[i]

                                    res = multithread_eval_policy(
                                        list_eval_envs[i], list_pis[i],
                                        list_num_eval_episodes[i],
                                        list_vis_eval[i], seed)
                                    #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed))

                                    #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results):
                                    perf, elapsed_time, eval_rew = res
                                    logger.logkv(field_name, perf)
                                    if measure_time:
                                        logger.logkv('Time-%s' % (field_name),
                                                     elapsed_time)
                                    if measure_rew:
                                        logger.logkv(
                                            'SimRew-%s' % (field_name),
                                            eval_rew)
                                    targs_names[field_name] = legend_name

                        if eval_val_err:
                            fwd_dynamics_err = mbl.eval_forward_dynamic(
                                obs=eval_val_dataset['ob'],
                                acs=eval_val_dataset['ac'],
                                obs_next=eval_val_dataset['ob_next'])
                            logger.logkv('FwdValError', fwd_dynamics_err)

                        #logger.dump_tabular()
                        logger.dumpkvs()
                        #print(logger.get_dir())
                        #print(targs_names)
                        #if num_eval_episodes > 0:
#                            win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best')
#else: logger.dumpkvs()
# -----------
            yield pi

        if collect_val_data:
            with open(validation_set_path, 'wb') as f:
                pickle.dump(val_dataset_collect, f)
            logger.log('Saved {} validation samples'.format(
                len(val_dataset_collect)))
        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
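The two optimization branches above differ only in how minibatch indices are drawn: the nonrecurrent path shuffles individual timesteps, while the recurrent path shuffles whole environments so each state sequence stays contiguous. A minimal, self-contained sketch of that indexing scheme (toy shapes; the names mirror the variables above, but the values are illustrative):

import numpy as np

nenvs, nsteps, nminibatches, noptepochs = 4, 8, 2, 3
nbatch = nenvs * nsteps              # flattened batch size
nbatch_train = nbatch // nminibatches

data = np.arange(nbatch)             # stands in for obs/returns/actions/...

# Nonrecurrent policies: shuffle individual timesteps.
inds = np.arange(nbatch)
for _ in range(noptepochs):
    np.random.shuffle(inds)
    for start in range(0, nbatch, nbatch_train):
        mbinds = inds[start:start + nbatch_train]
        minibatch = data[mbinds]     # model.train(...) would consume this slice

# Recurrent policies: shuffle whole environments so sequences stay contiguous.
envinds = np.arange(nenvs)
flatinds = np.arange(nbatch).reshape(nenvs, nsteps)
envsperbatch = nenvs // nminibatches
for _ in range(noptepochs):
    np.random.shuffle(envinds)
    for start in range(0, nenvs, envsperbatch):
        mbenvinds = envinds[start:start + envsperbatch]
        mbflatinds = flatinds[mbenvinds].ravel()
        minibatch = data[mbflatinds]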
Example #8
def learn(*,
          policy,
          env,
          nsteps,
          total_episodes,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          keep_all_ckpt=False):

    # FIXME(cpacker):
    # Callable lr and cliprange don't work (at the moment) with the
    # total_episodes terminating condition
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        # callable lr is unsupported here (see FIXME above)
        raise NotImplementedError
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        # callable cliprange is unsupported here (see FIXME above)
        raise NotImplementedError
    # total_timesteps = int(total_timesteps)
    total_episodes = int(total_episodes)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    # nupdates = total_timesteps//nbatch
    # for update in range(1, nupdates+1):
    update = 0
    episodes_so_far = 0
    old_savepath = None
    while True:
        update += 1
        if episodes_so_far > total_episodes:
            break

        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        # frac = 1.0 - (update - 1.0) / nupdates
        frac = 1.0 - (update - 1.0) / total_episodes
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos, num_episodes = runner.run(
        )  #pylint: disable=E0632

        # NOTE(cpacker): Is this the best/correct way to keep track of n_eps?
        #episodes_so_far += len(epinfos)
        episodes_so_far += num_episodes

        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_episodes", episodes_so_far)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            obs_norms = {}
            obs_norms['clipob'] = env.clipob
            obs_norms['mean'] = env.ob_rms.mean
            obs_norms['var'] = env.ob_rms.var + env.epsilon
            with open(osp.join(checkdir, 'normalize'), 'wb') as f:
                pickle.dump(obs_norms, f, pickle.HIGHEST_PROTOCOL)
            model.save(savepath)

            if not keep_all_ckpt and old_savepath:
                print('Removing previous checkpoint', old_savepath)
                os.remove(old_savepath)
            old_savepath = savepath

    env.close()
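The checkpoints above store the observation-normalization statistics ('clipob', 'mean', 'var') next to the model weights. A minimal sketch of how those pickled statistics could be applied to raw observations at evaluation time, assuming the usual VecNormalize formula (the helper name and path are hypothetical):

import pickle

import numpy as np


def load_obs_normalizer(normalize_path):
    # normalize_path points at a 'normalize' pickle like the one written above
    with open(normalize_path, 'rb') as f:
        obs_norms = pickle.load(f)  # {'clipob': ..., 'mean': ..., 'var': ...}

    def normalize(obs):
        # 'var' already includes the epsilon term added before pickling
        norm = (obs - obs_norms['mean']) / np.sqrt(obs_norms['var'])
        return np.clip(norm, -obs_norms['clipob'], obs_norms['clipob'])

    return normalize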
Example #9
    def log(self, rewards, dones):
        self.logs['ep_rew'] += rewards
        self.logs['dones'] = np.maximum(self.logs['dones'], dones)
        if sum(self.logs['dones']) < self.envs.num_envs:
            return
        self.logs['eps'] += self.envs.num_envs
        self.logs['rew_best'] = max(self.logs['rew_best'], np.mean(self.logs['ep_rew']))

        elapsed_time = time.time() - self.logs['start_time']
        frames = self.envs.num_envs * self.n_steps * self.logs['updates']

        logger.logkv('fps', int(frames / elapsed_time))
        logger.logkv('elapsed_time', int(elapsed_time))
        logger.logkv('n_eps', self.logs['eps'])
        logger.logkv('n_samples', frames)
        logger.logkv('n_updates', self.logs['updates'])
        logger.logkv('rew_best_mean', self.logs['rew_best'])
        logger.logkv('rew_max', np.max(self.logs['ep_rew']))
        logger.logkv('rew_mean', np.mean(self.logs['ep_rew']))
        logger.logkv('rew_mestd', np.std(self.logs['ep_rew'])) # weird name to ensure it's above min since logger sorts
        logger.logkv('rew_min', np.min(self.logs['ep_rew']))
        logger.dumpkvs()

        self.logs['dones'] = np.zeros(self.envs.num_envs)
        self.logs['ep_rew'] = np.zeros(self.envs.num_envs)
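The logger above only dumps once every parallel environment has finished at least one episode; np.maximum keeps a sticky per-environment done flag between calls. A standalone toy illustration of that bookkeeping (the step values are made up):

import numpy as np

num_envs = 4
ep_rew = np.zeros(num_envs)
dones_seen = np.zeros(num_envs)

# Three consecutive steps of per-env rewards and done flags.
steps = [
    (np.array([1.0, 0.5, 0.0, 2.0]), np.array([0, 0, 0, 1])),
    (np.array([0.5, 0.5, 1.0, 0.0]), np.array([1, 0, 0, 0])),
    (np.array([0.0, 1.0, 0.5, 0.0]), np.array([0, 1, 1, 0])),
]
for rewards, dones in steps:
    ep_rew += rewards
    dones_seen = np.maximum(dones_seen, dones)   # once done, stays done
    if dones_seen.sum() < num_envs:
        continue                                 # keep accumulating
    print('mean episode reward:', ep_rew.mean())
    ep_rew[:] = 0
    dones_seen[:] = 0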
Example #10
def learn(*, network, env, total_timesteps, nsteps=2048, lr=3e-4, vf_coef=0.5, gamma=0.99, lam=0.95, log_interval=1,
          save_interval=0, load_path=None, gradstepsperepoch=32, noptepochs=10, epsilon=0.4, replay_length=64,
          J_targ=0.001, epsilon_b=0.1, gaev=1, eval_env=None, seed=None,
          **network_kwargs):
    '''
    Dimension-Wise Importance Sampling Weight Clipping (DISC) parameters

    Parameters:
    ----------

    network:                          multi-layer perceptrons (MLP) with 2 hidden layers of size 64

    env:                              Mujoco environment

    eval_env:                         environment for the deterministic evaluation

    total_timesteps: int              number of time steps

    nsteps (N): int                   size of a sample batch

    lr (beta): float or function      learning rate, decreasing linearly as iterations go on

    vf_coef: float                    value function loss coefficient

    gamma: float                      discounting factor

    lam (lambda) : float              discounting factor for GAE

    log_interval: int                 number of time steps between logging events

    save_interval: int                number of time steps between saving events

    load_path: str                    path to load the model from

    gradstepsperepoch: int            number of gradient steps per epoch

    noptepochs: int                   number of training epochs per update

    epsilon : float                   clipping factor for dimension-wise clipping

    replay_length (L): int            maximum number of sample batches stored in the replay buffer

    J_targ: float                     IS target constant

    epsilon_b : float                 batch inclusion factor

    gaev : int                        use GAE-V if gaev = 1, and use GAE otherwise

    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(epsilon, float): epsilon = constfn(epsilon)
    else: assert callable(epsilon)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space
    obdim = ob_space.shape[0]
    acdim = ac_space.shape[0]
    print("Observation space dimension : " + str(obdim))
    print("Action space dimension : " + str(acdim))

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // gradstepsperepoch

    # Instantiate the model object (that creates act_model and train_model)
    make_model = lambda : Model(policy=policy, nbatch_act=nenvs, nsteps=nsteps, vf_coef=vf_coef)
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = EvalRunner(env = eval_env, model = model, nsteps = 10*nsteps, gamma = gamma, lam= lam)
        eval_runner.obfilt=runner.obfilt
        eval_runner.rewfilt=runner.rewfilt

    epinfobuf = deque(maxlen=10)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps//nbatch

    def GAE(seg, gamma, value, lam):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE
        """
        done = np.append(seg["done"], 0)  # last element is only used for last vtarg, but we already zeroed it if last new = 1

        T = len(seg["rew"])
        gaelam = np.empty(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - done[t + 1]
            delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t]
            gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        ret = gaelam + value[:-1]
        return ret, gaelam

    def GAE_V(seg, gamma, value, rho):
        """
        Compute target value using V-trace estimator, and advantage with GAE-V
        """
        done = np.append(seg["done"], 0)  # last element is only used for last vtarg, but we already zeroed it if last new = 1
        rho_ = np.append(rho, 1.0)
        r = np.minimum(1.0, rho_)

        T = len(seg["rew"])
        gaelam = np.empty(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - done[t + 1]
            delta = (rew[t] + gamma * value[t + 1] * nonterminal - value[t])
            gaelam[t] = delta + gamma * lam * nonterminal * lastgaelam
            lastgaelam = r[t] * gaelam[t]
        ret = r[:-1]*gaelam + value[:-1]
        return ret, gaelam

    seg = None
    # Calculate the epsilon
    epsilonnow = epsilon(1.0)
    alpha_IS=1.0
    for update in range(1, nupdates+1):
        assert nbatch % gradstepsperepoch == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = np.maximum(1e-4, lr(frac))

        if seg is None:
            prev_seg = seg
            seg = {}
        else:
            prev_seg = {}
            for i in seg:
                prev_seg[i] = np.copy(seg[i])

        # Run a sample batch
        seg["ob"], seg["rew"], seg["done"], seg["ac"], seg["neglogp"], seg["mean"], seg["logstd"], final_obs, final_done, epinfos = runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfos = eval_runner.run()

        # Stack the sample batches (the maximum length is L)
        if prev_seg is not None:
            for key in seg:
                if len(np.shape(seg[key])) == 1:
                    seg[key] = np.hstack([prev_seg[key], seg[key]])
                else:
                    seg[key] = np.vstack([prev_seg[key], seg[key]])
                if np.shape(seg[key])[0] > replay_length * nsteps:
                    seg[key] = seg[key][-replay_length * nsteps:]

        # Compute all values of all samples in the buffer
        ob_stack = np.vstack([seg["ob"], final_obs])
        values = model.values(runner.obfilt(ob_stack))
        values[-1] = (1.0-final_done) * values[-1]
        ob = runner.obfilt(seg["ob"])

        # Compute IS weight of all samples in the buffer
        mean_now, logstd_now = model.meanlogstds(ob)
        neglogpnow = 0.5 * np.sum(np.square((seg["ac"] - mean_now) / np.exp(logstd_now)), axis=-1) \
                      + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \
                      + np.sum(logstd_now, axis=-1)
        rho = np.exp(-neglogpnow + seg["neglogp"])

        # Estimate target values and advantages
        if gaev==1:
            ret, gae = GAE_V(seg, gamma, values, rho)
        else:
            ret, gae = GAE(seg, gamma, values, lam)

        # Select sample batches that satisfy the batch-limiting condition in the paper
        prior_prob = np.zeros(len(seg["ob"]))
        rho_dim =  np.exp(- 0.5 * np.square((seg["ac"] - mean_now) / np.exp(logstd_now)) \
                - logstd_now + 0.5 * np.square((seg["ac"] - seg["mean"]) / np.exp(seg["logstd"])) + seg["logstd"])

        for i in range(int(len(prior_prob) / nsteps)):
            batch_condition = np.mean(np.abs(rho_dim[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0)
            if batch_condition > 1 + epsilon_b:
                prior_prob[i * nsteps:(i + 1) * nsteps] = 0
            else:
                prior_prob[i * nsteps:(i + 1) * nsteps] = 1



        # For each minibatch, compute the loss and append it to mblossvals.
        mblossvals = []
        # Index of each element of batch_size
        # Create the indices array

        # On-policy data indices and minibatch size
        inds_on = np.arange(nsteps)+len(seg["ob"]) - nsteps
        nbatch_adapt_on = int((nsteps) / nsteps * nbatch_train)

        # Off-policy data indices and minibatch size
        inds_off = np.arange(len(seg["ob"]) - nsteps)
        nbatch_adapt_off = int((np.sum(prior_prob) - nsteps) / nsteps * nbatch_train)
        
        # On-policy data index
        on_policy_data = np.ones(len(seg["ob"])) * np.sum(prior_prob) / nsteps
        on_policy_data[:-nsteps]=0

        for _ in range(noptepochs):
            losses_epoch = []
            for _ in range(int(nsteps/nbatch_train)):
                # Choose sample minibatch indices of off policy trajectories
                if nbatch_adapt_off>0:
                    idx_off = np.random.choice(inds_off, nbatch_adapt_off,p=prior_prob[:-nsteps]/np.sum(prior_prob[:-nsteps]))
                else:
                    idx_off = []

                # Choose sample minibatch indices of on policy trajectories
                idx_on = np.random.choice(inds_on, nbatch_adapt_on)

                all_idx = np.hstack([idx_off,idx_on]).astype(int)

                # Sample minibatch
                slices = (arr[all_idx] for arr in (ob, ret, gae, seg["ac"], values[:-1], seg["neglogp"], seg["mean"], seg["logstd"], on_policy_data, rho))

                # Train the model
                loss_epoch = model.train(lrnow, epsilonnow, alpha_IS, *slices)
                mblossvals.append(loss_epoch)
                losses_epoch.append(loss_epoch)

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)

        # Update adaptive IS target constant
        print("IS loss avg :", lossvals[3])
        if lossvals[3] > J_targ * 1.5:
            alpha_IS *= 2

            print("Adaptive IS loss factor is increased")
        elif lossvals[3] < J_targ / 1.5:
            alpha_IS /= 2
            print("Adaptive IS loss factor is reduced")
        alpha_IS = np.clip(alpha_IS,2**(-10),64)

        # End timer
        tnow = time.time()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            logger.logkv("adaptive IS loss factor", alpha_IS)
            logger.logkv("clipping factor", epsilonnow)
            logger.logkv("learning rate", lrnow)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfos]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfos]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
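GAE and GAE_V above share the same backward recursion; GAE_V merely scales the carried term by the truncated importance weight min(1, rho) before reuse. A self-contained toy run of the plain GAE recursion, with illustrative rewards and values rather than anything produced by the code above:

import numpy as np

gamma, lam = 0.99, 0.95
rew = np.array([1.0, 0.0, 0.5, 1.0], dtype='float32')         # r_0..r_3
done = np.array([0.0, 0.0, 1.0, 0.0], dtype='float32')        # done flags, same convention as GAE() above
value = np.array([0.8, 0.7, 0.9, 0.6, 0.5], dtype='float32')  # V(s_0)..V(s_T); last entry bootstraps the tail

done = np.append(done, 0)          # trailing 0, exactly as in GAE() above
T = len(rew)
gaelam = np.empty(T, 'float32')
lastgaelam = 0.0
for t in reversed(range(T)):
    nonterminal = 1 - done[t + 1]
    delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t]
    gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
ret = gaelam + value[:-1]          # TD(lambda) value targets
print('advantages:', gaelam)
print('returns   :', ret)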
Example #11
 def set_steps_to_remain(self, steps_to_remain):
     if self.args.scheduler_type == "global":
         self.steps_to_remain = steps_to_remain
         self.last_timesteps_so_far = self.locals["timesteps_so_far"]
         logger.logkv("steps_to_remain", steps_to_remain)
         print("GLOBAL curriculum: ", steps_to_remain)
Example #12
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.01,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=1,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          save_model_path=None,
          model_fn=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network_type = network
        policy_network_fn = get_network_builder(network_type)(**network_kwargs)
        network = policy_network_fn(ob_space.shape)

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(ac_space=ac_space,
                     policy_network=network,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)

    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=model)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

#  epinfobuf = deque(maxlen=100)
    epinfobuf_rewards = deque(maxlen=100)
    epinfobuf_len = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        #  for update in range(0, nupdates+1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

    # epinfobuf.extend(epinfos)
        epinfobuf_rewards.extend(epinfos['r'])
        epinfobuf_len.extend([len(epinfos['l'])])
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, compute the loss and append it to mblossvals.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (tf.constant(arr[mbinds])
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            raise ValueError('Recurrent policies are not supported yet')

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo for epinfo in epinfobuf_rewards]))
            logger.logkv('eplenmean',
                         safemean([epinfo for epinfo in epinfobuf_len]))
            # logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            # logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            # logger.logkv('eprewmean', safemean(returns))

            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()

            if save_model_path is not None:
                # Save per-seed weight files under <save_model_path>/models
                filepath = os.path.join(save_model_path, 'models')
                model.train_model.value_network.save_weights(
                    '{}/world_model_value_net_{}.h5'.format(filepath, seed))
                model.train_model.policy_network.save_weights(
                    '{}/world_model_policy_net_{}.h5'.format(filepath, seed))

    return model
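The load path above is expected to be a tf.train.Checkpoint directory rather than a single file. A minimal sketch of the matching save side, assuming a hypothetical checkpoint_dir (this writer is not part of the code above):

import tensorflow as tf


def make_checkpoint_writer(model, checkpoint_dir, max_to_keep=5):
    # Mirror of the restore logic above: the checkpoint tracks the model object.
    ckpt = tf.train.Checkpoint(model=model)
    manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=max_to_keep)

    def save(step=None):
        # Writes ckpt-<n> files under checkpoint_dir and prunes old ones.
        return manager.save(checkpoint_number=step)

    return save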
Example #13
def test_one_env(alt_flag,
                 model,
                 start_level,
                 num_levels,
                 logger,
                 args,
                 env=None):
    ## Modified based on random_ppo.learn
    if not env:
        venv = ProcgenEnv(num_envs=num_envs,
                          env_name=args.env_name,
                          num_levels=num_levels,
                          start_level=start_level,
                          distribution_mode=args.distribution_mode)
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(
            venv=venv,
            filename=None,
            keep_buf=100,
        )
        venv = VecNormalize(venv=venv, ob=False)
        env = venv

    runner = TestRunner(env=env,
                        model=model,
                        nsteps=nsteps,
                        gamma=gamma,
                        lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    mean_rewards = []
    datapoints = []
    for rollout in range(1, args.nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            alt_flag)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('start_level', start_level)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * args.nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()
    logger.info("Average reward on levels {} ~ {}: {} ".format(
        start_level, start_level + num_levels, mean_rewards))
    return np.mean(mean_rewards)
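safemean above is assumed to be the usual baselines helper that tolerates an empty episode buffer by returning NaN instead of raising; a small sketch for reference:

import numpy as np


def safemean(xs):
    # Return NaN rather than raising when no episode has finished yet.
    return np.nan if len(xs) == 0 else np.mean(xs)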
Example #14
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, compute the loss and append it to mblossvals.
        mblossvals = []
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
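lr and cliprange above accept either a float (wrapped by constfn) or a schedule called with frac, which starts at 1.0 on the first update and decays toward 0.0 on the last. A small self-contained sketch of how such schedules behave (the linear decay is illustrative, not part of the code above):

# frac is computed inside learn() as 1 - (update - 1) / nupdates:
# 1.0 on the first update, approaching 0.0 on the last.
constant_lr = lambda frac: 3e-4          # equivalent to passing the float 3e-4
linear_lr = lambda frac: 3e-4 * frac     # anneals toward 0 over training
linear_clip = lambda frac: 0.2 * frac

nupdates = 100
for update in (1, 50, 100):
    frac = 1.0 - (update - 1.0) / nupdates
    print(update, constant_lr(frac), linear_lr(frac), linear_clip(frac))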
Example #15
def ppo(env,
        policy,
        val_fn=None,
        total_steps=TOTAL_STEPS_DEFAULT,
        steps=125,
        n_envs=16,
        gamma=0.99,
        gaelam=0.96,
        clip_ratio=0.2,
        pol_iters=80,
        val_iters=80,
        pol_lr=3e-4,
        val_lr=1e-3,
        target_kl=0.01,
        mb_size=100,
        **saver_kwargs):
    val_fn = val_fn or ValueFunction.from_policy(policy)

    logu.save_config(locals())
    saver = SnapshotSaver(logger.get_dir(), locals(), **saver_kwargs)

    vec_env = VecEnvMaker(env)(n_envs)
    policy = policy.pop("class")(vec_env, **policy)
    val_fn = val_fn.pop("class")(vec_env, **val_fn)
    pol_optim = torch.optim.Adam(policy.parameters(), lr=pol_lr)
    val_optim = torch.optim.Adam(val_fn.parameters(), lr=val_lr)
    loss_fn = torch.nn.MSELoss()

    # Algorithm main loop
    collector = parallel_samples_collector(vec_env, policy, steps)
    beg, end, stp = steps * n_envs, total_steps + steps * n_envs, steps * n_envs
    for samples in trange(beg, end, stp, desc="Training", unit="step"):
        logger.info("Starting iteration {}".format(samples // stp))
        logger.logkv("Iteration", samples // stp)

        logger.info("Start collecting samples")
        trajs = next(collector)

        logger.info("Computing policy gradient variables")
        compute_pg_vars(trajs, val_fn, gamma, gaelam)
        flatten_trajs(trajs)
        all_obs, all_acts, _, _, all_advs, all_vals, all_rets = trajs.values()
        all_obs, all_vals = all_obs[:-n_envs], all_vals[:-n_envs]

        logger.info("Minimizing surrogate loss")
        with torch.no_grad():
            old_dists = policy(all_obs)
        old_logp = old_dists.log_prob(all_acts)
        min_advs = torch.where(all_advs > 0, (1 + clip_ratio) * all_advs,
                               (1 - clip_ratio) * all_advs)
        dataset = TensorDataset(all_obs, all_acts, all_advs, min_advs,
                                old_logp)
        dataloader = DataLoader(dataset, batch_size=mb_size, shuffle=True)
        for itr in range(pol_iters):
            for obs, acts, advs, min_adv, logp in dataloader:
                ratios = (policy(obs).log_prob(acts) - logp).exp()
                pol_optim.zero_grad()
                (-torch.min(ratios * advs, min_adv)).mean().backward()
                pol_optim.step()

            with torch.no_grad():
                mean_kl = kl(old_dists, policy(all_obs)).mean().item()
            if mean_kl > 1.5 * target_kl:
                logger.info(
                    "Stopped at step {} due to reaching max kl".format(itr +
                                                                       1))
                break
        logger.logkv("StopIter", itr + 1)

        logger.info("Updating val_fn")
        for _ in range(val_iters):
            val_optim.zero_grad()
            loss_fn(val_fn(all_obs), all_rets).backward()
            val_optim.step()

        logger.info("Logging information")
        logger.logkv("TotalNSamples", samples)
        logu.log_reward_statistics(vec_env)
        logu.log_val_fn_statistics(all_vals, all_rets)
        logu.log_action_distribution_statistics(old_dists)
        logger.logkv("MeanKL", mean_kl)
        logger.dumpkvs()

        logger.info("Saving snapshot")
        saver.save_state(
            index=samples // stp,
            state=dict(
                alg=dict(last_iter=samples // stp),
                policy=policy.state_dict(),
                val_fn=val_fn.state_dict(),
                pol_optim=pol_optim.state_dict(),
                val_optim=val_optim.state_dict(),
            ),
        )

    vec_env.close()
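The surrogate loss above precomputes min_advs = (1 ± clip_ratio) * advs once per iteration, so the inner loop only needs a single torch.min; this is algebraically identical to the textbook clipped objective. A small check on toy tensors:

import torch

clip_ratio = 0.2
advs = torch.tensor([1.5, -0.7, 0.3, -2.0])
ratios = torch.tensor([1.4, 0.6, 1.1, 0.95])

# Standard PPO clipped surrogate.
standard = torch.min(ratios * advs,
                     torch.clamp(ratios, 1 - clip_ratio, 1 + clip_ratio) * advs)

# Precomputed form used above: the clipping is folded into the advantage side.
min_advs = torch.where(advs > 0, (1 + clip_ratio) * advs, (1 - clip_ratio) * advs)
precomputed = torch.min(ratios * advs, min_advs)

assert torch.allclose(standard, precomputed)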
Example #16
def a2c_kfac(env,
             policy,
             val_fn=None,
             total_steps=TOTAL_STEPS_DEFAULT,
             steps=20,
             n_envs=16,
             kfac=None,
             ent_coeff=0.01,
             vf_loss_coeff=0.5,
             gamma=0.99,
             log_interval=100,
             warm_start=None,
             **saver_kwargs):
    assert val_fn is None or not issubclass(
        policy["class"], WeightSharingAC
    ), "Choose between a weight sharing model or separate policy and val_fn"

    # handle default values
    kfac = kfac or {}
    kfac = {
        "eps": 1e-3,
        "pi": True,
        "alpha": 0.95,
        "kl_clip": 1e-3,
        "eta": 1.0,
        **kfac
    }
    if val_fn is None and not issubclass(policy["class"], WeightSharingAC):
        val_fn = ValueFunction.from_policy(policy)

    # save config and setup state saving
    logu.save_config(locals())
    saver = SnapshotSaver(logger.get_dir(), locals(), **saver_kwargs)

    # initialize models and optimizer
    vec_env = VecEnvMaker(env)(n_envs)
    policy = policy.pop("class")(vec_env, **policy)
    module_list = torch.nn.ModuleList(policy.modules())
    if val_fn is not None:
        val_fn = val_fn.pop("class")(vec_env, **val_fn)
        module_list.extend(val_fn.modules())
    optimizer = KFACOptimizer(module_list, **kfac)
    # scheduler = LinearLR(optimizer, total_steps // (steps*n_envs))
    loss_fn = torch.nn.MSELoss()

    # load state if provided
    updates = 0
    if warm_start is not None:
        if ":" in warm_start:
            warm_start, index = warm_start.split(":")
        else:
            index = None
        config, state = SnapshotSaver(warm_start,
                                      latest_only=False).get_state(int(index))
        policy.load_state_dict(state["policy"])
        if "optimizer" in state:
            optimizer.load_state_dict(state["optimizer"])
        updates = state["alg"]["last_updt"]

    # Algorithm main loop
    if val_fn is None:
        compute_dists_vals = policy
    else:

        def compute_dists_vals(obs):
            return policy(obs), val_fn(obs)

    ob_space, ac_space = vec_env.observation_space, vec_env.action_space
    obs = torch.from_numpy(vec_env.reset())
    with torch.no_grad():
        acts = policy.actions(obs)
    logger.info("Starting epoch {}".format(1))
    beg, end, stp = steps * n_envs, total_steps + steps * n_envs, steps * n_envs
    total_updates = total_steps // stp
    for samples in trange(beg, end, stp, desc="Training", unit="step"):
        all_obs = torch.empty((steps, n_envs) + ob_space.shape,
                              dtype=_NP_TO_PT[ob_space.dtype.type])
        all_acts = torch.empty((steps, n_envs) + ac_space.shape,
                               dtype=_NP_TO_PT[ac_space.dtype.type])
        all_rews = torch.empty((steps, n_envs))
        all_dones = torch.empty((steps, n_envs))

        with torch.no_grad():
            for i in range(steps):
                next_obs, rews, dones, _ = vec_env.step(acts.numpy())
                all_obs[i] = obs
                all_acts[i] = acts
                all_rews[i] = torch.from_numpy(rews)
                all_dones[i] = torch.from_numpy(dones.astype("f"))
                obs = torch.from_numpy(next_obs)

                acts = policy.actions(obs)

        all_obs = all_obs.reshape(stp, -1).squeeze()
        all_acts = all_acts.reshape(stp, -1).squeeze()

        # Sample Fisher curvature matrix
        with optimizer.record_stats():
            optimizer.zero_grad()
            all_dists, all_vals = compute_dists_vals(all_obs)
            logp = all_dists.log_prob(all_acts)
            noise = all_vals.detach() + 0.5 * torch.randn_like(all_vals)
            (logp.mean() +
             loss_fn(all_vals, noise)).backward(retain_graph=True)

        # Compute returns and advantages
        with torch.no_grad():
            _, next_vals = compute_dists_vals(obs)
        all_rets = all_rews.clone()
        all_rets[-1] += gamma * (1 - all_dones[-1]) * next_vals
        for i in reversed(range(steps - 1)):
            all_rets[i] += gamma * (1 - all_dones[i]) * all_rets[i + 1]
        all_rets = all_rets.flatten()
        all_advs = all_rets - all_vals.detach()

        # Compute loss
        updates += 1
        # ent_coeff = ent_coeff*0.99 if updates % 10 == 0 else ent_coeff
        pi_loss = -torch.mean(logp * all_advs)
        vf_loss = loss_fn(all_vals, all_rets)
        entropy = all_dists.entropy().mean()
        total_loss = pi_loss - ent_coeff * entropy + vf_loss_coeff * vf_loss

        # scheduler.step()
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        if updates == 1 or updates % log_interval == 0:
            logger.logkv("Epoch", updates // log_interval + 1)
            logger.logkv("TotalNSamples", samples)
            logu.log_reward_statistics(vec_env)
            logu.log_val_fn_statistics(all_vals, all_rets)
            logu.log_action_distribution_statistics(all_dists)
            logger.dumpkvs()
            logger.info("Starting epoch {}".format(updates // log_interval +
                                                   2))

        saver.save_state(
            index=updates,
            state=dict(
                alg=dict(last_updt=updates),
                policy=policy.state_dict(),
                val_fn=None if val_fn is None else val_fn.state_dict(),
                optimizer=optimizer.state_dict(),
            ),
        )

    vec_env.close()
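
The backward sweep above folds the bootstrapped value into the final reward and then propagates discounted returns through the non-terminal steps. A minimal standalone sketch of the same recursion (the function name and toy values are illustrative; tensors are assumed to be shaped (steps, n_envs) as in the loop above):

import torch

def n_step_returns(rews, dones, last_vals, gamma=0.99):
    # Same recursion as above: bootstrap the final step with the value of the
    # next observation, then discount backwards through non-terminal steps.
    rets = rews.clone()
    rets[-1] += gamma * (1 - dones[-1]) * last_vals
    for i in reversed(range(len(rews) - 1)):
        rets[i] += gamma * (1 - dones[i]) * rets[i + 1]
    return rets

# Toy check: 3 steps, 1 env, no terminals, bootstrap value 1.0
rews = torch.ones(3, 1)
dones = torch.zeros(3, 1)
print(n_step_returns(rews, dones, torch.tensor([1.0])))
# approximately tensor([[3.9404], [2.9701], [1.9900]])
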
Example #17
def learn(*,
          network,
          env,
          total_timesteps,
          early_stopping=False,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          scope='',
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
    additional_params = network_kwargs["network_kwargs"]
    from baselines import logger

    # set_global_seeds(seed)  # seeding is handled upstream

    if "LR_ANNEALING" in additional_params.keys():
        lr_reduction_factor = additional_params["LR_ANNEALING"]
        start_lr = lr
        lr = lambda prop: (start_lr / lr_reduction_factor) + (
            start_lr - (start_lr / lr_reduction_factor
                        )) * prop  # Anneals linearly from lr to lr/red factor

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    bestrew = 0
    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     scope=scope)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    best_rew_per_step = 0

    run_info = defaultdict(list)
    nupdates = total_timesteps // nbatch
    print("TOT NUM UPDATES", nupdates)
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0, "Have {} total batch size and want {} minibatches, can't split evenly".format(
            nbatch, nminibatches)
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632

        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        eplenmean = safemean([epinfo['l'] for epinfo in epinfos])
        eprewmean = safemean([epinfo['r'] for epinfo in epinfos])
        rew_per_step = eprewmean / eplenmean

        print("Curr learning rate {} \t Curr reward per step {}".format(
            lrnow, rew_per_step))

        if rew_per_step > best_rew_per_step and early_stopping:
            # Avoid updating best model at first iteration because the means might be a bit off because
            # of how the multithreaded batch simulation works
            best_rew_per_step = eprewmean / eplenmean
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            model.save(checkdir + ".temp_best_model")
            print("Saved model as best", best_rew_per_step, "avg rew/step")

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in tqdm.trange(0,
                                         nbatch,
                                         nbatch_train,
                                         desc="{}/{}".format(_, noptepochs)):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))

        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Checks whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))

            eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
            ep_dense_rew_mean = safemean(
                [epinfo['dense_r'] for epinfo in epinfobuf])
            ep_sparse_rew_mean = safemean(
                [epinfo['sparse_r'] for epinfo in epinfobuf])
            eplenmean = safemean([epinfo['l'] for epinfo in epinfobuf])
            run_info['eprewmean'].append(eprewmean)
            run_info['ep_dense_rew_mean'].append(ep_dense_rew_mean)
            run_info['ep_sparse_rew_mean'].append(ep_sparse_rew_mean)
            run_info['eplenmean'].append(eplenmean)
            run_info['explained_variance'].append(float(ev))

            logger.logkv(
                'true_eprew',
                safemean([epinfo['sparse_r'] for epinfo in epinfobuf]))
            logger.logkv('eprewmean', eprewmean)
            logger.logkv('eplenmean', eplenmean)
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))

            time_elapsed = tnow - tfirststart
            logger.logkv('time_elapsed', time_elapsed)

            time_per_update = time_elapsed / update
            time_remaining = (nupdates - update) * time_per_update
            logger.logkv('time_remaining', time_remaining / 60)

            for (lossval, lossname) in zip(lossvals, model.loss_names):
                run_info[lossname].append(lossval)

                logger.logkv(lossname, lossval)

            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

            # Update current logs
            if additional_params["RUN_TYPE"] in ["ppo", "joint_ppo"]:
                from hr_coordination.utils import save_dict_to_file
                save_dict_to_file(run_info,
                                  additional_params["SAVE_DIR"] + "logs")

                # Linear annealing of reward shaping
                if additional_params["REW_SHAPING_HORIZON"] != 0:
                    # Piecewise linear annealing schedule
                    # annealing_thresh: until when we should stop doing 100% reward shaping
                    # annealing_horizon: when we should reach doing 0% reward shaping
                    annealing_horizon = additional_params[
                        "REW_SHAPING_HORIZON"]
                    annealing_thresh = 0

                    def fn(x):
                        if annealing_thresh != 0 and annealing_thresh - (
                                annealing_horizon / annealing_thresh) * x > 1:
                            return 1
                        else:
                            fn = lambda x: -1 * (x - annealing_thresh) * 1 / (
                                annealing_horizon - annealing_thresh) + 1
                            return max(fn(x), 0)

                    curr_timestep = update * nbatch
                    curr_reward_shaping = fn(curr_timestep)
                    env.update_reward_shaping_param(curr_reward_shaping)
                    print("Current reward shaping", curr_reward_shaping)

                # Save/overwrite best model if past a certain threshold
                if ep_sparse_rew_mean > bestrew and ep_sparse_rew_mean > additional_params[
                        "SAVE_BEST_THRESH"]:
                    # Don't save the best model if we're still doing some self-play
                    # and the other agent is supposed to be a BC model
                    if not (additional_params["OTHER_AGENT_TYPE"][:2] == "bc"
                            and additional_params["SELF_PLAY_RND_GOAL"] != 0
                            and env.self_play_randomization > 0):
                        from hr_coordination.ppo.ppo import save_ppo_model
                        print("BEST REW", ep_sparse_rew_mean,
                              "overwriting previous model with", bestrew)
                        save_ppo_model(
                            model,
                            "{}seed{}/best".format(additional_params["SAVE_DIR"],
                                                   additional_params["CURR_SEED"]))
                        bestrew = max(ep_sparse_rew_mean, bestrew)

                if additional_params["SELF_PLAY_RND_GOAL"] != 0:
                    if type(additional_params["SELF_PLAY_RND_GOAL"]
                            ) is not list:
                        # Sigmoid self-play schedule based on current performance (not recommended)
                        curr_reward = ep_sparse_rew_mean

                        rew_target = additional_params["SELF_PLAY_RND_GOAL"]
                        shift = rew_target / 2
                        t = (1 / rew_target) * 10
                        fn = lambda x: -1 * (np.exp(t * (x - shift)) /
                                             (1 + np.exp(t * (x - shift)))) + 1

                        env.self_play_randomization = fn(curr_reward)
                        print("Current self-play randomization",
                              env.self_play_randomization)
                    else:
                        # Piecewise linear self-play schedule

                        # self_play_thresh: when we should stop doing 100% self-play
                        # self_play_timeline: when we should reach doing 0% self-play
                        self_play_thresh, self_play_timeline = additional_params[
                            "SELF_PLAY_RND_GOAL"]

                        def fn(x):
                            if self_play_thresh != 0 and self_play_timeline - (
                                    self_play_timeline /
                                    self_play_thresh) * x > 1:
                                return 1
                            else:
                                fn = lambda x: -1 * (
                                    x - self_play_thresh) * 1 / (
                                        self_play_timeline - self_play_thresh
                                    ) + 1
                                return max(fn(x), 0)

                        curr_timestep = update * nbatch
                        env.self_play_randomization = fn(curr_timestep)
                        print("Current self-play randomization",
                              env.self_play_randomization)

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

        # Visualization of rollouts with actual other agent
        run_type = additional_params["RUN_TYPE"]
        if run_type in ["ppo", "joint_ppo"
                        ] and update % additional_params["VIZ_FREQUENCY"] == 0:
            from hr_coordination.agents.agent import AgentPair
            from hr_coordination.agents.benchmarking import AgentEvaluator
            from hr_coordination.pbt.pbt_utils import setup_mdp_env, get_agent_from_model
            print(additional_params["SAVE_DIR"])

            overcooked_env = setup_mdp_env(display=False, **additional_params)
            agent = get_agent_from_model(
                model,
                additional_params["SIM_THREADS"],
                is_joint_action=(run_type == "joint_ppo"))
            agent.set_mdp(overcooked_env.mdp)

            if run_type == "ppo":
                if additional_params["OTHER_AGENT_TYPE"] == 'sp':
                    agent_pair = AgentPair(agent, agent)
                else:
                    print("PPO agent on index 0:")
                    env.other_agent.set_mdp(overcooked_env.mdp)
                    agent_pair = AgentPair(agent, env.other_agent)
                    trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                        agent_pair, display=True, displayUntil=100)
                    print("tot rew", tot_rewards, "tot rew shaped",
                          tot_shaped_rewards)

                    print("PPO agent on index 1:")
                    agent_pair = AgentPair(env.other_agent, agent)

            else:
                agent_pair = AgentPair(agent)

            trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                agent_pair, display=True, displayUntil=100)
            print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards)
            print(additional_params["SAVE_DIR"])

    if nupdates > 0 and early_stopping:
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        print("Loaded best model", best_rew_per_step)
        model.load(checkdir + ".temp_best_model")
    return model, run_info
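
The LR_ANNEALING branch above builds a schedule that PPO calls with the remaining-progress fraction frac (1.0 at the start of training, 0.0 at the end), so the learning rate decays linearly from lr down to lr / LR_ANNEALING. A minimal sketch with illustrative numbers:

def make_annealed_lr(start_lr=3e-4, lr_reduction_factor=10.0):
    # Linear schedule from start_lr (prop=1.0) down to start_lr / reduction (prop=0.0)
    floor = start_lr / lr_reduction_factor
    return lambda prop: floor + (start_lr - floor) * prop

lr = make_annealed_lr()
print(lr(1.0))  # 0.0003   (start of training)
print(lr(0.5))  # 0.000165 (halfway)
print(lr(0.0))  # 3e-05    (end of training)
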
Example #18
def learn(*, policy, env, raw_env,
          use_2D_env=True,
          use_other_room=False,
          use_rich_reward=False,
          use_multiple_starts=False,
          use_feedback=True,
          use_real_feedback=False,
          only_use_hr_until=1000,
          trans_to_rl_in=1000,
          nsteps=8,
          total_timesteps=1000,
          ppo_lr=2e-4, cliprange=0.2, ent_coef=.1, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          ppo_noptepochs=4, ppo_batch_size=32, ppo_minibatch_size=8, init_rl_importance=0.2,
          feedback_lr=1e-3, min_feedback_buffer_size=32,
          feedback_noptepochs=4, feedback_batch_size=16, feedback_minibatch_size=8,
          feedback_training_prop=0.7,
          feedback_training_new_prop=0.4,
          feedback_use_mixup=False,
          hf_loss_type="CCE", hf_loss_param=None,
          good_feedback_acc=0.7,
          bad_feedback_acc=0.7,
          log_interval=10, save_interval=0, reload_name=None, base_path=None):

    if isinstance(ppo_lr, float):
        ppo_lr = constfn(ppo_lr)
    else:
        assert callable(ppo_lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)

    total_timesteps = int(total_timesteps)
    assert ppo_batch_size % nsteps == 0

    ob_space = env.observation_space
    ac_space = env.action_space

    nenvs = 1
    nbatch = nenvs * nsteps

    if hf_loss_type == 0:
        hf_loss_param = None


    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                               nbatch_act=nenvs, nbatch_train=ppo_minibatch_size, nbatch_feedback=feedback_minibatch_size,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               hf_loss_type=hf_loss_type,
                               hf_loss_param=hf_loss_param)

    if save_interval and logger.get_dir():
        import cloudpickle
        if not base_path:
            base_path = os.path.dirname(os.path.abspath(__file__))
        if not os.path.isdir(osp.join(base_path, "models")):
            os.mkdir(osp.join(base_path, "models"))
        with open(osp.join(base_path, "models", 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))

    if use_real_feedback:
        print("looking for an EEG_Pred stream...", end="", flush=True)
        feedback_LSL_stream = pylsl.StreamInlet(pylsl.resolve_stream('type', 'EEG_Pred')[0])
        print(" done")

    model = make_model()
    if reload_name:
        model.load(reload_name)

    target_position = raw_env.robot.get_target_position()
    if use_2D_env:
        judge_action, *_ = run_dijkstra(raw_env, target_position, use_other_room=use_other_room)
    else:
        judge_action = judge_action_1D(raw_env, target_position)

    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                    judge_action=judge_action,
                    use_rich_reward=use_rich_reward,
                    use_multiple_starts=use_multiple_starts,
                    use_feedback=use_feedback,
                    use_real_feedback=use_real_feedback,
                    only_use_hr_until=only_use_hr_until,
                    trans_to_rl_in=trans_to_rl_in,
                    init_rl_importance=init_rl_importance)

    epinfobuf = deque(maxlen=100)

    nupdates = total_timesteps // nbatch

    state_action_buffer = deque(maxlen=100)
    action_idx_buffer = deque(maxlen=100)

    feedback_buffer_train = {}
    feedback_buffer_train_true = {}
    feedback_buffer_valid = {}
    feedback_bmms = {}
    for a in range(ac_space.n):
        feedback_buffer_train[a], feedback_buffer_train_true[a], feedback_buffer_valid[a] = [], [], []
        feedback_bmms[a] = 0
    performance = {"feedback": [], "sparse_reward": [], "rich_reward": [],
                   "train_acc": [], "train_true_acc": [], "valid_acc": []}
    epi_test_num = [0 for _ in range(ac_space.n)]

    ppo_obs, ppo_rewards, ppo_masks, ppo_actions, ppo_values, ppo_neglogpacs = [], [], [], [], [], []
    for update in range(1, nupdates + 1):
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        ppo_lrnow = ppo_lr(frac)
        cliprangenow = cliprange(frac)

        obs, rewards, masks, actions, values, neglogpacs, cors, sparse_rew, rich_rew, _, action_idxs, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        performance["sparse_reward"].extend(sparse_rew)
        performance["rich_reward"].extend(rich_rew)

        mblossvals = []

        state_action_buffer.extend([[s, a] for s, a in zip(obs, actions)])
        action_idx_buffer.extend(action_idxs)

        if use_feedback:
            if use_real_feedback:
                action_idxs, feedbacks, correct_feedbacks = get_feedback_from_LSL(feedback_LSL_stream)
                print("Received feedback from LSL", feedbacks)
            else:
                action_idxs, feedbacks, correct_feedbacks = \
                    get_simulated_feedback(cors if use_2D_env else obs, actions, action_idxs, judge_action,
                                           good_feedback_acc, bad_feedback_acc)
            performance["feedback"].extend(correct_feedbacks)

            # add feedbacks into feedback replay buffer
            if len(feedbacks):
                for a_idx, fb, cfb in zip(action_idxs, feedbacks, correct_feedbacks):
                    s, a = state_action_buffer[action_idx_buffer.index(a_idx)]
                    epi_test_num[a] += 1 - feedback_training_prop
                    # s, fb, cfb = np.ones(13), 1, 1
                    if epi_test_num[a] > 1:
                        feedback_buffer_valid[a].append([s, cfb])
                        epi_test_num[a] -= 1
                    else:
                        feedback_buffer_train[a].append([s, fb])
                        feedback_buffer_train_true[a].append([s, cfb])


        # train PPO
        if runner.num_step >= only_use_hr_until:
            ppo_obs.extend(obs)
            ppo_rewards.extend(rewards)
            ppo_masks.extend(masks)
            ppo_actions.extend(actions)
            ppo_values.extend(values)
            ppo_neglogpacs.extend(neglogpacs)

            if len(ppo_obs) == ppo_batch_size:
                ppo_obs = np.asarray(ppo_obs)
                ppo_rewards = np.asarray(ppo_rewards)
                ppo_masks = np.asarray(ppo_masks)
                ppo_actions = np.asarray(ppo_actions)
                ppo_values = np.asarray(ppo_values)
                ppo_neglogpacs = np.asarray(ppo_neglogpacs)
                ppo_returns = runner.calculate_returns(ppo_rewards, ppo_masks, ppo_values)
                inds = np.arange(ppo_batch_size)
                for _ in range(ppo_noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, ppo_batch_size, ppo_minibatch_size):
                        end = start + ppo_minibatch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds]
                                  for arr in (ppo_obs, ppo_returns, ppo_masks, ppo_actions, ppo_values, ppo_neglogpacs))
                        mblossvals.append(model.train(ppo_lrnow, cliprangenow, *slices))
                ppo_obs, ppo_rewards, ppo_masks, ppo_actions, ppo_values, ppo_neglogpacs = [], [], [], [], [], []

        # train feedback regressor
        if use_feedback and runner.num_step <= only_use_hr_until:
            all_train_acc = []
            all_train_true_acc = []
            all_valid_acc = []

            if not all([len(feedback_buffer) >= min_feedback_buffer_size
                        for feedback_buffer in feedback_buffer_train.values()]):
                performance["train_acc"].append(0.)
                performance["train_true_acc"].append(0.)
                performance["valid_acc"].append(0.)
                continue
            for a in range(ac_space.n):
                feedback_buffer = feedback_buffer_train[a]
                feedback_buffer_t = feedback_buffer_train_true[a]
                feedback_buffer_v = feedback_buffer_valid[a]
                bmm_model = feedback_bmms[a]
                for i in range(feedback_noptepochs):
                    # print(len(feedback_buffer))
                    # print(feedback_buffer[:3])
                    if i < feedback_noptepochs * feedback_training_new_prop:
                        inds = np.arange(len(feedback_buffer) - feedback_batch_size, len(feedback_buffer))
                    else:
                        inds = np.random.choice(len(feedback_buffer), feedback_batch_size, replace=False)

                    np.random.shuffle(inds)
                    for start in range(0, feedback_batch_size, feedback_minibatch_size):
                        end = start + feedback_minibatch_size
                        obs       = np.asarray([feedback_buffer[idx][0] for idx in inds[start:end]])
                        feedbacks = np.asarray([feedback_buffer[idx][1] for idx in inds[start:end]])
                        actions   = np.asarray([a] * feedback_minibatch_size)
                        if "bmm" in hf_loss_type:
                            prop1, prop2 = hf_loss_param
                            use_bootstrap = update * nsteps > only_use_hr_until * prop1
                            tmp = 1 - (1 - 0.001) * (update * nsteps - use_bootstrap) / (only_use_hr_until * prop2 - use_bootstrap)
                            tmp = min(tmp, 0.001)
                            pred, loss, _ = \
                                model.feedback_train_bootstrap(feedback_lr, obs, actions, feedbacks, bmm_model,
                                                               use_bootstrap, tmp)
                        else:
                            pred, loss, _ = model.feedback_train(feedback_lr, obs, actions, feedbacks)
                        # print('action: {} feedback: {} pred: {} loss: {}'.format(actions, feedbacks, pred, loss))

                evaluate_start = time.time()
                obs_train       = np.array([ele[0] for ele in feedback_buffer])
                feedbacks_train = np.array([ele[1] for ele in feedback_buffer])
                actions_train   = np.array([a] * len(feedback_buffer))

                obs_valid       = np.array([ele[0] for ele in feedback_buffer_v])
                feedbacks_valid = np.array([ele[1] for ele in feedback_buffer_v])
                actions_valid   = np.array([a] * len(feedback_buffer_v))

                obs_train_true       = np.array([ele[0] for ele in feedback_buffer_t])
                feedbacks_train_true = np.array([ele[1] for ele in feedback_buffer_t])
                actions_train_true   = np.array([a] * len(feedback_buffer_t))

                train_acc, train_loss = model.feedback_evaluate(obs_train, actions_train, feedbacks_train)
                valid_acc, _ = model.feedback_evaluate(obs_valid, actions_valid, feedbacks_valid)
                train_true_acc, _ = model.feedback_evaluate(obs_train_true, actions_train_true, feedbacks_train_true)

                feedback_bmms[a] = train_bmm_model(train_loss,
                                                   a, update, base_path, feedbacks_train == feedbacks_train_true, good_feedback_acc)

                all_train_acc = np.concatenate([all_train_acc, train_acc])
                all_valid_acc = np.concatenate([all_valid_acc, valid_acc])
                all_train_true_acc = np.concatenate([all_train_true_acc, train_true_acc])
                # print("evaluation takes ", time.time() - evaluate_start)

            all_train_acc, all_train_true_acc, all_valid_acc = \
                np.mean(all_train_acc), np.mean(all_train_true_acc), np.mean(all_valid_acc)
            print("train acc {:>4.2f}; train true acc {:>4.2f}; valid acc {:>4.2f}".format(
                all_train_acc, all_train_true_acc, all_valid_acc))
            performance["train_acc"].append(all_train_acc if math.isfinite(all_train_acc) else 0.)
            performance["train_true_acc"].append(all_train_true_acc if math.isfinite(all_train_true_acc) else 0.)
            performance["valid_acc"].append(all_valid_acc if math.isfinite(all_valid_acc) else 0.)

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            # logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            # logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            # logger.logkv('time_elapsed', tnow - tfirststart)
            # for (lossval, lossname) in zip(lossvals, model.loss_names):
            #     logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            model_dir = osp.join(base_path, "models")
            os.makedirs(model_dir, exist_ok=True)
            savepath = osp.join(model_dir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            print("Saved model successfully.")
            if use_feedback:
                performance_fname = os.path.join(base_path, "performance.p")
                with open(performance_fname, "wb") as f:
                    pickle.dump(performance, f)
    env.close()

    return performance
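
The feedback buffers above are split into training and validation sets with a fractional counter: each incoming feedback adds (1 - feedback_training_prop) to a per-action counter, and whenever the counter exceeds 1 the sample is routed to the validation buffer instead of the training buffer. A small self-contained sketch of that routing (names and values are illustrative):

def split_feedback(samples, feedback_training_prop=0.7):
    train, valid, counter = [], [], 0.0
    for s in samples:
        counter += 1 - feedback_training_prop
        if counter > 1:
            valid.append(s)
            counter -= 1
        else:
            train.append(s)
    return train, valid

train, valid = split_feedback(list(range(10)))
print(len(train), len(valid))  # 8 2 on this short run; converges to roughly a 7/3 split
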
Example #19
def rollout(*,
            network,
            env,
            total_timesteps,
            eval_env=None,
            seed=None,
            nsteps=2048,
            ent_coef=0.0,
            lr=3e-4,
            vf_coef=0.5,
            max_grad_norm=0.5,
            gamma=0.99,
            lam=0.95,
            log_interval=10,
            nminibatches=4,
            noptepochs=4,
            cliprange=0.2,
            save_interval=0,
            load_path=None,
            model_fn=None,
            update_fn=None,
            init_fn=None,
            mpi_rank_weight=1,
            comm=None,
            num_steps,
            num_envs,
            env_name,
            num_levels,
            start_level,
            distribution_mode,
            **network_kwargs):
    '''
    Roll out episodes with a PPO policy (https://arxiv.org/abs/1707.06347); in this variant the training loop is commented out.

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    # if isinstance(lr, float): lr = constfn(lr)
    # else: assert callable(lr)
    # if isinstance(cliprange, float): cliprange = constfn(cliprange)
    # else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    # runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    # if eval_env is not None:
    #     eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)

    # epinfobuf = deque(maxlen=100)
    # if eval_env is not None:
    #     eval_epinfobuf = deque(maxlen=100)

    # if init_fn is not None:
    #     init_fn()

    # # Start total timer
    # tfirststart = time.perf_counter()

    # nupdates = total_timesteps//nbatch
    # for update in range(1, nupdates+1):
    #     assert nbatch % nminibatches == 0
    #     # Start timer
    #     tstart = time.perf_counter()
    #     frac = 1.0 - (update - 1.0) / nupdates
    #     # Calculate the learning rate
    #     lrnow = lr(frac)
    #     # Calculate the cliprange
    #     cliprangenow = cliprange(frac)

    #     if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...')

    #     # Get minibatch
    #     obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
    #     if eval_env is not None:
    #         eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632

    #     if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

    #     epinfobuf.extend(epinfos)
    #     if eval_env is not None:
    #         eval_epinfobuf.extend(eval_epinfos)

    #     # Here what we're going to do is for each minibatch calculate the loss and append it.
    #     mblossvals = []
    #     if states is None: # nonrecurrent version
    #         # Index of each element of batch_size
    #         # Create the indices array
    #         inds = np.arange(nbatch)
    #         for _ in range(noptepochs):
    #             # Randomize the indexes
    #             np.random.shuffle(inds)
    #             # 0 to batch_size with batch_train_size step
    #             for start in range(0, nbatch, nbatch_train):
    #                 end = start + nbatch_train
    #                 mbinds = inds[start:end]
    #                 slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
    #                 mblossvals.append(model.train(lrnow, cliprangenow, *slices))
    #     else: # recurrent version
    #         assert nenvs % nminibatches == 0
    #         envsperbatch = nenvs // nminibatches
    #         envinds = np.arange(nenvs)
    #         flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
    #         for _ in range(noptepochs):
    #             np.random.shuffle(envinds)
    #             for start in range(0, nenvs, envsperbatch):
    #                 end = start + envsperbatch
    #                 mbenvinds = envinds[start:end]
    #                 mbflatinds = flatinds[mbenvinds].ravel()
    #                 slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
    #                 mbstates = states[mbenvinds]
    #                 mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

    #     # Feedforward --> get losses --> update
    #     lossvals = np.mean(mblossvals, axis=0)
    #     # End timer
    # tnow = time.perf_counter()
    # # Calculate the fps (frame per second)
    # fps = int(nbatch / (tnow - tstart))

    # `update` is only defined inside the training loop above, which is
    # commented out, so this hook is disabled here as well.
    # if update_fn is not None:
    #     update_fn(update)

    rewards = []
    for i in range(num_steps):
        env = ProcgenEnv(num_envs=num_envs,
                         env_name=env_name,
                         num_levels=num_levels,
                         start_level=start_level,
                         distribution_mode=distribution_mode)
        env = VecExtractDictObs(env, "rgb")

        env = VecMonitor(
            venv=env,
            filename=None,
            keep_buf=100,
        )

        env = VecNormalize(venv=env, ob=False)
        obs = env.reset()
        done = False
        reward = 0.0
        timesteps = 0
        while not done:
            # action = env.action_space.sample()
            # print("example of an action: ", action)
            # print("\n\n")
            # print("my action: ")
            actions, _, _, _ = model.step(obs)
            # print(actions.shape)
            # print("obs shape: ", obs.shape)
            # print(actions[0])
            obs, r, done, _ = env.step(actions[0])
            done = done.all()
            reward += r
            timesteps += 1
        rewards.append(reward)

        #Logging reward, timesteps, and numsteps
        logger.logkv("numsteps", i)
        logger.logkv("timesteps", timesteps)
        logger.logkv("episode_reward_mean", safemean(reward))
        logger.dumpkvs()

    # if update % log_interval == 0 or update == 1:
    #     # Calculates if value function is a good predicator of the returns (ev > 1)
    #     # or if it's just worse than predicting nothing (ev =< 0)
    #     ev = explained_variance(values, returns)
    #     logger.logkv("misc/serial_timesteps", update*nsteps)
    #     logger.logkv("misc/nupdates", update)
    #     logger.logkv("misc/total_timesteps", update*nbatch)
    #     logger.logkv("fps", fps)
    #     logger.logkv("misc/explained_variance", float(ev))
    #     logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
    #     logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
    #     if eval_env is not None:
    #         logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
    #         logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
    #     logger.logkv('misc/time_elapsed', tnow - tfirststart)
    #     for (lossval, lossname) in zip(lossvals, model.loss_names):
    #         logger.logkv('loss/' + lossname, lossval)

    #     logger.dumpkvs()
    # if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root:
    #     checkdir = osp.join(logger.get_dir(), 'checkpoints')
    #     os.makedirs(checkdir, exist_ok=True)
    #     savepath = osp.join(checkdir, '%.5i'%update)
    #     print('Saving to', savepath)
    #     model.save(savepath)

    return model
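
In the rollout loop above, r returned by the vectorized environment is a per-environment vector, so the accumulated reward is also per-env and safemean reduces it to a single scalar for logging. A small sketch of that bookkeeping, assuming safemean behaves like the nan-safe mean helper in baselines:

import numpy as np

def safemean(xs):
    # nan-safe mean, as in baselines: returns nan instead of failing on empty input
    return np.nan if len(xs) == 0 else np.mean(xs)

num_envs = 4
reward = np.zeros(num_envs)
for r in ([1.0, 0.0, 2.0, 1.0], [0.0, 1.0, 0.0, 1.0]):  # two fake VecEnv steps
    reward += np.asarray(r)
print(safemean(reward))  # 1.5 -- mean cumulative reward across the 4 envs
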
Example #20
def learn(policy,
          env,
          ranking_buffer,
          args,
          ent_coef=0.01,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None):

    seed = args.seed
    batch_size = args.batch_size
    nsteps = args.nsteps
    total_timesteps = int(args.num_timesteps * 1.1)
    lr = args.lr

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    runner = Runner(env=env,
                    model=model,
                    ranking_buffer=ranking_buffer,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    batch_size=batch_size)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    sl_next = 1
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        # Decide whether, and how frequently, to do SL
        if args.disable_rapid:
            do_sl = False
            do_buffer = False
        else:
            if update * nbatch < args.sl_until and update >= sl_next:
                do_sl = True
                do_buffer = True
                next_gap = int(1 / (1.0 - update * nbatch / args.sl_until))
                sl_next += next_gap
            elif update * nbatch < args.sl_until:
                do_sl = False
                do_buffer = True
            else:
                do_sl = False
                do_buffer = False

        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            do_buffer, do_sl, args.sl_num, lrnow)
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(
                        model.train(args.train_rl, lrnow, cliprangenow,
                                    *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('episodes', runner.episodes_count)
            logger.record_tabular("rapid_loss", float(runner.rapid_loss))
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
    return model
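
The supervised-learning (SL) schedule above runs an SL phase every next_gap updates, where the gap is derived from how close the current timestep is to args.sl_until; imitation updates therefore become progressively rarer and stop entirely once sl_until is passed. A sketch of the gap rule with illustrative numbers:

def sl_gap(cur_timesteps, sl_until):
    # Same formula as next_gap above: the closer we are to sl_until,
    # the more PPO updates pass between consecutive SL phases.
    return int(1 / (1.0 - cur_timesteps / sl_until))

sl_until = 100_000
for t in (25_000, 50_000, 75_000, 87_500, 93_750):
    print(t, sl_gap(t, sl_until))  # gaps: 1, 2, 4, 8, 16
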
Example #21
def learn(*, agent_str, use_netrand, network, sess, env, nsteps, total_timesteps, ent_coef, lr, arch='impala', use_batch_norm=True, dropout=0, 
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, save_path=None, load_path=None, **network_kwargs):

    aug_func = AUG_FUNCS[agent_str]

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    
    nbatch_train = nbatch // nminibatches
    if use_netrand:
        policy = RandomCnnPolicy
        Model = RandomModel
    else:
        policy = CnnPolicy
        Model = BaseModel
    model = Model(policy=policy, sess=sess, ob_space=ob_space, ac_space=ac_space, 
        nbatch_act=nenvs, nbatch_train=nbatch_train,
        nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, arch=arch, use_batch_norm=use_batch_norm, dropout=dropout)

    if load_path is not None:
        model.load(load_path)
        logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, aug_func=aug_func)
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps//nbatch
    logger.info("Running {} updates, each needs {} batches".format(nupdates, nbatch))
    mean_rewards = []
    datapoints = []

    run_t_total = 0
    train_t_total = 0

    if use_netrand:
        init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name])

    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        run_tstart = time.time()
        if use_netrand:
            sess.run(init_rand)
            clean_flag = np.random.rand(1)[0] < use_netrand
        else:
            clean_flag = 0
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(clean_flag)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        run_elapsed = time.time() - run_tstart
        run_t_total += run_elapsed

        mblossvals = []

        logger.info('update: {} updating parameters...'.format(update))
        train_tstart = time.time()
        
        if states is None:
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    if clean_flag:
                        mblossvals.append(model.clean_train(lrnow, cliprangenow, *slices))
                    else:
                        mblossvals.append(model.train(lrnow, cliprangenow, *slices))
                        
        else:
            assert nenvs % nminibatches == 0
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # update the dropout mask
        sess.run([model.train_model.dropout_assign_ops])

        train_elapsed = time.time() - train_tstart
        train_t_total += train_elapsed

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            step = update*nbatch

            rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
            rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
            ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
            ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

            mean_rewards.append(rew_mean_10)
            datapoints.append([step, rew_mean_10])
            logger.logkv('eprew10', rew_mean_10)
            logger.logkv('eprew100', rew_mean_100)
            logger.logkv('eplenmean10', ep_len_mean_10)
            logger.logkv('eplenmean100', ep_len_mean_100)
            logger.logkv('nupdate', update)

            logger.logkv('misc/total_time_elapsed', tnow - tfirststart)
            logger.logkv('misc/run_t_total', run_t_total)
            logger.logkv('misc/train_t_total', train_t_total)
            logger.logkv("misc/total_timesteps", update*nbatch)
            logger.logkv("misc/serial_timesteps", update*nsteps)
            logger.logkv("fps", fps)

            if len(mblossvals):
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    logger.logkv('loss/' + lossname, lossval)
            logger.dumpkvs()

    if save_path:
        model.save(save_path)

    env.close()
    return model
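The learn() variant above (and the ones that follow) all anneal the learning rate and clip range the same way: lr and cliprange are called as schedules on frac = 1.0 - (update - 1.0) / nupdates, and constfn wraps plain floats so constants and schedules share one call signature. A minimal standalone sketch of that convention, with an illustrative linear_schedule helper that is not part of the examples here:

def constfn(val):
    # Wrap a constant so it can be called like a schedule: f(frac) -> val.
    def f(_):
        return val
    return f

def linear_schedule(initial):
    # Anneal linearly from `initial` at the start of training (frac = 1) to 0 at the end (frac = 0).
    def f(frac):
        return initial * frac
    return f

nupdates = 100
lr = linear_schedule(3e-4)
cliprange = constfn(0.2)
for update in range(1, nupdates + 1):
    frac = 1.0 - (update - 1.0) / nupdates
    lrnow, cliprangenow = lr(frac), cliprange(frac)  # these would feed into model.train(...)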
Example #22
def learn(network,
          FLAGS,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=10,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          episode_window_size=20,
          stop=True,
          scenario='gfootball.scenarios.1_vs_1_easy',
          curriculum=np.linspace(0, 0.9, 10),
          a=0,
          b=0,
          num_timesteps=200000,
          eval_period=20,
          eval_episodes=1,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
    Parameters:
    ----------
    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                     specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                     tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                     neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                    See common/models.py/lstm for more details on using recurrent nets in policies
    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.
    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)
    ent_coef: float                   policy entropy coefficient in the optimization objective
    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.
    vf_coef: float                    value function loss coefficient in the optimization objective
    max_grad_norm: float or None      gradient norm clipping coefficient
    gamma: float                      discounting factor
    lam: float                        advantage estimation discounting factor (lambda in the paper)
    log_interval: int                 number of timesteps between logging events
    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.
    noptepochs: int                   number of training epochs per update
    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training
    save_interval: int                number of timesteps between saving events
    load_path: str                    path to load the model from
    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)

    basic_builder = importlib.import_module(scenario, package=None)

    def build_builder_with_difficulty(difficulty):
        def builder_with_difficulty(builder):
            basic_builder.build_scenario(builder)
            builder.config().right_team_difficulty = difficulty
            builder.config().left_team_difficulty = difficulty

        return builder_with_difficulty

    def create_single_football_env(iprocess):
        """Creates gfootball environment."""
        env = football_env.create_environment(
            env_name=build_builder_with_difficulty(0),
            stacked=('stacked' in FLAGS.state),
            rewards=FLAGS.reward_experiment,
            logdir=logger.get_dir(),
            write_goal_dumps=FLAGS.dump_scores and (iprocess == 0),
            write_full_episode_dumps=FLAGS.dump_full_episodes
            and (iprocess == 0),
            render=FLAGS.render and (iprocess == 0),
            dump_frequency=50 if FLAGS.render and iprocess == 0 else 0)
        env = monitor.Monitor(
            env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(iprocess)))
        return env

    env = SubprocVecEnv([(lambda _i=i: create_single_football_env(_i))
                         for i in range(FLAGS.num_envs)],
                        context=None)

    policy = build_policy(env, network, **network_kwargs)

    average_window_size = episode_window_size * 16

    # Get the nb of env
    nenvs = FLAGS.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Build a timestamped file name prefix for the pickled training data and model checkpoints
    pickle_str = 'curriculum_a-%db-%d_ppo_impala_chkpt' % (a, b) + '-'.join(
        str(datetime.datetime.now()).replace(':', ' ').split(' '))

    eval_pickle_str = pickle_str + '_eval'

    # open pickle file to append relevant data in binary
    pickle_dir = '/content/cs285_f2020_proj/football/pickled_data/'
    model_dir = '/content/cs285_f2020_proj/football/models/'

    # create dir for pickling & model save
    if not os.path.exists(pickle_dir):
        os.makedirs(pickle_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    def make_file(file_path):
        if not os.path.exists(file_path):
            with open(file_path, 'w+'):
                print('made path', file_path)

    make_file(pickle_dir + pickle_str)
    make_file(pickle_dir + eval_pickle_str)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    def create_single_football_env(iprocess, difficulty):
        """Creates gfootball environment."""
        env = football_env.create_environment(
            env_name=build_builder_with_difficulty(difficulty),
            stacked=('stacked' in FLAGS.state),
            rewards=FLAGS.reward_experiment,
            logdir=logger.get_dir(),
            write_goal_dumps=FLAGS.dump_scores and (iprocess == 0),
            write_full_episode_dumps=FLAGS.dump_full_episodes
            and (iprocess == 0),
            render=FLAGS.render and (iprocess == 0),
            dump_frequency=50 if FLAGS.render and iprocess == 0 else 0)
        env = monitor.Monitor(
            env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(iprocess)))
        return env

    def make_runner(difficulty):
        vec_env = SubprocVecEnv(
            [(lambda _i=i: create_single_football_env(_i, difficulty))
             for i in range(FLAGS.num_envs)],
            context=None)
        print('vec env obs space', vec_env.observation_space)
        return vec_env, Runner(env=vec_env,
                               model=model,
                               nsteps=nsteps,
                               gamma=gamma,
                               lam=lam)

    # get next difficulty according to distribution outlined in probabilities.
    def get_next_difficulty():
        draw = np.random.choice(range(10), 1, p=curriculum_probabilities)
        return draw[0]

    # Instantiate the runner object
    # Curriculum difficulties start off as random.
    curriculum_probabilities = [0.1] * 10

    difficulty_idx = get_next_difficulty()
    env, runner = make_runner(curriculum[difficulty_idx])

    def make_eval_runner(difficulty):
        vec_env = SubprocVecEnv(
            [(lambda _i=i: create_single_football_env(_i, difficulty))
             for i in range(FLAGS.num_envs, 2 * FLAGS.num_envs)],
            context=None)
        print('vec env obs space', vec_env.observation_space)
        return vec_env, Runner(env=vec_env,
                               model=model,
                               nsteps=nsteps,
                               gamma=gamma,
                               lam=lam)

    policy = build_policy(env, network, **network_kwargs)

    eprews = []
    rews_by_difficulty = [[] for i in range(10)]

    # for logging TEMP
    ep_vars = [0] * 10
    lmeans = []
    lvariances = []

    rdi = 20  # reward difference interval, in episodes
    smart_mean = lambda l: np.mean(l) if l else 0
    smart_var = lambda l: np.var(l) if l else 0

    def update_curriculum_probabilities():
        rdi_rew_means = np.array(
            [smart_mean(diffrew[-rdi:]) for diffrew in rews_by_difficulty])
        rdi_rew_vars = np.array(
            [smart_var(diffrew[-rdi:]) for diffrew in rews_by_difficulty])
        print('means', rdi_rew_means)
        print('variances', rdi_rew_vars)
        lmeans.extend(rdi_rew_means)
        lvariances.extend(rdi_rew_vars)
        e_diff_rews = np.exp(a * rdi_rew_means + b * rdi_rew_vars)
        return rdi_rew_vars, e_diff_rews / np.sum(e_diff_rews)

    # eval_rews[i] will be all the rewards from evaluation i
    # eval_rews[i][j] will be rewards from evaluation i at difficulty j ~ 2:20 = (0.05, 0.95)
    eval_rews = []

    epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    # nupdates = total_timesteps//nbatch
    update = 0
    while update * nsteps < num_timesteps:
        update += 1
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        # frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(0)  # Constant LR, cliprange
        # Calculate the cliprange
        cliprangenow = cliprange(0)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Done.')

        rewards_this_episode = [i['r'] for i in epinfos]
        lengths_this_episode = [i['l'] for i in epinfos]

        print('episode rewards ep#', update, rewards_this_episode)
        eprews.extend(rewards_this_episode)
        epinfobuf.extend(epinfos)

        # for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # sum of last average_window_size rewards
        last_aws_rewards_sum = sum(eprews[-average_window_size:])
        print('LAST %d ep mean reward' % episode_window_size,
              last_aws_rewards_sum / (average_window_size + 0.0))

        rews_by_difficulty[difficulty_idx].append(np.sum(rewards_this_episode))

        # for logging TEMP
        print('mean of means', np.mean(lmeans), 'var of means', np.var(lmeans))
        print('mean of vars', np.mean(lvariances), 'var of vars',
              np.var(lvariances))

        # pickling
        pickle_data = {
            'episode': update,
            'timesteps': update * nsteps,
            'episode_rewards': rewards_this_episode,
            'episode_window_size': episode_window_size,
            # 'last_window_size_rewards' : eprews[-average_window_size:],
            'difficulty': curriculum[difficulty_idx],
            'len_rewards_array': len(eprews),
            'episode_lengths': lengths_this_episode,
            'eval_period': eval_period,
            'probabilities': curriculum_probabilities,
            'running_ws_variance': ep_vars,
            'a': a,
            'b': b,
        }

        def dict_print(d):
            for k in d:
                print(k, d[k])

        dict_print(pickle_data)
        with open(pickle_dir + pickle_str, 'ab') as pickle_file:
            pickle.dump(pickle_data, pickle_file)

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        # every eval period run for eval_nsteps on every difficulty
        if update % eval_period == 1:
            # rews[i] = sum of rewards from eval_nsteps for difficulty index i
            eval_rews_period = []  # 2D array
            eval_rews_period_sum = []  # 1D array
            for difficulty_eval in curriculum:
                eval_env, eval_runner = make_eval_runner(difficulty_eval)
                eval_rewards_for_difficulty = []
                for k in range(eval_episodes):
                    # run nsteps for the number of eval episodes (nsteps * episodes)
                    eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
                    )  #pylint: disable=E0632
                    # append the array of all the rewards gotten for this difficulty in the episode.
                    eval_rewards_for_difficulty.extend(
                        [i['r'] for i in eval_epinfos])
                eval_rews_period.append(eval_rewards_for_difficulty)
                eval_rews_period_sum.append(sum(eval_rewards_for_difficulty))
                print("rews eval timstep", update * nsteps, "difficulty",
                      difficulty_eval, eval_rewards_for_difficulty, "sum",
                      eval_rews_period_sum[-1])

            eval_rews.append(eval_rews_period)
            eval_pickle_data = [
                update * nsteps,  # timesteps for trainer
                eval_rews_period,  # 2D array which contains all rewards gotten for all difficulties this eval period.
                eval_rews_period_sum
            ]
            with open(pickle_dir + eval_pickle_str, 'ab') as eval_pickle_file:
                pickle.dump(eval_pickle_data, eval_pickle_file)
            print('eval pickle dumped, u#', update)

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or whether it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and update % save_interval == 1:
            savepath = osp.join(model_dir, pickle_str)
            print('Saving to', savepath)
            model.save(savepath)

        ep_vars, curriculum_probabilities = update_curriculum_probabilities()
        print('new probability distr:', curriculum_probabilities)
        difficulty_idx = get_next_difficulty()
        print("NEXT DIFFICULTY:", curriculum[difficulty_idx])
        env, runner = make_runner(curriculum[difficulty_idx])
    return model
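Example #22's curriculum update is a softmax over a * mean + b * variance of each difficulty's recent rewards (see update_curriculum_probabilities above). A small standalone sketch of that re-weighting with hypothetical inputs; unlike the example, it subtracts the max logit before exponentiating, which is an optional numerical-stability tweak:

import numpy as np

def curriculum_probs(recent_rewards, a=1.0, b=0.0):
    # recent_rewards: one list of recent episode rewards per difficulty level.
    means = np.array([np.mean(r) if len(r) else 0.0 for r in recent_rewards])
    variances = np.array([np.var(r) if len(r) else 0.0 for r in recent_rewards])
    logits = a * means + b * variances
    e = np.exp(logits - logits.max())
    return e / e.sum()

# Hypothetical usage: three difficulties; with a > 0, higher recent reward -> higher sampling probability.
probs = curriculum_probs([[0.1, 0.2], [0.8, 0.9], []], a=2.0)
difficulty_idx = np.random.choice(len(probs), p=probs)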
Example #23
def learn(*,
          network,
          sess,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          save_path=None,
          load_path=None,
          **network_kwargs):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_size = comm.Get_size()
    #sess = tf.get_default_session()
    # tb_writer = TB_Writer(sess)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps

    nbatch_train = nbatch // nminibatches
    policy = CrossCnnPolicy
    model = Model(policy=policy,
                  sess=sess,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    # utils.load_all_params(sess)
    if load_path is not None:
        model.load(load_path)
        logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    logger.info("Initilizing runner")
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    tfirststart = time.time()
    active_ep_buf = epinfobuf100

    nupdates = total_timesteps // nbatch
    logger.info("Running {} updates, each needs {} batches".format(
        nupdates, nbatch))
    mean_rewards = []
    datapoints = []

    run_t_total = 0
    train_t_total = 0

    can_save = True
    checkpoints = list(range(0, 2049, 10))
    saved_key_checkpoints = [False] * len(checkpoints)
    #init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name])

    # if Config.SYNC_FROM_ROOT and rank != 0:
    #     can_save = False

    # def save_model(base_name=None):
    #     base_dict = {'datapoints': datapoints}
    #     utils.save_params_in_scopes(
    #         sess, ['model'], Config.get_save_file(base_name=base_name), base_dict)

    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        #logger.info('collecting rollouts...')
        run_tstart = time.time()

        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        run_elapsed = time.time() - run_tstart
        run_t_total += run_elapsed
        #logger.info('rollouts complete')

        mblossvals = []

        logger.info('update: {} updating parameters...'.format(update))
        train_tstart = time.time()

        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))

        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # update the dropout mask
        sess.run([model.train_model.dropout_assign_ops])

        train_elapsed = time.time() - train_tstart
        train_t_total += train_elapsed
        #logger.info('update complete')

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            step = update * nbatch
            #rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step)

            rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
            rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
            ep_len_mean_10 = np.nanmean(
                [epinfo['l'] for epinfo in epinfobuf10])
            ep_len_mean_100 = np.nanmean(
                [epinfo['l'] for epinfo in epinfobuf100])

            logger.info('\n----', update)

            mean_rewards.append(rew_mean_10)
            datapoints.append([step, rew_mean_10])
            logger.logkv('eprew10', rew_mean_10)
            logger.logkv('eprew100', rew_mean_100)
            logger.logkv('eplenmean10', ep_len_mean_10)
            logger.logkv('eplenmean100', ep_len_mean_100)
            logger.logkv('nupdate', update)

            #logger.info('time_elapsed', tnow - tfirststart, run_t_total, train_t_total)
            logger.logkv('misc/total_time_elapsed', tnow - tfirststart)
            logger.logkv('misc/run_t_total', run_t_total)
            logger.logkv('misc/train_t_total', train_t_total)

            #logger.info('timesteps', update*nsteps, total_timesteps)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("misc/serial_timesteps", update * nsteps)

            #logger.info('fps', fps)
            logger.logkv("fps", fps)

            if len(mblossvals):
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    logger.info(lossname, lossval)
                    #tb_writer.log_scalar(lossval, lossname)
                    logger.logkv('loss/' + lossname, lossval)
            logger.info('----\n')
            logger.dumpkvs()

        #if can_save:
        if 0:  ## not doing checkpoint saving yet
            if save_interval and (update % save_interval == 0):
                save_model()

            for j, checkpoint in enumerate(checkpoints):
                if (not saved_key_checkpoints[j]) and (step >=
                                                       (checkpoint * 1e6)):
                    saved_key_checkpoints[j] = True
                    save_model(str(checkpoint) + 'M')

    # save_model()
    if save_path:
        model.save(save_path)

    env.close()
    return model
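Every non-recurrent branch in these examples uses the same minibatching pattern: shuffle one flat index array of length nbatch per epoch and slice every rollout tensor with the same minibatch indices so they stay aligned. A stripped-down sketch with dummy arrays and the model.train call omitted:

import numpy as np

nbatch, nbatch_train, noptepochs = 2048, 512, 4
obs = np.zeros((nbatch, 84, 84, 4), dtype=np.uint8)
returns = np.zeros(nbatch, dtype=np.float32)
actions = np.zeros(nbatch, dtype=np.int64)

inds = np.arange(nbatch)
for _ in range(noptepochs):
    np.random.shuffle(inds)
    for start in range(0, nbatch, nbatch_train):
        mbinds = inds[start:start + nbatch_train]
        # The same permuted slice indexes every array, keeping (obs, returns, actions, ...) aligned.
        mb_obs, mb_returns, mb_actions = obs[mbinds], returns[mbinds], actions[mbinds]
        # mblossvals.append(model.train(lrnow, cliprangenow, mb_obs, mb_returns, ...)) would go here.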
Example #24
    def update(self):
        if self.update_index > self.nupdates:
            return False
        assert self.nbatch % self.nminibatches == 0
        self.nbatch_train = self.nbatch // self.nminibatches
        tstart = time.time()
        frac = 1.0 - (self.update_index - 1.0) / self.nupdates
        lrnow = self.lr(frac)
        cliprangenow = self.cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.runner.run(
        )  #pylint: disable=E0632
        self.epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(self.nbatch)
            for _ in range(self.noptepochs):
                np.random.shuffle(inds)
                for start in range(0, self.nbatch, self.nbatch_train):
                    end = start + self.nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(
                        self.model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert self.nenvs % self.nminibatches == 0
            envsperbatch = self.nenvs // self.nminibatches
            envinds = np.arange(self.nenvs)
            flatinds = np.arange(self.nenvs * self.nsteps).reshape(
                self.nenvs, self.nsteps)
            envsperbatch = self.nbatch_train // self.nsteps
            for _ in range(self.noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, self.nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        self.model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(self.nbatch / (tnow - tstart))
        if self.update_index % self.log_interval == 0 or self.update_index == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", self.update_index * self.nsteps)
            logger.logkv("nupdates", self.update_index)
            logger.logkv("total_timesteps", self.update_index * self.nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in self.epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in self.epinfobuf]))
            logger.logkv('time_elapsed', tnow - self.tfirststart)
            logger.logkv('agent', self.scope)
            for (lossval, lossname) in zip(lossvals, self.model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if self.save_interval and (self.update_index % self.save_interval == 0
                                   or self.update_index
                                   == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % self.update_index)
            print('Saving to', savepath)
            self.model.save(savepath)
        self.update_index += 1
        self.min_reward = safemin([epinfo['r'] for epinfo in self.epinfobuf])
        self.max_reward = safemax([epinfo['r'] for epinfo in self.epinfobuf])
        return True
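The recurrent branches (including Example #24 above) minibatch over whole environments rather than individual timesteps, so each minibatch carries contiguous trajectories and matching LSTM states. A small sketch of just that index bookkeeping:

import numpy as np

nenvs, nsteps, nminibatches = 8, 16, 4
nbatch_train = (nenvs * nsteps) // nminibatches
envsperbatch = nbatch_train // nsteps          # environments per minibatch

envinds = np.arange(nenvs)
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)

np.random.shuffle(envinds)
for start in range(0, nenvs, envsperbatch):
    mbenvinds = envinds[start:start + envsperbatch]
    # Rows of the flattened (nenvs * nsteps) rollout that belong to the chosen environments, in time order.
    mbflatinds = flatinds[mbenvinds].ravel()
    # states[mbenvinds] would then select the matching initial recurrent states for those environments.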
Example #25
def learn(*,
          policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=16,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          nddpgbatches=32,
          ddpg_per_ppo=128,
          target_lag=1,
          ddpg_ac_weight=0.1,
          annealing_updates=50,
          with_ddpg=True,
          with_annealing=True):
    global use_ddpg
    global use_annealing
    use_ddpg = with_ddpg
    use_annealing = with_annealing
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               batch_size=nddpgbatches)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    print('nupdates', nupdates)
    ddpg_w = ddpg_ac_weight if use_annealing else 0.0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        if ddpg_w > 0.0:
            ddpg_w -= 1 / float(annealing_updates) * ddpg_w
        values_list = []
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, rewards, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if use_annealing:
            ddpg_ac_list = []
            for idx in range(obs.shape[0]):
                ddpg_ac, _ = model.agent.pi(obs[idx],
                                            apply_noise=False,
                                            compute_Q=False)
                ddpg_ac_list.append(ddpg_ac)
            ddpg_ac = np.asarray(ddpg_ac_list)
        values_list.append(values)
        # print('obs.shape', obs.shape, 'rewards.shape', returns.shape, 'masks.shape', masks.shape, 'actions.shape', actions.shape)
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    if not use_annealing:
                        mblossvals.append(
                            model.train(lrnow, cliprangenow, *slices))
                    else:
                        mblossvals.append(
                            model.train(lrnow,
                                        cliprangenow,
                                        *slices,
                                        ddpg_acs=ddpg_ac[mbinds],
                                        ddpg_w=ddpg_w))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    # mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))
                    if not use_annealing:
                        mblossvals.append(
                            model.train(lrnow, cliprangenow, *slices,
                                        mbstates))
                    else:
                        mblossvals.append(
                            model.train(lrnow,
                                        cliprangenow,
                                        *slices,
                                        mbstates,
                                        ddpg_acs=ddpg_ac[mbflatinds],
                                        ddpg_w=ddpg_w))

        if use_ddpg:
            mbcritic_loss = []
            mbactor_loss = []
            # ------------- train DDPG ----------------
            for _ in range(ddpg_per_ppo * noptepochs * nminibatches):
                cl, al = model.agent.train()
                mbcritic_loss.append(cl)
                mbactor_loss.append(al)
                if update > target_lag:
                    model.agent.update_target_net()
            # print('noptepochs', noptepochs, 'nbatch_train', nbatch_train, 'nbatch', nbatch)
            # ------------- train DDPG ----------------

        lossvals = np.mean(mblossvals, axis=0)
        values_avg = np.mean(values_list)
        if use_ddpg:
            critic_loss = np.mean(mbcritic_loss)
            actor_loss = np.mean(mbactor_loss)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('value estimation', values_avg)
            logger.logkv('eprew_max', np.max(mblossvals))
            logger.logkv('eprew_min', np.min(mblossvals))
            logger.logkv('eprew_std', np.std(mblossvals))
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if use_ddpg:
                logger.logkv('critic_loss', critic_loss)
                logger.logkv('actor_loss', actor_loss)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
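Example #25 blends a DDPG action-matching term into the PPO loss and decays its weight each update by ddpg_w / annealing_updates, i.e. a geometric decay toward zero. A tiny sketch of that decay under the same hyperparameter names:

annealing_updates = 50
ddpg_ac_weight = 0.1

ddpg_w = ddpg_ac_weight
for update in range(1, 201):
    if ddpg_w > 0.0:
        ddpg_w -= ddpg_w / float(annealing_updates)   # ddpg_w *= (1 - 1/annealing_updates)
# After k updates the weight equals ddpg_ac_weight * (1 - 1/annealing_updates) ** k;
# it shrinks every update but never reaches exactly zero.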
Example #26
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_model, load_model_path, save_model, save_model_path):


    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    #nenvs = env.num_envs
    nenvs=1
    ob_space = Box(low=0, high=1, shape=(84, 84, 4))
    ac_space = Discrete(8)
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)

    if load_model:
        print('loading model')
        make_model.load(load_model_path)

    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model
    print('test0')
    runner = Runner(env=env, model=model, ob_space=ob_space, nsteps=nsteps, gamma=gamma, lam=lam)
    print('test1')
    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()
    nupdates = total_timesteps//nbatch
    print('nbatch = ', nbatch)
    print('test2')
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        print('test4')
        obs, rewards, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        print('test5')
        print('size of returns is', returns.shape)
        obs = np.reshape(obs, (nsteps, 84, 84, 4))
        print('obs.shape= ', obs.shape)
        rewards.shape = (nsteps,)
        returns.shape = (nsteps,)
        masks.shape = (nsteps,)
        actions.shape = (nsteps,)
        values.shape = (nsteps,)
        neglogpacs.shape = (nsteps,)
        epinfobuf.extend(epinfos)
        mblossvals = []
        inds = np.arange(nbatch)
        print('len(inds)=nbatch=', len(inds))
        for _ in range(noptepochs):
            np.random.shuffle(inds)
            for start in range(0, nbatch, nbatch_train):
                end = start + nbatch_train
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                # slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                print('obs.shape=', obs.shape)
                print('returns.shape=', returns.shape)
                print('masks.shape=', masks.shape)
                print('actions.shape=', actions.shape)
                print('values.shape=', values.shape)
                print('neglogpacs.shape=', neglogpacs.shape)
                mblossvals.append(model.train(lrnow, cliprangenow, *slices))


        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)

            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            #print('Saving model to', save_model_path)
            #model.save(save_model_path)
            #print('Saving to', savepath)
            #model.save(savepath)
    print('test4')
    env.close()
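Several of these examples log eprewmean through safemean and judge the value function with explained_variance. Minimal re-implementations consistent with how they are used above (the actual baselines.common helpers may differ in edge-case handling):

import numpy as np

def safemean(xs):
    # Mean that tolerates an empty episode-info buffer early in training.
    return np.nan if len(xs) == 0 else np.mean(xs)

def explained_variance(ypred, y):
    # 1 - Var[y - ypred] / Var[y]: close to 1 means the value function predicts returns well,
    # <= 0 means it is no better than predicting a constant.
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary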
Example #27
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
    
    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns 
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation. 
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    
    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the 
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient
    
    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies, 
                                      should be smaller or equal than number of environments run in parallel. 

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training 
                                      and 0 is the end of the training 

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 

    

    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    # Disabled: saving make_model threw "TypeError: Pickling an AuthenticationString object is disallowed for security reasons"
    if False and save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            print(make_model)
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        full_start_time = time.time()
        assert nbatch % nminibatches == 0
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        start_env_step_time = time.time()
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        end_env_step_time = time.time()

        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            start_train_time = time.time()
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
            end_train_time = time.time()

        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        full_end_time = time.time()
        if update % log_interval == 0 or update == 1:
            print('Full time: ', full_end_time - full_start_time, "s")
            print('Env step time: ', end_env_step_time - start_env_step_time,
                  "s")
            print('Train time: ', end_train_time - start_train_time, "s")
            if safemean([epinfo['r'] for epinfo in epinfobuf]) > 3:
                print([epinfo['r'] for epinfo in epinfobuf])
                #break
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (
                update % save_interval == 0 or update
                == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
    return model
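Example #27 instruments the update loop with wall-clock timers around the rollout and the optimization phase and reports fps = nbatch / (time per update). A small sketch of that pattern using a hypothetical timed() helper (the example itself simply calls time.time() before and after each phase):

import time

def timed(fn, *args, **kwargs):
    # Run fn and return (result, elapsed seconds).
    t0 = time.perf_counter()
    out = fn(*args, **kwargs)
    return out, time.perf_counter() - t0

# Hypothetical usage inside an update loop:
# rollout, run_elapsed = timed(runner.run)
# _, train_elapsed = timed(run_minibatch_epochs)          # placeholder for the optimization phase
# fps = int(nbatch / (run_elapsed + train_elapsed))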
Example #28
def learn(*,
          policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          restore_path,
          save_path,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    model = make_model()
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    saver = tf.train.Saver()
    if restore_path:
        saver.restore(model.sess, restore_path)
        print("Model restored from file:", restore_path)
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  # pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            # logger.logkv("serial_timesteps", update * nsteps)
            # logger.logkv("nupdates", update)
            # logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('reward',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('successes',
                         safemean([epinfo['s'] for epinfo in epinfobuf]))
            logger.logkv('episode_steps',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            # logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and save_path:
            saver.save(model.sess, save_path)
            print("Model saved in file:", save_path)
    env.close()
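
This example, like the others in this listing, accepts `lr` and `cliprange` either as floats (wrapped with `constfn`) or as callables of the remaining-progress fraction `frac = 1.0 - (update - 1.0) / nupdates`. A minimal sketch of the assumed `constfn` wrapper plus a hypothetical linear schedule that could be passed instead of a constant:

def constfn(val):
    # wrap a constant so it can be called like a schedule: f(frac) -> val
    def f(_):
        return val
    return f

# hypothetical linear decay: full value at the start of training (frac == 1.0),
# approaching zero at the end of training (frac -> 0.0)
linear_lr = lambda frac: 3e-4 * frac

# learn(..., lr=linear_lr, cliprange=lambda frac: 0.2 * frac, ...)
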
Example #29
0
def learn(*,
          network,
          env,
          total_timesteps,
          dtarg=0.01,
          adaptive_kl=0,
          trunc_rho=1.0,
          clipcut=0.2,
          useadv=0,
          vtrace=0,
          rgae=0,
          eval_env=None,
          seed=None,
          ERlen=1,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=None,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be less than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of environments
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space
    acdim = ac_space.shape[0]

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               adaptive_kl=adaptive_kl)
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = EvalRunner(env=eval_env,
                                 model=model,
                                 nsteps=10 * nsteps,
                                 gamma=gamma,
                                 lam=lam)
        eval_runner.obfilt = runner.obfilt
        eval_runner.rewfilt = runner.rewfilt

    epinfobuf = deque(maxlen=10)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=10)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch

    def add_vtarg_and_adv(seg, gamma, value, lam):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        done = np.append(
            seg["done"], 0
        )  # last element is only used for last vtarg, but we already zeroed it if last new = 1

        T = len(seg["rew"])
        gaelam = np.empty(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - done[t + 1]
            delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t]
            gaelam[
                t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        ret = gaelam + value[:-1]
        return gaelam, ret

    def add_vtarg_and_adv_vtrace(seg,
                                 gamma,
                                 value,
                                 rho,
                                 trunc_rho,
                                 acdim=None):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        done = np.append(
            seg["done"], 0
        )  # last element is only used for last vtarg, but we already zeroed it if last new = 1
        rho_ = np.append(rho, 1.0)
        if acdim is not None:
            rho_ = np.exp(np.log(rho_) / acdim)

        r = np.minimum(trunc_rho, rho_)
        c = lam * np.minimum(1.0, rho_)
        T = len(seg["rew"])
        gaelam = np.empty(T, 'float32')
        gaelam2 = np.empty(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - done[t + 1]
            delta = (rew[t] + gamma * value[t + 1] * nonterminal - value[t])
            gaelam[t] = delta + gamma * lam * nonterminal * lastgaelam
            lastgaelam = r[t] * gaelam[t]
        ret = r[:-1] * gaelam + value[:-1]
        adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T]
                                                          ]) - value[:-1]
        return adv, ret, gaelam

    def add_vtarg_and_adv_vtrace4(seg,
                                  gamma,
                                  value,
                                  rho,
                                  trunc_rho,
                                  acdim=None):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        done = np.append(
            seg["done"], 0
        )  # last element is only used for last vtarg, but we already zeroed it if last new = 1
        rho_ = np.append(rho, 1.0)
        if acdim is not None:
            rho_ = np.exp(np.log(rho_) / acdim)

        T = len(seg["rew"])
        gaelam = np.zeros(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        delta = (rew + gamma * value[1:] * (1.0 - done[1:]) - value[:-1])
        gamlam = np.zeros(T, 'float32')
        for i in range(T):
            gamlam[i] = (gamma * lam)**i
        idx = T
        c = np.ones(T)
        for t in reversed(range(T)):
            # print(delta2)
            # track the nearest episode boundary at or after step t + 1 (idx persists across the reverse sweep over t)
            if done[t + 1]:
                idx = t + 1
            gaelam[t] = np.sum(gamlam[:idx - t] *
                               (np.minimum(1.0, c) * delta)[t:idx])
            c[t:] = rho_[t] * c[t:]

        ret = np.minimum(trunc_rho, rho_[:-1]) * gaelam + value[:-1]
        adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T]
                                                          ]) - value[:-1]
        return adv, ret, gaelam

    seg = None
    cliprangenow = cliprange(1.0)
    klconst = 1.0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange

        # Get minibatch
        if seg is None:
            prev_seg = seg
            seg = {}
        else:
            prev_seg = {}
            for i in seg:
                prev_seg[i] = np.copy(seg[i])
        seg["ob"], seg["rew"], seg["done"], seg["ac"], seg["neglogp"], seg[
            "mean"], seg[
                "logstd"], final_obs, final_done, epinfos = runner.run()  #pylint: disable=E0632
        # print(np.shape(seg["ob"]))
        if prev_seg is not None:
            for key in seg:
                if len(np.shape(seg[key])) == 1:
                    seg[key] = np.hstack([prev_seg[key], seg[key]])
                else:
                    seg[key] = np.vstack([prev_seg[key], seg[key]])
                if np.shape(seg[key])[0] > ERlen * nsteps:
                    seg[key] = seg[key][-ERlen * nsteps:]

        ob_stack = np.vstack([seg["ob"], final_obs])
        values = model.values(runner.obfilt(ob_stack))
        values[:-1] = (1.0 - final_done) * values[:-1]
        ob = runner.obfilt(seg["ob"])
        mean_now, logstd_now = model.meanlogstds(ob)
        # print(np.shape(seg["ac"])[1])
        neglogpnow = 0.5 * np.sum(np.square((seg["ac"] - mean_now) / np.exp(logstd_now)), axis=-1) \
                      + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \
                      + np.sum(logstd_now, axis=-1)
        neglogpold = 0.5 * np.sum(np.square((seg["ac"] - seg["mean"]) / np.exp(logstd_now)), axis=-1) \
                     + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \
                     + np.sum(logstd_now, axis=-1)
        rho = np.exp(-neglogpnow + neglogpold)
        # print(len(mean_now))
        # print(cliprangenow)
        # print(rho)
        if vtrace == 1:
            adv, ret, gae = add_vtarg_and_adv_vtrace(seg, gamma, values, rho,
                                                     trunc_rho)
            if useadv:
                gae = adv
        elif vtrace == 4:
            adv, ret, gae = add_vtarg_and_adv_vtrace4(seg, gamma, values, rho,
                                                      trunc_rho)
            if useadv:
                gae = adv
        else:
            gae, ret = add_vtarg_and_adv(seg, gamma, values, lam)
        r = np.minimum(1.0, rho)
        r_gae = gae * r
        print("======")
        print(gae)
        print(r_gae)
        print(gae.mean())
        print(r_gae.mean())
        print(gae.std())
        print(r_gae.std())
        print(r.mean())
        print("======")

        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, _, _, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632
        prior_row = np.zeros(len(seg["ob"]))
        temp_ = []
        for i in range(int(len(prior_row) / nsteps)):
            temp_row = np.mean(
                np.abs(rho[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0)
            # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row)
            temp_.append(temp_row)
        print(temp_)
        rho_after =  np.exp(- 0.5 * np.square((seg["ac"] - mean_now) / np.exp(logstd_now)) \
                     + 0.5 * np.square((seg["ac"] - seg["mean"]) / np.exp(logstd_now)))
        temp_prior = []
        for i in range(int(len(prior_row) / nsteps)):
            temp_row = np.mean(
                np.abs(rho_after[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0)
            # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row)
            if temp_row > 1 + clipcut:
                prior_row[i * nsteps:(i + 1) * nsteps] = 0
            else:
                prior_row[i * nsteps:(i + 1) * nsteps] = 1
                # prior_row[i * nsteps:(i + 1) * nsteps] = 1
            temp_prior.append(temp_row)
        print(temp_prior)

        # for i in range(len(prior_row)):
        #     if (np.abs(rho[i] - 1.0) + 1.0)>1.05:
        #         prior_row[i]=0
        #     else:
        #         prior_row[i]=1
        # for i in range(len(prior_row)):
        #     if rho[i]>1.1 :
        #         prior_row[i]=0
        #     else:
        #         prior_row[i]=1
        # prob = prior_row/np.sum(prior_row)

        print(np.sum(prior_row))

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, compute the losses and append them.
        mblossvals = []
        # Index of each element of batch_size
        # Create the indices array

        inds1 = np.arange(len(seg["ob"]) - nsteps)
        inds2 = np.arange(nsteps) + len(seg["ob"]) - nsteps
        print(len(seg["ob"]))
        print(cliprangenow)
        nbatch_adapt1 = int(
            (np.sum(prior_row) - nsteps) / nsteps * nbatch_train)
        nbatch_adapt2 = int((nsteps) / nsteps * nbatch_train)
        print(rho)
        idx1 = []
        idx2 = []
        kl_rest = np.ones(len(seg["ob"])) * np.sum(prior_row) / nsteps
        kl_rest[:-nsteps] = 0
        # print(kl_rest)
        for _ in range(noptepochs):
            # Randomize the indexes
            # np.random.shuffle(inds)
            # 0 to batch_size with batch_train_size step

            # print(nbatch_adapt)
            losses_epoch = []
            for _ in range(int(nsteps / nbatch_train)):
                if nbatch_adapt1 > 0:
                    idx1 = np.random.choice(inds1,
                                            nbatch_adapt1,
                                            p=prior_row[:-nsteps] /
                                            np.sum(prior_row[:-nsteps]))
                idx2 = np.random.choice(inds2, nbatch_adapt2)
                # print(np.mean(np.abs(rho[mbinds] - 1.0) + 1.0))
                idx = np.hstack([idx1, idx2]).astype(int)

                slices = (arr[idx]
                          for arr in (ob, ret, gae, seg["done"], seg["ac"],
                                      values[:-1], neglogpold, seg["mean"],
                                      logstd_now, kl_rest, rho, neglogpnow))
                loss_epoch = model.train(lrnow, cliprangenow, klconst, rgae,
                                         trunc_rho, *slices)
                mblossvals.append(loss_epoch)
                losses_epoch.append(loss_epoch)

            # # print(np.mean(losses_epoch, axis=0))
            # mean_n, logstd_n = model.meanlogstds(runner.obfilt(seg["ob"]))
            # # print(np.shape(seg["ac"])[1])
            # rho_after =  np.exp(- 0.5 * np.square((seg["ac"] - mean_n) / np.exp(logstd_n)) \
            #              - logstd_n + 0.5 * np.square((seg["ac"] - seg["mean"]) / np.exp(seg["logstd"]))\
            #              + seg["logstd"])
            # temp_ = []
            # for i in range(int(len(prior_row) / nsteps)):
            #     temp_row = np.mean(np.abs(rho_after[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0)
            #     # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row)
            #     temp_.append(temp_row)
            # print(temp_)

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        if adaptive_kl:
            print("KL avg :", lossvals[3])
            if lossvals[3] > dtarg * 1.5:
                klconst *= 2
                print("kl const is increased")
            elif lossvals[3] < dtarg / 1.5:
                klconst /= 2
                print("kl const is reduced")
            klconst = np.clip(klconst, 2**(-10), 64)
        # End timer
        tnow = time.time()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values[:-1], ret)
            logger.logkv("batch IS weight",
                         [int(1000 * s) / 1000. for s in np.array(temp_prior)])
            logger.logkv("kl const", klconst)
            logger.logkv("clipping factor", cliprangenow)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfos]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfos]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
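
The importance ratio `rho` used above for the v-trace / replay corrections comes from the closed-form negative log-likelihood of a diagonal Gaussian policy; note that the code reuses `logstd_now` in both the old and new terms, so the log-std contributions cancel in the ratio. A standalone sketch of the textbook computation (the function name is illustrative, not from the source):

import numpy as np

def diag_gaussian_neglogp(actions, mean, logstd):
    # -log pi(a | mean, exp(logstd)) for a factorized Gaussian, summed over action dimensions
    d = actions.shape[-1]
    return (0.5 * np.sum(np.square((actions - mean) / np.exp(logstd)), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * d
            + np.sum(logstd, axis=-1))

# rho = pi_new(a) / pi_old(a) = exp(neglogp_old - neglogp_new)
# rho = np.exp(diag_gaussian_neglogp(ac, mean_old, logstd_old)
#              - diag_gaussian_neglogp(ac, mean_new, logstd_new))
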
Example #30
0
def print_log(*, model, run_info, batching_config, lossvals, update, fps,
              epinfobuf, tnow, tfirststart):
    ev = explained_variance(run_info.values, run_info.returns)
    logger.logkv("serial_timesteps", update * batching_config.nsteps)
    logger.logkv("nupdates", update)
    logger.logkv("total_timesteps", update * batching_config.nbatch)
    logger.logkv("fps", fps)
    logger.logkv("explained_variance", float(ev))
    logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
    logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
    logger.logkv('time_elapsed', tnow - tfirststart)
    for (lossval, lossname) in zip(lossvals, model.loss_names):
        logger.logkv(lossname, lossval)
    logger.dumpkvs()
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, num_casks=0):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs - num_casks
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=env.num_envs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if load_path is not None:
        model.load(load_path)
        # load running mean std
        checkdir = load_path[0:-5]
        checkpoint = int(load_path.split('/')[-1])
        if osp.exists(osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint)):
            with open(osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint), 'rb') as ob_rms_fp:
                env.ob_rms = pickle.load(ob_rms_fp)
        # if osp.exists(osp.join(checkdir, '%.5i_ret_rms.pkl' % checkpoint)):
        #     with open(osp.join(checkdir, '%.5i_ret_rms.pkl' % checkpoint), 'rb') as ret_rms_fp:
        #         env.ret_rms = pickle.load(ret_rms_fp)
    # tensorboard
    writer = tf.summary.FileWriter(logger.get_dir(), tf.get_default_session().graph)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, writer=writer, num_casks=num_casks)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None: # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('epsrewmean', safemean([epinfo['sr'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
            # tensorboard
            summary = tf.Summary()
            summary.value.add(tag='iteration/reward_mean', simple_value=safemean([epinfo['r'] for epinfo in epinfobuf]))
            summary.value.add(tag='iteration/length_mean', simple_value=safemean([epinfo['l'] for epinfo in epinfobuf]))
            summary.value.add(tag='iteration/shaped_reward_mean', simple_value=safemean([epinfo['sr'] for epinfo in epinfobuf]))
            summary.value.add(tag='iteration/fps', simple_value=fps)
            writer.add_summary(summary, update)
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
            # save running mean std
            with open(osp.join(checkdir, '%.5i_ob_rms.pkl' % update), 'wb') as ob_rms_fp:
                pickle.dump(env.ob_rms, ob_rms_fp)
            with open(osp.join(checkdir, '%.5i_ret_rms.pkl' % update), 'wb') as ret_rms_fp:
                pickle.dump(env.ret_rms, ret_rms_fp)
    env.close()
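
The restore branch in this example recovers the checkpoint directory and number by slicing `load_path[0:-5]`, which only works because checkpoints are written as five-digit update numbers ('%.5i' % update). An equivalent, path-based sketch (the path shown is hypothetical):

import os.path as osp

load_path = '/tmp/ppo_run/checkpoints/00150'   # hypothetical checkpoint saved as '%.5i' % update
checkdir = osp.dirname(load_path)              # '/tmp/ppo_run/checkpoints'
checkpoint = int(osp.basename(load_path))      # 150
ob_rms_path = osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint)
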
Example #32
0
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=200, useentr, net_size, load_path=None, i_trial, method):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, net_size=net_size)
    if save_interval:
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))

    model = make_model()

    if load_path:
        model.load(load_path=load_path)

    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps//nbatch


    # ent_coef = max(ent_coef - 0.25*float(update) / float(nupdates), 0.001)
    ent_coef = useentr * ent_coef
    # ent_coef = entp - float(iters_so_far) / float(max_iters)

    for update in range(1, nupdates+1):
        # ent_coef = useentr * 0.01
        # ent_coef = max(ent_coef - 0.25 * float(update) / float(nupdates), 0.001)
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None: # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, ent_dynamic=ent_coef))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates, ent_dynamic=ent_coef))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('EpRewMean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('EpLenMean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('trial', i_trial)
            logger.logkv("Iteration", update)
            logger.logkv('Name', method)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()

        # if update == 1 or update % 100==0 or update==nupdates:
        #     rwd=runner.play(video_path=logger.get_dir()+'/videos', iters_so_far=update)
        #     print('Average Retrun:{0}'.format(np.sum(rwd)/float(len(rwd))))
        #     print('Sum of Return:{0}'.format(np.sum(rwd)))

        if save_interval and (update % save_interval == 0 or update == 1 or update==nupdates) and logger.get_dir():
            checkdir = get_dir(osp.join(logger.get_dir(), 'checkpoints'))
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
            # np.save('{}/mean'.format(checkdir + '/'), runner.env.obs.mean)
            # np.save('{}/var'.format(checkdir + '/'), runner.env.obs.var)
    env.close()
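
In this last example the entropy bonus is fixed once before the loop (`ent_coef = useentr * ent_coef`) and passed to `model.train` as `ent_dynamic`; the commented-out lines hint at annealing it per update instead. A minimal annealing sketch mirroring that commented schedule (names and default values are illustrative):

def anneal_ent_coef(update, nupdates, ent_start=0.01, ent_floor=0.001):
    # linear decay of the entropy coefficient over training, clipped at a small floor,
    # mirroring the commented-out schedule in the loop above
    return max(ent_start - 0.25 * float(update) / float(nupdates), ent_floor)

# usage inside the update loop (sketch):
# model.train(lrnow, cliprangenow, *slices, ent_dynamic=anneal_ent_coef(update, nupdates))
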