Example #1
    def train(self, cache, i=None):
        print(
            f"Training {self.policy.__class__} policy with key {self.policy.key}"
        )
        logger.configure()

        sampler = Sampler(env=self.env, policy=self.policy)
        buffer = self.policy.make_training_buffer()

        nbatch = self.policy.config.training.nsteps * self.policy.config.training.nenvs

        i, extra_data = self.restore_training_checkpoint(cache=cache)
        total_timesteps = (i - 1) * nbatch
        total_episodes = extra_data.get('total_episodes', 0)
        epinfobuf = extra_data.get('epinfobuf', deque(maxlen=100))

        log_freq = 1

        while total_timesteps < self.policy.config.training.total_timesteps:
            batch = sampler.sample_batch(self.policy.config.training.nsteps)
            epinfobuf.extend(batch.env_info.epinfobuf)
            buffer.add_batch(batch)

            self.policy.train_step(buffer=buffer,
                                   itr=i,
                                   logger=logger,
                                   log_freq=log_freq,
                                   cache=cache,
                                   save_freq=None)

            if i % log_freq == 0:
                logger.logkv('itr', i)
                logger.logkv('cumulative episodes', total_episodes)
                logger.logkv('timesteps covered', total_timesteps)
                logger.logkv('eprewmean',
                             safemean([epinfo['r'] for epinfo in epinfobuf]))
                logger.logkv('eplenmean',
                             safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger.logkv('buffer size', buffer.time_shape.size)
                logger.dumpkvs()

            i += 1
            total_episodes += len(batch.env_info.epinfobuf)
            total_timesteps += nbatch

            if i % int(self.policy.config.training.total_timesteps /
                       (10 * nbatch)) == 0:
                print("Doing a cache roundtrip...")
                self.store_training_checkpoint(cache,
                                               itr=i,
                                               extra_data={
                                                   'total_episodes':
                                                   total_episodes,
                                                   'epinfobuf': epinfobuf
                                               })
                stored_i, _ = self.restore_training_checkpoint(cache, itr=i)
                assert stored_i == i
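
Every example in this listing averages episode statistics with safemean over a bounded deque of episode-info dicts, so the logged means cover at most the last 100 finished episodes. For reference, a minimal sketch of that helper as it is defined in baselines, assuming only numpy:

import numpy as np

def safemean(xs):
    # Report NaN instead of raising (or warning) when no episode has finished yet.
    return np.nan if len(xs) == 0 else np.mean(xs)
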
Example #2
 def log_performance(self, i):
     logger.logkv('itr', i)
     logger.logkv('cumulative episodes', self.total_episodes)
     logger.logkv('timesteps covered', i * self.env.num_envs * self.batch_t)
     logger.logkv('eprewmean',
                  safemean([epinfo['r'] for epinfo in self.eval_epinfobuf]))
     logger.logkv('eplenmean',
                  safemean([epinfo['l'] for epinfo in self.eval_epinfobuf]))
     logger.logkv('buffer size', self.buffer.time_shape.size)
     logger.logkv(
         'memory used (GB)',
         psutil.Process(os.getpid()).memory_info().rss /
         (1024 * 1024 * 1024))
     logger.dumpkvs()
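
The memory metric above is just the process resident set size converted to gigabytes. A tiny standalone helper for the same computation, assuming psutil is available (the name memory_used_gb is ours):

import os
import psutil

def memory_used_gb():
    # Resident set size of the current process, in GiB.
    return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 3)
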
Example #3
def print_log(*, model, run_info, batching_config, lossvals, update, fps,
              epinfobuf, tnow, tfirststart):
    ev = explained_variance(run_info.values, run_info.returns)
    logger.logkv("serial_timesteps", update * batching_config.nsteps)
    logger.logkv("nupdates", update)
    logger.logkv("total_timesteps", update * batching_config.nbatch)
    logger.logkv("fps", fps)
    logger.logkv("explained_variance", float(ev))
    logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
    logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
    logger.logkv('time_elapsed', tnow - tfirststart)
    for (lossval, lossname) in zip(lossvals, model.loss_names):
        logger.logkv(lossname, lossval)
    logger.dumpkvs()
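
explained_variance comes from baselines.common and reports how much of the variance in the empirical returns is explained by the value predictions, i.e. 1 - Var(returns - values) / Var(returns). A hedged sketch of that computation (named _sketch to avoid confusing it with the library function):

import numpy as np

def explained_variance_sketch(ypred, y):
    # 1 means perfect prediction, 0 means no better than a constant baseline,
    # negative values mean the predictions are actively harmful.
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary
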
Example #4
def learn(network,
          env,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          log_interval=1,
          nprocs=32,
          nsteps=20,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=None,
          lrschedule='linear',
          load_path=None,
          is_async=True,
          **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule,
                               is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess,
                                                        coord=coord,
                                                        start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
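
A hedged usage sketch for this ACKTR-style learn function. It assumes a vectorized, frame-stacked Atari environment built with the standard baselines helpers (make_vec_env, VecFrameStack); check the wrapper names against your local baselines version.

from baselines.common.cmd_util import make_vec_env
from baselines.common.vec_env import VecFrameStack

# 8 parallel Breakout environments with the usual 4-frame stack.
venv = VecFrameStack(make_vec_env('BreakoutNoFrameskip-v4', 'atari', 8, seed=0), 4)
model = learn('cnn', venv, seed=0, total_timesteps=int(1e6), save_interval=100)
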
Example #5
def learn(
    network,
    env,
    seed=None,
    nsteps=5,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    log_interval=100,
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for the A2C algorithm. Train a policy with the given network architecture on a given environment using the A2C algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''



    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs*nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps//nbatch+1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart

        # Calculate the fps (frame per second)
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
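
For completeness, a minimal way this A2C entrypoint is typically invoked. The environment construction below uses gym plus DummyVecEnv, as suggested by the docstring; the hyperparameters are purely illustrative.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

# Four copies of CartPole stepped in lockstep inside a single process.
venv = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(4)])
model = learn(network='mlp', env=venv, seed=0, total_timesteps=100000, log_interval=10)
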
Example #6
 def mean_length(self):
     return ppo2.safemean([epinfo['l'] for epinfo in self._epinfobuf])
Example #7
 def mean_reward(self):
     return ppo2.safemean([epinfo['r'] for epinfo in self._epinfobuf])
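
Examples #6 and #7 are small accessors over an internal _epinfobuf of episode-info dicts. A hedged sketch of the bookkeeping they imply; the class name, the @property decorators, and the 100-episode window are assumptions, and ppo2 is imported as a module so that ppo2.safemean resolves:

from collections import deque
from baselines.ppo2 import ppo2

class EpisodeStats:
    # Minimal holder mirroring the _epinfobuf the two properties read from.
    def __init__(self, maxlen=100):
        self._epinfobuf = deque(maxlen=maxlen)

    def record(self, epinfos):
        # epinfos is the list of {'r': ..., 'l': ...} dicts returned by a runner.
        self._epinfobuf.extend(epinfos)

    @property
    def mean_reward(self):
        return ppo2.safemean([epinfo['r'] for epinfo in self._epinfobuf])

    @property
    def mean_length(self):
        return ppo2.safemean([epinfo['l'] for epinfo in self._epinfobuf])
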
Example #8
def learn_ent_hoof_a2c(network,
                       env,
                       optimiser,
                       seed=None,
                       nsteps=5,
                       total_timesteps=int(1e6),
                       lr_upper_bound=None,
                       ent_upper_bound=None,
                       num_lr=None,
                       num_ent_coeff=None,
                       gamma=0.99,
                       max_kl=None,
                       max_grad_norm=0.5,
                       log_interval=100,
                       load_path=None,
                       **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Train a policy with the given network architecture on a given environment using the A2C algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 1M)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Ent_HOOF_Model(optimiser=optimiser,
                           policy=policy,
                           env=env,
                           nsteps=nsteps,
                           total_timesteps=total_timesteps,
                           max_grad_norm=max_grad_norm)
    runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)
    set_from_flat = U.SetFromFlat(model_params)

    def kl(new_mean, new_sd, old_mean, old_sd):
        approx_kl = np.log(new_sd / old_sd) + (
            old_sd**2 +
            (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl

    if max_kl is None:  # set max kl to a high val in case there is no constraint
        max_kl = 10**8

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps // nbatch + 1)):
        opt_pol_val = -10**8
        approx_kl = np.zeros((num_ent_coeff, num_lr))
        epv = np.zeros((num_ent_coeff, num_lr))
        rand_lr = lr_upper_bound * np.random.rand(num_lr)
        rand_lr = np.sort(rand_lr)
        rand_ent_coeff = ent_upper_bound * np.random.rand(num_ent_coeff)

        old_params = get_flat()
        rms_weights_before_upd = model.get_opt_state()

        obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run(
        )
        epinfobuf.extend(epinfos)
        old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions)
        for nec in range(num_ent_coeff):
            # reset policy and rms prop optimiser
            set_from_flat(old_params)
            model.set_opt_state(rms_weights_before_upd)

            # get grads for loss fn with given entropy coeff
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values,
                rand_ent_coeff[nec])
            new_params = get_flat()
            ent_grads = new_params - old_params

            # enumerate over different LR
            for nlr in range(num_lr):
                new_params = old_params + rand_lr[nlr] * ent_grads
                set_from_flat(new_params)
                new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll(
                    obs, actions)
                lik_ratio = np.exp(-new_neg_ll + old_neg_ll)
                est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds,
                                           lik_ratio)
                approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd)
                epv[nec, nlr] = est_pol_val

                if (nec == 0
                        and nlr == 0) or (est_pol_val > opt_pol_val
                                          and approx_kl[nec, nlr] < max_kl):
                    opt_pol_val = est_pol_val
                    opt_pol_params = get_flat()
                    opt_rms_wts = model.get_opt_state()
                    opt_lr = rand_lr[nlr]
                    opt_ent_coeff = rand_ent_coeff[nec]
                    opt_kl = approx_kl[nec, nlr]

        # update policy and rms prop to optimal wts
        set_from_flat(opt_pol_params)
        model.set_opt_state(opt_rms_wts)

        # Shrink LR search space if too many get rejected
        rejections = np.sum(approx_kl > max_kl) / num_lr
        if rejections > 0.8:
            lr_upper_bound *= 0.8
        if rejections == 0:
            lr_upper_bound *= 1.25

        nseconds = time.time() - tstart

        # Calculate the fps (frame per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("opt_lr", float(opt_lr))
            logger.record_tabular("ent_coeff", float(opt_ent_coeff))
            logger.record_tabular("approx_kl", float(opt_kl))
            logger.record_tabular("rejections", rejections)
            logger.record_tabular("lr_ub", lr_upper_bound)
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
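
The candidate learning rates and entropy coefficients above are ranked by wis_estimate, a weighted importance sampling estimate of the new policy's undiscounted return computed from off-policy likelihood ratios. That helper is not shown in this example; the sketch below is one common self-normalized formulation, under the assumption that undisc_rwds and lik_ratio arrive flattened with shape (nenvs * nsteps,):

import numpy as np

def wis_estimate_sketch(nenvs, nsteps, undisc_rwds, lik_ratio):
    # Per-environment trajectory weight: product of per-step likelihood ratios.
    ratios = np.asarray(lik_ratio).reshape(nenvs, nsteps)
    rewards = np.asarray(undisc_rwds).reshape(nenvs, nsteps)
    traj_w = np.prod(ratios, axis=1)
    traj_ret = np.sum(rewards, axis=1)
    # Self-normalized (weighted) importance sampling estimate of the return.
    return np.sum(traj_w * traj_ret) / (np.sum(traj_w) + 1e-8)
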
Example #9
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, nbatch=None, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None,
            mode='hippo', use_buffer=False, buffer_capacity=None, hindsight=0.5, reward_fn,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update 

    nbatch: int                       batch size 

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    mode                              switch between 'ppo' and 'hippo' (default: 'hippo')

    buffer_capacity                   max number of steps stored in the replay buffer

    hindsight                         fraction of the batch paths with hindsight

    reward_fn                         reward function used to recompute the reward under a new goal

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Build policy 
    # But first add shape & dtype attributes to the env's observation space (needed for building policy network)   
    dtype = None
    size = 0
    for key in ['observation', 'achieved_goal', 'desired_goal']:
        space = ob_space.spaces[key]
        # Check consistency against the previously seen spaces before overwriting dtype
        if dtype is not None:
            assert space.dtype == dtype, 'dtype not same between observation spaces'
        dtype = space.dtype
        size += np.prod(space.shape)
    ob_space.shape = (size, )
    ob_space.dtype = dtype

    policy = build_policy(env, network, **network_kwargs)

    # Calculate the batch_size, nbatch is a rough approximation
    if nbatch is None: nbatch = nsteps * nenvs
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)
    
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Instantiate the replay buffer
    if use_buffer:
        if buffer_capacity is None: buffer_capacity = nbatch
        replay_buffer = ReplayBuffer(capacity=buffer_capacity)

    # Start total timer
    tfirststart = time.perf_counter()
    her_timesteps = 0
    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # Collect new trajectories here
        paths, epinfos = runner.run()   #pylint: disable=E0632
        if eval_env is not None:
            eval_paths,  eval_epinfos = eval_runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        if mode == 'hippo':
            batch_paths = []
            if use_buffer:
                for path in paths: 
                    replay_buffer.insert(path)
                nsamples = 0
                while nsamples < nbatch:
                    path = replay_buffer.sample()
                    subpath = random_subpath(path)
                    if np.random.uniform() < hindsight:
                        if len(subpath) == len(path):
                            subpath.pop_step()
                        subpath = apply_hindsight(path, reward_fn)
                    batch_paths.append(subpath)
                    nsamples += len(subpath)
            else:
                nsamples = 0
                paths = itertools.cycle(paths)
                while nsamples < nbatch:
                    path = next(paths)
                    subpath = random_subpath(path)
                    if np.random.uniform() < hindsight:
                        if len(subpath) == len(path):
                            subpath.pop_step()
                        subpath = apply_hindsight(path, reward_fn)
                    batch_paths.append(subpath)
                    nsamples += len(subpath)
        elif mode == 'ppo':
            batch_paths = paths

        obs, returns, masks, actions, values, neglogpacs = batch(env, model, gamma, lam, batch_paths)
        _nbatch = (len(obs) // nbatch_train) * nbatch_train
        her_timesteps += _nbatch

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        # Index of each element of batch_size
        # Create the indices array
        inds = np.arange(_nbatch)
        for _ in range(noptepochs):
            # Randomize the indexes
            np.random.shuffle(inds)
            # 0 to batch_size with batch_train_size step
            for start in range(0, _nbatch, nbatch_train):
                end = start + nbatch_train
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                mblossvals.append(model.train(lrnow, cliprangenow, *slices))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nsteps*nenvs)
            logger.logkv("total_steps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
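
apply_hindsight relabels a sampled sub-path with a goal actually achieved later and recomputes its rewards through the user-supplied reward_fn. What that callable looks like is task specific; for a sparse goal-reaching environment it might resemble the sketch below (the signature and the 0.05 threshold are assumptions, not part of this example):

import numpy as np

def sparse_goal_reward_fn(achieved_goal, desired_goal, threshold=0.05):
    # 0 reward when the achieved goal is within `threshold` of the relabeled
    # goal, -1 otherwise (the usual sparse HER-style convention).
    distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
    return -(distance > threshold).astype(np.float32)
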
Example #10
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)


    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=nsteps, ent_coef=ent_coef,
                               vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef,
                               lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs*nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
Example #11
        # Calculate the fps (frame per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", str(update))
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular('rewards', np.mean(rewards))
            logger.record_tabular('values', np.mean(values))
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

            # Data for the plots
            graph_data['policy_entropy'].append(float(policy_entropy))
            graph_data['value_loss'].append(float(value_loss))
            graph_data['policy_loss'].append(float(policy_loss))
            graph_data['values_mean'].append(np.mean(values))
            graph_data['values_min'].append(np.min(values))
            graph_data['values_max'].append(np.max(values))
            graph_data['values_std'].append(np.std(values))
            graph_data['values_median'].append(np.median(values))
            graph_data['rewards_mean'].append(np.mean(rewards))
            graph_data['rewards_min'].append(np.min(rewards))
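
This fragment appends metrics into a graph_data dictionary created elsewhere. A defaultdict(list) initialized before the training loop is the simplest structure compatible with every key used here; the plotting helper below is our own illustration, not part of the original snippet.

from collections import defaultdict
import matplotlib.pyplot as plt

# Any key touched inside the logging block starts out as an empty list.
graph_data = defaultdict(list)

def plot_metric(name):
    # One curve per logged metric, indexed by logging step.
    plt.plot(graph_data[name])
    plt.xlabel('log step')
    plt.ylabel(name)
    plt.show()
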
Example #12
def learn(network,
          env,
          seed=None,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100,
          load_path=None,
          **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Train a policy with the given network architecture on a given environment using the A2C algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for the full list)
                        specifying a standard network architecture, or a function that takes a tensorflow tensor as input and returns
                        a tuple (output_tensor, extra_feed) where output_tensor is the last network layer output; extra_feed is None for feed-forward
                        neural nets, and a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env:                RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)

    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is the number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of the value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of the learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes the fraction of training progress as input and
                        returns the fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes the square root computation in the denominator of the RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and the arguments to a particular type of network.
                        For instance, the 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the number of envs
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy,
                  env=env,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

    return model
Example #13
def learn(network,
          env,
          seed,
          env_id=None,
          total_timesteps=int(40e6),
          gamma=0.99,
          log_interval=100,
          nprocs=32,
          nsteps=20,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=None,
          save_path=None,
          lrschedule='linear',
          load_path=None,
          is_async=True,
          **network_kwargs):

    info_env = gym.make(env_id)
    algo = 'acktr'
    # wandb.init(project="floorplan_generator", name=algo)
    # wandb.config.algo = algo
    # # wandb.config.action_space = info_env.action_type
    # wandb.config.step_size = info_env.step_size
    #wandb.config.active_rewards = info_env.active_rewards
    #print("\n \n \n \n \n HI21 \n \n \n \n \n")
    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)
    #print("\n \n \n \n \n HI22 \n \n \n \n \n")

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule,
                               is_async=is_async)
    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     print(osp.join(logger.get_dir(), 'make_model.pkl'))
    #     with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb+') as fh:
    #         print(make_model)
    #         fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    #print("\n \n \n \n \n HI23 \n \n \n \n \n")

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess,
                                                        coord=coord,
                                                        start=True)
    else:
        enqueue_threads = []
    #print("\n \n \n \n \n HI24 \n \n \n \n \n")

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # images = env.get_images()
            # image = images[0]
            # writer.add_image('imresult', image, update, dataformats='HWC')
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

            # wandb.log({'eprewmean': safemean([epinfo['r'] for epinfo in epinfobuf]),
            #         'eplenmean': safemean([epinfo['l'] for epinfo in epinfobuf])})

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            # Prefer an explicit save_path; otherwise fall back to a per-update checkpoint file
            savepath = save_path if save_path is not None else osp.join(
                logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
Example #14
def learn(network,
          env,
          seed=None,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          lambda_=0.1,
          margin=0.1,
          i_before=1,
          log_interval=100,
          load_path=None,
          **network_kwargs):
    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy,
                  env=env,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule,
                  lambda_=lambda_,
                  margin=margin)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    obses_before: deque[np.ndarray] = deque(maxlen=i_before + 1)

    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        left_obs = []
        has_left_obs = []
        for start in range(0, nbatch, nsteps):
            result, has_obs = shift(obs[start:start + nsteps],
                                    i_before,
                                    fill_value=np.zeros_like(obs[0]))
            left_obs.append(result)
            has_left_obs.append(has_obs)
        left_obs = np.vstack(left_obs)
        has_left_obs = np.hstack(has_left_obs)

        right_obs = []
        has_right_obs = []
        for start in range(0, nbatch, nsteps):
            result, has_obs = shift(obs[start:start + nsteps],
                                    -1,
                                    fill_value=np.zeros_like(obs[0]))
            right_obs.append(result)
            has_right_obs.append(has_obs)
        right_obs = np.vstack(right_obs)
        has_right_obs = np.hstack(has_right_obs)

        has_triplet = np.logical_and(has_left_obs, has_right_obs).astype(float)

        policy_loss, value_loss, policy_entropy, repr_loss, delta_d = model.train(
            left_obs, obs, right_obs, states, rewards, masks, actions, values,
            has_triplet)

        nseconds = time.time() - tstart

        # Calculate the fps (frame per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("repr_loss", float(repr_loss))
            logger.record_tabular("delta_d", float(delta_d))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
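
The triplet construction in this example relies on a shift helper that moves each per-environment observation block along the time axis and reports which positions still hold a real (non-padded) observation. That helper is not included above; the sketch below is consistent with how it is called here (a positive offset looks i_before steps back, a negative offset one step forward), but its exact behaviour in the source project is an assumption.

import numpy as np

def shift_sketch(x, n, fill_value):
    # Shift a (nsteps, ...) block along time so that result[t] = x[t - n];
    # positions with no source element get fill_value and has_obs == False.
    result = np.empty_like(x)
    has_obs = np.ones(len(x), dtype=bool)
    if n > 0:
        result[:n] = fill_value
        result[n:] = x[:-n]
        has_obs[:n] = False
    elif n < 0:
        result[n:] = fill_value
        result[:n] = x[-n:]
        has_obs[n:] = False
    else:
        result[:] = x
    return result, has_obs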