Example #1
def learn(network, env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None, # with default settings, perform 1M steps total
          nb_epoch_cycles=20,
          nb_rollout_steps=100,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='adaptive-param_0.2',
          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50, # per epoch cycle and MPI worker,
          nb_eval_steps=100,
          batch_size=64, # per MPI worker
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          load_path=None,
          **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
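        # e.g. (hypothetical numbers): total_timesteps=1e6 with the default
        # nb_epoch_cycles=20 and nb_rollout_steps=100 gives 1e6 // 2000 = 500 epochs.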
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
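    # Example noise_type strings (hypothetical values) accepted by the parsing
    # loop below, following the `<name>_<stddev>` convention:
    #   'adaptive-param_0.2'             -> parameter-space noise only
    #   'ou_0.2'                         -> Ornstein-Uhlenbeck action noise
    #   'normal_0.1,adaptive-param_0.2'  -> normal action noise plus parameter noise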
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    if load_path is not None:
        agent.load(load_path)
    sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)
        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        # Note: as_scalar is defined but never called below; the comprehension
        # takes the first element of each (flattened) stat directly.
        combined_stats_sums = np.array([np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)
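        # MPI.COMM_WORLD.allreduce sums each stat across workers; the division
        # by mpi_size below turns those sums into per-worker means.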

        combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)


    return agent
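
A minimal usage sketch for this example (not part of the original listing). It assumes the baselines-style DummyVecEnv wrapper and the 'mlp' network name; the environment id, timestep count, and noise setting are placeholder values.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

# Wrap a single Gym env so observations/rewards arrive with a batch dimension,
# as this `learn` expects from a vectorized env.
venv = DummyVecEnv([lambda: gym.make('Pendulum-v0')])

# Train with Ornstein-Uhlenbeck action noise instead of the default
# adaptive parameter noise.
agent = learn(network='mlp',
              env=venv,
              total_timesteps=10000,
              noise_type='ou_0.2')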
Example #2
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        actor_l2_reg=0.0,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=1000,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        rb_size=1e6,
        save_interval=1,
        pretrain_epochs=0,
        load_path=None,
        demos_path=None,
        bc_teacher_lambda=0.0,
        use_qfilter=False,
        **network_kwargs):
    """Learns policy using DDPG, with vectorized environments.

    If we pass other arguments that aren't specified here, they are treated
    as network_kwargs.

    Parameters
    ----------
    noise_type: noise to be added to the behavior policy. The default is NOT
        the noise type from the paper but 'AdaptiveParamNoiseSpec'. I _think_
        that using the OU process gives action noise, but not parameter
        noise. Also, be sure to follow the `name_stddev` convention, as the
        code will split the argument at the underscore.

    actor_lr: 1e-4  (matches paper)

    critic_lr: 1e-3  (matches paper)

    critic_l2: 1e-2  (matches paper)

    gamma: 0.99  (matches paper)

    batch_size: 64  (matches paper for lower-dim env obs/states)

    tau: 0.01 for soft target updates of actor and critic nets. Paper used 0.001.

    nb_epoch_cycles: number of times we go through this cycle of: (1) get
        rollouts with noise added to policy and apply to replay buffer, (2)
        gradient updates for actor/critic, (3) evaluation rollouts (if any).
        AFTER all of these cycles happen, THEN we log statistics.

    nb_rollout_steps: number of steps we take in each parallel env with the
        exploration policy, without training, so this is just to populate the
        replay buffer. More parallel envs *should* mean that we get more
        samples in the buffer between gradient updates of the network, so
        this might need to be environment *and* machine (# of CPUs) specific.

    nb_train_steps: after doing `nb_rollout_steps` in each parallel env, we do
        this many updates; each involves sampling from the replay buffer and
        updating the actor and critic (via lagged target updates).

    nb_eval_steps: 1000; I changed this from the default of 100. Using 1000
        ensures that fixed-length envs like Ant-v2 can get one full episode
        (assuming one parallel env) during evaluation stages.

    eval_env: A separate environment for evaluation only, where no noise is
        applied, similar to how rlkit does it.

    save_interval: how frequently to save model checkpoints.
    """
    set_global_seeds(seed)

    # Daniel: this helps to maintain compatibility with PPO2 code. For now
    # we're ignoring it, but we should check that we're always clipping. I
    # changed the nb_epochs to match with PPO2 in that we divide by nenvs.
    if 'limit_act_range' in network_kwargs:
        network_kwargs.pop('limit_act_range')
    nenvs = env.num_envs

    nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // nbatchsize
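        # e.g. (hypothetical numbers): nenvs=4 with the defaults nb_epoch_cycles=20
        # and nb_rollout_steps=100 gives nbatchsize=8000, so total_timesteps=1e6
        # yields nb_epochs=125.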
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    # we assume symmetric actions.
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()

    # Form XP (1M steps, same as in paper), and critic/actor networks.
    # Daniel: force dtype here so we can use uint8 type images.
    assert env.observation_space.low.dtype == env.observation_space.high.dtype
    memory = Memory(limit=int(rb_size),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape,
                    dtype=env.observation_space.low.dtype)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                #action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                #                                 sigma=float(stddev)*np.ones(nb_actions))
                #if nenvs > 1:
                # Daniel: adding this to replace the former.
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions),
                                                 shape=(nenvs, nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    # The `learn` defaults above have priority over defaults in DDPG class.
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 actor_l2_reg=actor_l2_reg,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 bc_teacher_lambda=bc_teacher_lambda,
                 use_qfilter=use_qfilter)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Prepare everything.
    sess = U.get_session()
    agent.initialize(sess)
    # --------------------------------------------------------------------------
    # Daniel: similar as PPO2 code as `agent` is similar to `model` but has to
    # be initialized explicitly above. Must call after `agent.load` gets
    # created.  Not sure if this works with parameter space noise or with
    # normalization, but I don't plan to resume training (for now). It also has
    # to be *before* the `graph.finalize()` because otherwise we get an error.
    # --------------------------------------------------------------------------
    if load_path is not None:
        logger.info("\nInside ddpg, loading model from: {}".format(load_path))
        agent.load(load_path)
    # --------------------------------------------------------------------------
    sess.graph.finalize()
    agent.reset()

    # --------------------------------------------------------------------------
    # Daniel: populate replay buffer, followed by pretraining stage.
    # But if load_path is not None, then doesn't make sense -- we want to load.
    # We also don't need to do this if timesteps is 0 (e.g., for playing policy).
    # --------------------------------------------------------------------------
    if total_timesteps == 0:
        return agent
    assert seed == 1500, 'We normally want seed 1500, yet: {}'.format(seed)

    if (demos_path is not None and load_path is None):
        _ddpg_demos(demos_path, agent, memory)
        assert memory.nb_entries == memory.nb_teach_entries, memory.nb_entries
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)

        # Pretrain, based on their training code for some # of minibatches.
        pt_actor_losses = []
        pt_critic_losses = []
        batches_per_ep = int(memory.nb_entries / batch_size)
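        # e.g. (hypothetical numbers): 10000 demo transitions with batch_size=64
        # gives batches_per_ep = 156 minibatches per pre-training epoch.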
        logger.info(
            'Running pre-training for {} epochs'.format(pretrain_epochs))
        logger.info('  data size in memory: {}'.format(memory.nb_entries))
        logger.info('  each batch: {}, epoch mbs: {}'.format(
            batch_size, batches_per_ep))
        pt_start = time.time()

        for epoch in range(1, pretrain_epochs + 1):
            c_losses = []
            a_losses = []
            for _ in range(batches_per_ep):
                cl, al = agent.train(during_pretrain=True)
                agent.update_target_net()
                c_losses.append(cl)
                a_losses.append(al)
            pt_critic_losses.append(np.mean(c_losses))
            pt_actor_losses.append(np.mean(a_losses))

            # Check and save model occasionally.
            if epoch == 1 or epoch % 5 == 0:
                pt_time = (time.time() - pt_start) / 60.
                logger.info(
                    '  epoch done: {}, loss over past epoch: {:.4f}'.format(
                        str(epoch).zfill(4), pt_actor_losses[-1]))
                logger.info('  critic loss over past epoch: {:.4f}'.format(
                    pt_critic_losses[-1]))
                logger.info('  elapsed time: {:.1f}m'.format(pt_time))
                savepath = osp.join(
                    checkdir, 'pretrain_epoch_{}'.format(str(epoch).zfill(4)))
                logger.info('Saving model checkpoint to: ', savepath)
                agent.save(savepath)

        pt_time = (time.time() - pt_start) / 60.
        logger.info('losses a: {}'.format(np.array(pt_actor_losses)))
        logger.info('losses c: {}'.format(np.array(pt_critic_losses)))
        logger.info('Finished loading teacher samples + pre-training.')
        logger.info('Pre-training took {:.1f}m.\n'.format(pt_time))
    # --------------------------------------------------------------------------

    # Back to their code. For cloth, `env.reset()` takes a while so we put it here.
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    # Daniel: Debugging/sanity checks.
    _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    U.display_var_info(_variables)
    logger.info("\nInside DDPG, about to start epochs")
    logger.info(
        "nbatchsize: {}, get this in buffer before DDPG updates".format(
            nbatchsize))
    logger.info("  i.e.: (nenv {}) * (cycles {}) * (nsteps {})".format(
        nenvs, nb_epoch_cycles, nb_rollout_steps))
    logger.info("nb_epochs: {}, number of cycles to use".format(nb_epochs))
    logger.info("eval_env None? {}".format(eval_env is None))
    logger.info("(end of debugging messages)\n")

    # File paths.
    checkdir = osp.join(logger.get_dir(), 'checkpoints')
    action_dir = osp.join(logger.get_dir(), 'actions')
    episode_dir = osp.join(logger.get_dir(), 'ep_all_infos')
    os.makedirs(checkdir, exist_ok=True)
    os.makedirs(action_dir, exist_ok=True)
    os.makedirs(episode_dir, exist_ok=True)

    # Daniel: use these two to store past 100 episode history. Report these stats!
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    all_eval_episode_rewards = []

    # reward/step: cumulative quantities for each episode in vecenv.
    # epoch_{actions,qs} will grow without bound, fyi.
    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    epoch = 0
    start_time = time.time()
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        mb_actions = []
        mb_epinfos = []
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset
                # agent at the end of the episode in each of the environments,
                # so resetting here instead
                agent.reset()

            # Daniel: pure data collection (noise added) to populate replay buffer.
            # No training until after this, and note the parallel stepping (VecEnv).
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                # action: (#_parallel_envs, ac_dim), q: (#_parallel_envs, 1)
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension
                # (nenvs, A) - the multiplication gets broadcasted to the batch
                # scale for execution in env (as far as DDPG is concerned,
                # every action is in [-1, 1])
                new_obs, r, done, info = env.step(max_action * action)
                r = r.astype(np.float32)
                # note these outputs are batched from vecenv (first dim = batch).
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping. (Daniel: agent.train() doesn't require these two lists)
                epoch_actions.append(action)
                epoch_qs.append(q)
                # Daniel: Same as PPO2 code.
                mb_actions.append(action)
                for inf in info:
                    maybeepinfo = inf.get('episode')
                    if maybeepinfo:
                        mb_epinfos.append(inf)
                #the batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(
                            episode_reward[d])  # Entire history
                        episode_rewards_history.append(
                            episode_reward[d])  # Last 100 only
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate. (Daniel: note that no noise is applied here.)
            # Also it seems like episodes do not naturally reset before this starts?
            # Also, unlike epoch_episode_reward, eval_episode_reward is created here ...
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                logger.info('Now on the eval_env for {} steps...'.format(
                    nb_eval_steps))
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # scale for execution in env (for DDPG, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            all_eval_episode_rewards.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0  # Daniel: reset for next episode.

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['memory/nb_entries'] = memory.nb_entries
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics. (Daniel: use eval/return_history for plots)
        if eval_env is not None:
            combined_stats['eval/return'] = np.mean(eval_episode_rewards)
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = np.array(
            [np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)
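        # As in Example #1: allreduce sums each stat across MPI workers, and the
        # division by mpi_size below turns those sums into per-worker means.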

        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps_per_env'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(osp.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
            # Daniel: arguable, we can save all episodes but hard if we don't know the steps.
            #if eval_env:
            #    with open(os.path.join(logdir, 'all_eval_episode_rewards.pkl'), 'wb') as f:
            #        pickle.dump(all_eval_episode_rewards, f)

        # Daniel: we can use cycle or epoch for this if condition ... kind of annoying but w/e.
        if cycle % save_interval == 0:
            logger.info('We are now saving stuff!!')
            savepath = osp.join(checkdir, '%.5i' % epoch)
            logger.info('Saving model checkpoint to: ', savepath)
            agent.save(savepath)
            # ------------------------------------------------------------------
            # Daniel: extra stuff for debugging PPO on cloth, actions and infos for each episode.
            mb_actions = _sf01(np.asarray(mb_actions))
            act_savepath = osp.join(action_dir, 'actions_%.5i.pkl' % epoch)
            epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl' % epoch)
            with open(act_savepath, 'wb') as fh:
                pickle.dump(mb_actions, fh)
            with open(epi_savepath, 'wb') as fh:
                pickle.dump(mb_epinfos, fh)

        # Daniel: we were not resetting earlier. Actually there are other
        # epoch_stats which we might consider resetting here?
        epoch_episodes = 0

    return agent
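
A minimal usage sketch for this variant (not part of the original listing). It assumes a vectorized env `venv` built as in the sketch after Example #1, the 'mlp' network, and a hypothetical demonstrations pickle consumed by `_ddpg_demos`; the timestep count and loss weights are placeholder values. Note that this variant asserts seed == 1500 and reads env.num_envs.

agent = learn(network='mlp',
              env=venv,
              seed=1500,                        # this variant asserts seed == 1500
              total_timesteps=100000,
              demos_path='demos/teacher.pkl',   # hypothetical demonstrations path
              pretrain_epochs=5,                # pre-train on the demo data first
              bc_teacher_lambda=0.1,            # placeholder value
              save_interval=1)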