Example #1
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None # recurrent architectures not supported yet
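The constructor above relies on RunningMeanStd for observation and return normalization. As a point of reference, a minimal NumPy sketch of such a helper is shown below, using the standard parallel mean/variance update; the real baselines class also supports MPI synchronization and TensorFlow state, so treat this as an illustration rather than the upstream implementation. With it, normalize(obs, rms) is simply (obs - rms.mean) / rms.std, and denormalize inverts that.

import numpy as np

class RunningMeanStd:
    """Minimal sketch of a running mean/std tracker (parallel update, Chan et al.)."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # Merge the batch statistics into the running statistics.
        batch_mean, batch_var, batch_count = np.mean(x, axis=0), np.var(x, axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = self.var * self.count + batch_var * batch_count \
            + np.square(delta) * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total

    @property
    def std(self):
        return np.sqrt(self.var)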
Example #2
def run(env_id,
        seed,
        noise_type,
        layer_norm,
        evaluation,
        actor_lr,
        critic_lr,
        classifier_lr,
        dropout,
        rho_W=-4,
        rho_b=-4,
        entropy_coeff=1.0,
        g_step=20,
        timesteps_per_batch=1024,
        **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    #env = gym.make(env_id)
    env = make_env(env_id)
    env = bench.Monitor(env,
                        logger.get_dir()
                        and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        #eval_env = gym.make(env_id)
        eval_env = make_env(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    fifomemory = FIFOMemory(limit=int(64))  # TODO: customize choosing of limit
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    if 0 < dropout < 1:
        actor = NoiseDropoutActor(nb_actions,
                                  rho_W=rho_W,
                                  rho_b=rho_b,
                                  layer_norm=layer_norm,
                                  p=dropout)
    else:
        actor = NoiseActor(nb_actions,
                           rho_W=rho_W,
                           rho_b=rho_b,
                           layer_norm=layer_norm)
    classifier = Classifier(layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed_old = seed
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Build callback
    arg = {}
    arg['seed'] = seed_old
    arg['env_id'] = env_id
    arg['noise_type'] = noise_type
    arg['rhoW'] = rho_W
    arg['rhob'] = rho_b
    arg['entropy_coeff'] = entropy_coeff
    arg['actor_lr'] = actor_lr
    arg['critic_lr'] = critic_lr
    arg['classifier_lr'] = classifier_lr
    arg['dropout'] = dropout
    arg['gstep'] = g_step
    arg['timesteps_per_batch'] = timesteps_per_batch
    callback = CALLBACK(arg)

    # Train the agent; only rank 0 tracks wall-clock time for the final runtime log.
    if rank == 0:
        start_time = time.time()
    setup_and_learn(env=env,
                    eval_env=eval_env,
                    action_noise=action_noise,
                    actor=actor,
                    critic=critic,
                    classifier=classifier,
                    memory=memory,
                    fifomemory=fifomemory,
                    actor_lr=actor_lr,
                    critic_lr=critic_lr,
                    classifier_lr=classifier_lr,
                    callback=callback,
                    entropy_coeff=entropy_coeff,
                    g_step=g_step,
                    timesteps_per_batch=timesteps_per_batch,
                    **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
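Several of these examples repeat the same noise_type parsing loop. A standalone helper capturing that convention (exact-match variant) is sketched below; the noise classes are assumed to come from baselines.ddpg.noise, and the 'adaptive-param' case is omitted because this example only builds action noise.

import numpy as np
from baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

def parse_action_noise(noise_type, nb_actions):
    # Accepts comma-separated tokens such as 'none', 'normal_0.1' or 'ou_0.2'.
    action_noise = None
    for token in noise_type.split(','):
        token = token.strip()
        if token == 'none':
            continue
        name, stddev = token.split('_')
        sigma = float(stddev) * np.ones(nb_actions)
        if name == 'normal':
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=sigma)
        elif name == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=sigma)
        else:
            raise RuntimeError('unknown noise type "{}"'.format(token))
    return action_noise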
Example #3
def learn_setup(
        network,
        env,
        seed=None,
        total_timesteps=None,
        iterations=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=None,
        nb_rollout_steps=100,
        n_episodes=None,
        logspace=True,
        n_steps_per_episode=None,
        reward_threshold=0,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        noise_level="0.2",
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        exp_name="test",
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        **network_kwargs):
    if logspace:
        actor_lr = 10**-actor_lr
        critic_lr = 10**-critic_lr
        batch_size = 2**int(batch_size)
        if seed is None:
            seed = 17
        seed = int(seed)
        tau = 10**-tau
    set_global_seeds(seed)
    if nb_epoch_cycles is None:
        nb_epoch_cycles = n_episodes
        nb_rollout_steps = n_steps_per_episode
    else:
        input("Not using automated interface? ")

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:

                _, stddev = current_noise_type.split('_')
                stddev = noise_level
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    #print("actual max action", max_action)
    max_action = 1
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]
    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    epoch = 0
    start_time = time.time()
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    local_variables = {
        "epoch_episode_rewards": epoch_episode_rewards,
        "epoch_episode_steps": epoch_episode_steps,
        "batch_size": batch_size,
        "eval_env": eval_env,
        "reward_threshold": reward_threshold,
        "epoch_actions": epoch_actions,
        "nb_train_steps": nb_train_steps,
        "epoch_qs": epoch_qs,
        "start_time": start_time,
        "epoch_episodes": [epoch_episodes],
        "nb_epoch_cycles": nb_epoch_cycles,
        "nb_rollout_steps": nb_rollout_steps,
        "agent": agent,
        "memory": memory,
        "max_action": max_action,
        "env": env,
        "nenvs": nenvs,
        "obs": [obs],  #Forgive me 6.031
        "t": [t],
        "episode_reward": episode_reward,
        "episode_rewards_history": episode_rewards_history,
        "episode_step": episode_step,
        "episodes": [episodes],
        "rank": rank,
        "param_noise_adaption_interval": param_noise_adaption_interval,
        "noise_type": noise_type,
        "render": render
    }
    return local_variables
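learn_setup only builds state and returns it; the rollout loop that consumes the dictionary is not part of this example. A hypothetical consumer is sketched below, assuming the four-value step() signature of the DDPG class shown in Examples #1 and #4 and a Gym-style (possibly vectorized) env; the name run_rollout and the exact conversion of the returned action are assumptions, not part of the original code.

import numpy as np

def run_rollout(lv, nb_steps=None):
    # Hypothetical consumer of the dict returned by learn_setup (sketch only).
    agent, env = lv["agent"], lv["env"]
    obs = lv["obs"][0]
    for _ in range(nb_steps or lv["nb_rollout_steps"]):
        # step() here follows the (action, q, None, None) signature of the DDPG
        # class in Examples #1/#4; other variants may differ.
        action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
        action = np.asarray(action)
        new_obs, reward, done, _ = env.step(lv["max_action"] * action)
        agent.store_transition(obs, action, reward, new_obs, done)
        lv["epoch_qs"].append(q)
        lv["episode_reward"] += reward
        lv["t"][0] += 1
        obs = new_obs
    lv["obs"][0] = obs  # persist the latest observation for the next call
    return lv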
Example #4
class DDPG(tf.Module):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None # recurrent architectures not supported yet


    def setup_param_noise(self):
        assert self.param_noise is not None

        # Configure perturbed actor.
        self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape, name='param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)

        # Configure separate copy for stddev adoption.
        self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape, name='adaptive_param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1

    @tf.function
    def step(self, obs, apply_noise=True, compute_Q=True):
        normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms), self.observation_range[0], self.observation_range[1])
        actor_tf = self.actor(normalized_obs)
        if self.param_noise is not None and apply_noise:
            action = self.perturbed_actor(normalized_obs)
        else:
            action = actor_tf

        if compute_Q:
            normalized_critic_with_actor_tf = self.critic(normalized_obs, actor_tf)
            q = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            action += noise
        action = tf.clip_by_value(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        batch = self.memory.sample(batch_size=self.batch_size)
        obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1'])
        actions, rewards, terminals1 = tf.constant(batch['actions']), tf.constant(batch['rewards']), tf.constant(batch['terminals1'], dtype=tf.float32)
        normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(obs0, obs1, rewards, terminals1)

        if self.normalize_returns and self.enable_popart:
            old_mean = self.ret_rms.mean
            old_std = self.ret_rms.std
            self.ret_rms.update(target_Q.flatten())
            # renormalize Q outputs
            new_mean = self.ret_rms.mean
            new_std = self.ret_rms.std
            for vs in [self.critic.output_vars, self.target_critic.output_vars]:
                kernel, bias = vs
                kernel.assign(kernel * old_std / new_std)
                bias.assign((bias * old_std + old_mean - new_mean) / new_std)


        actor_grads, actor_loss = self.get_actor_grads(normalized_obs0)
        critic_grads, critic_loss = self.get_critic_grads(normalized_obs0, actions, target_Q)

        if MPI is not None:
            self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr)
            self.critic_optimizer.apply_gradients(critic_grads, self.critic_lr)
        else:
            self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
            self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

        return critic_loss, actor_loss

    @tf.function
    def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1):
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms), self.observation_range[0], self.observation_range[1])
        Q_obs1 = denormalize(self.target_critic(normalized_obs1, self.target_actor(normalized_obs1)), self.ret_rms)
        target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1
        return normalized_obs0, target_Q

    @tf.function
    def get_actor_grads(self, normalized_obs0):
        with tf.GradientTape() as tape:
            actor_tf = self.actor(normalized_obs0)
            normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
            critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
            actor_loss = -tf.reduce_mean(critic_with_actor_tf)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        if self.clip_norm:
            actor_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads]
        if MPI is not None:
            actor_grads = tf.concat([tf.reshape(g, (-1,)) for g in actor_grads], axis=0)
        return actor_grads, actor_loss

    @tf.function
    def get_critic_grads(self, normalized_obs0, actions, target_Q):
        with tf.GradientTape() as tape:
            normalized_critic_tf = self.critic(normalized_obs0, actions)
            normalized_critic_target_tf = tf.clip_by_value(normalize(target_Q, self.ret_rms), self.return_range[0], self.return_range[1])
            critic_loss = tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf))
            if self.critic_l2_reg > 0.:
                # Regularize every critic layer except the first (input) layer.
                for layer in self.critic.network_builder.layers[1:]:
                    # The original l2_regularizer takes half of sum square.
                    critic_loss += (self.critic_l2_reg / 2.)* tf.reduce_sum(tf.square(layer.kernel))
        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        if self.clip_norm:
            critic_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads]
        if MPI is not None:
            critic_grads = tf.concat([tf.reshape(g, (-1,)) for g in critic_grads], axis=0)
        return critic_grads, critic_loss


    def initialize(self):
        if MPI is not None:
            sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    @tf.function
    def update_target_net(self):
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)

    def get_stats(self):

        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        obs0 = self.stats_sample['obs0']
        actions = self.stats_sample['actions']
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        normalized_critic_tf = self.critic(normalized_obs0, actions)
        critic_tf = denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        actor_tf = self.actor(normalized_obs0)
        normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
        critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)

        stats = {}
        if self.normalize_returns:
            stats['ret_rms_mean'] = self.ret_rms.mean
            stats['ret_rms_std'] = self.ret_rms.std
        if self.normalize_observations:
            stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean)
            stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std)
        stats['reference_Q_mean'] = tf.reduce_mean(critic_tf)
        stats['reference_Q_std'] = reduce_std(critic_tf)
        stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf)
        stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf)
        stats['reference_action_mean'] = tf.reduce_mean(actor_tf)
        stats['reference_action_std'] = reduce_std(actor_tf)

        if self.param_noise:
            perturbed_actor_tf = self.perturbed_actor(normalized_obs0)
            stats['reference_perturbed_action_mean'] = tf.reduce_mean(perturbed_actor_tf)
            stats['reference_perturbed_action_std'] = reduce_std(perturbed_actor_tf)
            stats.update(self.param_noise.get_stats())
        return stats



    def adapt_param_noise(self, obs0):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        mean_distance = self.get_mean_distance(obs0).numpy()

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()

        self.param_noise.adapt(mean_distance)
        return mean_distance
Example #5
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    logging.basicConfig(filename='noGazebo_ddpg.log',
                        level=logging.DEBUG,
                        filemode="w")
    logging.getLogger().addHandler(logging.StreamHandler())

    # Configure logger for the process with rank 0 (main-process?)
    # MPI = Message Passing Interface, for parallel computing; rank = process identifier within a group of processes
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logging.debug(
            "I'm MPI worker {} and I guess I just log nothing".format(rank))
        logger.set_level(logger.DISABLED)
        logging.disable(logging.CRITICAL)

    logging.info(
        "********************************************* Starting RL algorithm *********************************************"
    )
    now = datetime.datetime.now()
    logging.info(now.isoformat())

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env,
                        logger.get_dir()
                        and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[0]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components. (initialize memory, critic & actor objects)
    logging.info("action space of env: {}".format(env.action_space))  # Box(2,)
    logging.info("observation space of env: {}".format(
        env.observation_space))  # Box(51200,)
    memory = Memory(limit=int(1e4),
                    action_shape=(env.action_space.shape[0], ),
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train the RL algorithm
    start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)

    # Training is done
    env.close()
    if eval_env is not None:
        eval_env.close()

    logger.info('total runtime: {}s'.format(time.time() - start_time))

    now = datetime.datetime.now()
    logging.info(now.isoformat())
    logging.info(
        "********************************************* End of RL algorithm *********************************************"
    )
    return True
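The 'adaptive-param' branch relies on AdaptiveParamNoiseSpec. Below is a rough sketch of what that object does, following the parameter-space-noise scheme of Plappert et al.; the exact attribute and coefficient names here are an assumption, not a copy of the baselines class.

class AdaptiveParamNoiseSpec:
    # Scales the parameter-noise stddev so that the action-space distance it
    # induces stays close to desired_action_stddev.
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, adaptation_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient  # too noisy -> shrink
        else:
            self.current_stddev *= self.adaptation_coefficient  # too tame -> grow

    def get_stats(self):
        return {'param_noise_stddev': self.current_stddev}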
Example #6
def run_baselines(env, seed, log_dir):
    """
    Create the baselines model and run training.

    Replace the ddpg and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the progress.csv file written by the logger.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    seed = seed + 1000000 * rank
    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger for baselines
    configure(dir=log_dir)
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        rank, seed, baselines_logger.get_dir()))

    # Set up params for baselines ddpg
    nb_actions = env.action_space.shape[-1]
    layer_norm = False

    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=float(params["sigma"]) * np.ones(nb_actions))
    memory = Memory(
        limit=params["replay_buffer_size"],
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    training.train(
        env=env,
        eval_env=None,
        param_noise=None,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        nb_epochs=params["n_epochs"],
        nb_epoch_cycles=params["n_epoch_cycles"],
        render_eval=False,
        reward_scale=1.,
        render=False,
        normalize_returns=False,
        normalize_observations=False,
        critic_l2_reg=0,
        actor_lr=params["policy_lr"],
        critic_lr=params["qf_lr"],
        popart=False,
        gamma=params["discount"],
        clip_norm=None,
        nb_train_steps=params["n_train_steps"],
        nb_rollout_steps=params["n_rollout_steps"],
        nb_eval_steps=100,
        batch_size=64)

    return osp.join(log_dir, "progress.csv")
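Note that run_baselines reads its hyperparameters from a module-level params dict that is not shown in this snippet. A purely illustrative stand-in is given below; the keys match the calls above, but every value is an assumption.

# Hypothetical stand-in for the params dict used by run_baselines above.
# Keys are taken from the function body; the values are placeholders only.
params = {
    "sigma": 0.2,                   # OU action-noise stddev
    "replay_buffer_size": int(1e6),
    "n_epochs": 500,
    "n_epoch_cycles": 20,
    "policy_lr": 1e-4,              # actor_lr
    "qf_lr": 1e-3,                  # critic_lr
    "discount": 0.99,               # gamma
    "n_train_steps": 50,
    "n_rollout_steps": 100,
}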
Example #7
def learn(
        network,
        env,
        data_path='',
        model_path='./model/',
        model_name='ddpg_none_fuzzy_150',
        file_name='test',
        model_based=False,
        memory_extend=False,
        model_type='linear',
        restore=False,
        dyna_learning=False,
        seed=None,
        nb_epochs=5,  # with default settings, perform 1M steps total
        nb_sample_cycle=5,
        nb_epoch_cycles=150,
        nb_rollout_steps=400,
        nb_model_learning=10,
        nb_sample_steps=50,
        nb_samples_extend=5,
        reward_scale=1.0,
        noise_type='normal_0.2',  #'adaptive-param_0.2',  ou_0.2, normal_0.2
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        batch_size=32,  # per MPI worker
        tau=0.01,
        param_noise_adaption_interval=50,
        **network_kwargs):

    nb_actions = env.action_space.shape[0]
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)

    if model_based:
        """ store fake_data"""
        fake_memory = Memory(limit=int(1e5),
                             action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)
        """ select model or not """
        if model_type == 'gp':
            kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
            dynamic_model = GaussianProcessRegressor(kernel=kernel)
            reward_model = GaussianProcessRegressor(kernel=kernel)
        elif model_type == 'linear':
            dynamic_model = LinearRegression()
            reward_model = LinearRegression()
        elif model_type == 'mlp':
            dynamic_model = MLPRegressor(hidden_layer_sizes=(100, ),
                                         activation='relu',
                                         solver='adam',
                                         alpha=0.0001,
                                         batch_size='auto',
                                         learning_rate='constant',
                                         learning_rate_init=0.001,
                                         power_t=0.5,
                                         max_iter=200,
                                         shuffle=True,
                                         random_state=None,
                                         tol=0.0001,
                                         verbose=False,
                                         warm_start=False,
                                         momentum=0.9,
                                         nesterovs_momentum=True,
                                         early_stopping=False,
                                         validation_fraction=0.1,
                                         beta_1=0.9,
                                         beta_2=0.999,
                                         epsilon=1e-08)
            reward_model = MLPRegressor(hidden_layer_sizes=(100, ),
                                        activation='relu',
                                        solver='adam',
                                        alpha=0.0001,
                                        batch_size='auto',
                                        learning_rate='constant',
                                        learning_rate_init=0.001,
                                        power_t=0.5,
                                        max_iter=200,
                                        shuffle=True,
                                        random_state=None,
                                        tol=0.0001,
                                        verbose=False,
                                        warm_start=False,
                                        momentum=0.9,
                                        nesterovs_momentum=True,
                                        early_stopping=False,
                                        validation_fraction=0.1,
                                        beta_1=0.9,
                                        beta_2=0.999,
                                        epsilon=1e-08)
        else:
            logger.info(
                "You need to give the model_type to fit the dynamic and reward!!!"
            )

    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)
    """ set noise """
    action_noise = None
    param_noise = None

    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))
    """action scale"""
    max_action = env.action_high_bound
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    """ agent ddpg """
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()

    if restore:
        agent.restore(sess, model_path, model_name)
    else:
        agent.initialize(sess)
        sess.graph.finalize()
    agent.reset()

    episodes = 0
    epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_states = []
    for epoch in range(nb_epochs):
        logger.info(
            "======================== The {} epoch start !!! ========================="
            .format(epoch))
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_times = []
        epoch_actions = []
        epoch_episode_states = []
        epoch_qs = []
        epoch_episodes = 0
        for cycle in range(nb_epoch_cycles):
            start_time = time.time()
            obs, state, done = env.reset()
            obs_reset = cp.deepcopy(obs)
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            logger.info(
                "================== The {} episode start !!! ==================="
                .format(cycle))
            for t_rollout in range(nb_rollout_steps):
                logger.info(
                    "================== The {} steps finish  !!! ==================="
                    .format(t_rollout))
                """ Predict next action """
                action, q, _, _ = agent.step(obs,
                                             stddev,
                                             apply_noise=True,
                                             compute_Q=True)

                new_obs, next_state, r, done, safe_or_not, final_action = env.step(
                    max_action * action, t_rollout)

                if safe_or_not is False:
                    break

                episode_reward += r
                episode_step += 1
                episode_states.append([
                    cp.deepcopy(state),
                    cp.deepcopy(final_action),
                    np.array(cp.deepcopy(r)),
                    cp.deepcopy(next_state)
                ])

                epoch_actions.append(action)
                epoch_qs.append(q)

                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs
                state = next_state

                if done:
                    break
                """ extend the memory """
                if model_based and cycle > (nb_model_learning +
                                            1) and memory_extend:
                    pred_x = np.zeros((1, 18), dtype=np.float32)
                    for j in range(nb_samples_extend):
                        m_action, _, _, _ = agent.step(obs,
                                                       stddev,
                                                       apply_noise=True,
                                                       compute_Q=False)
                        pred_x[:, :12] = obs
                        pred_x[:, 12:] = m_action
                        m_new_obs = dynamic_model.predict(pred_x)[0]
                        """ get real reward """
                        # state = env.inverse_state(m_new_obs)
                        # m_reward = env.get_reward(state, m_action)
                        m_reward = reward_model.predict(pred_x)[0]
                        agent.store_transition(obs, m_action, m_reward,
                                               m_new_obs, done)
            """ generate new data and fit model"""
            if model_based and cycle > nb_model_learning:
                logger.info(
                    "==============================  Model Fit !!! ==============================="
                )
                input_x = np.concatenate(
                    (memory.observations0.data[:memory.nb_entries],
                     memory.actions.data[:memory.nb_entries]),
                    axis=1)
                input_y_obs = memory.observations1.data[:memory.nb_entries]
                input_y_reward = memory.rewards.data[:memory.nb_entries]
                dynamic_model.fit(input_x, input_y_obs)
                reward_model.fit(input_x, input_y_reward)

                if dyna_learning:
                    logger.info(
                        "=========================  Collect data !!! ================================="
                    )
                    pred_obs = np.zeros((1, 18), dtype=np.float32)
                    for sample_index in range(nb_sample_cycle):
                        fake_obs = obs_reset
                        for t_episode in range(nb_sample_steps):
                            fake_action, _, _, _ = agent.step(fake_obs,
                                                              stddev,
                                                              apply_noise=True,
                                                              compute_Q=False)
                            pred_obs[:, :12] = fake_obs
                            pred_obs[:, 12:] = fake_action
                            next_fake_obs = dynamic_model.predict(pred_obs)[0]
                            fake_reward = reward_model.predict(pred_obs)[0]
                            # next_fake_obs = dynamic_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            # fake_reward = reward_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            fake_obs = next_fake_obs
                            fake_terminals = False
                            fake_memory.append(fake_obs, fake_action,
                                               fake_reward, next_fake_obs,
                                               fake_terminals)
            """ noise decay """
            stddev = float(stddev) * 0.95

            duration = time.time() - start_time
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            epoch_episode_times.append(cp.deepcopy(duration))
            epoch_episode_states.append(cp.deepcopy(episode_states))

            epochs_rewards[epoch, cycle] = episode_reward
            epochs_steps[epoch, cycle] = episode_step
            epochs_times[epoch, cycle] = cp.deepcopy(duration)

            logger.info(
                "============================= The Episode_Times:: {}!!! ============================"
                .format(epoch_episode_rewards))
            logger.info(
                "============================= The Episode_Times:: {}!!! ============================"
                .format(epoch_episode_times))

            epoch_episodes += 1
            episodes += 1
            """ Training process """
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                logger.info("")
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()
            """ planning training """
            if model_based and cycle > (nb_model_learning +
                                        1) and dyna_learning:
                for t_train in range(nb_train_steps):
                    # setting for adapt param noise, if necessary.
                    if fake_memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    batch = fake_memory.sample(batch_size=batch_size)
                    fake_cl, fake_al = agent.train_fake_data(batch)
                    epoch_critic_losses.append(fake_cl)
                    epoch_actor_losses.append(fake_al)
                    agent.update_target_net()

        epochs_states.append(cp.deepcopy(epoch_episode_states))

        # # save data
        np.save(
            data_path + 'train_reward_' + algorithm_name + '_' + noise_type +
            file_name, epochs_rewards)
        np.save(
            data_path + 'train_step_' + algorithm_name + '_' + noise_type +
            file_name, epochs_steps)
        np.save(
            data_path + 'train_states_' + algorithm_name + '_' + noise_type +
            file_name, epochs_states)
        np.save(
            data_path + 'train_times_' + algorithm_name + '_' + noise_type +
            file_name, epochs_times)

    # # agent save
    agent.store(model_path + 'train_model_' + algorithm_name + '_' +
                noise_type + file_name)
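The model-based branch above hard-codes an 18-dimensional model input: a 12-dimensional observation concatenated with a 6-dimensional action, per the pred_x[:, :12] / pred_x[:, 12:] split. A small helper that makes this packing explicit might look as follows; the name predict_transition is hypothetical, while the dimensions are taken from that code and are specific to this environment.

import numpy as np

OBS_DIM, ACT_DIM = 12, 6  # implied by the hard-coded (1, 18) model input above

def predict_transition(dynamic_model, reward_model, obs, action):
    # Pack (obs, action) exactly as the rollout loop does and query both models.
    x = np.concatenate([np.asarray(obs, dtype=np.float32).reshape(1, OBS_DIM),
                        np.asarray(action, dtype=np.float32).reshape(1, ACT_DIM)], axis=1)
    next_obs = dynamic_model.predict(x)[0]
    reward = reward_model.predict(x)[0]
    return next_obs, reward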
Example #8
    def train(self,
              env_fn,
              num_timesteps,
              noise_type,
              layer_norm,
              folder,
              load_policy,
              video_width,
              video_height,
              plot_rewards,
              save_every=50,
              seed=1234,
              episode_length=1000,
              pi_hid_size=150,
              pi_num_hid_layers=3,
              render_frames=_render_frames,
              **kwargs):
        num_cpu = self.workers
        if sys.platform == 'darwin':
            num_cpu //= 2
        config = tf.ConfigProto(
            allow_soft_placement=True,
            intra_op_parallelism_threads=num_cpu,
            inter_op_parallelism_threads=num_cpu)

        if self.gpu_usage is None or self.gpu_usage <= 0.:
            os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        else:
            config.gpu_options.allow_growth = True  # pylint: disable=E1101
            config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers
        tf.Session(config=config).__enter__()

        worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(worker_seed)

        tf.set_random_seed(worker_seed)
        np.random.seed(worker_seed)

        save_every = max(1, save_every)

        env = env_fn()
        env.seed(worker_seed)

        rank = MPI.COMM_WORLD.Get_rank()
        logger.info('rank {}: seed={}, logdir={}'.format(rank, worker_seed,
                                                         logger.get_dir()))

        def policy_fn(name, ob_space, ac_space):
            return mlp_policy.MlpPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=pi_hid_size,
                num_hid_layers=pi_num_hid_layers)

        env = bench.Monitor(
            env,
            logger.get_dir() and osp.join(logger.get_dir(), str(rank)),
            allow_early_resets=True)
        gym.logger.setLevel(logging.INFO)

        that = self

        iter_name = 'iters_so_far'
        if self.method == 'sql':
            iter_name = 'epoch'
        # TODO replace with utils.create_callback(...)
        def callback(locals, globals):
            if that.method != "ddpg":
                if load_policy is not None and locals[iter_name] == 0:
                    # noinspection PyBroadException
                    try:
                        utils.load_state(load_policy)
                        if MPI.COMM_WORLD.Get_rank() == 0:
                            logger.info("Loaded policy network weights from %s." % load_policy)
                            # save TensorFlow summary (contains at least the graph definition)
                    except:
                        logger.error("Failed to load policy network weights from %s." % load_policy)
                if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                    _ = tf.summary.FileWriter(folder, tf.get_default_graph())
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
                print('Saving video and checkpoint for policy at iteration %i...' %
                      locals[iter_name])
                ob = env.reset()
                images = []
                rewards = []
                max_reward = 1.  # if any reward > 1, we have to rescale
                lower_part = video_height // 5
                for i in range(episode_length):
                    if that.method == "ddpg":
                        ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                    elif that.method == "sql":
                        ac, _ = locals['policy'].get_action(ob)
                    elif isinstance(locals['pi'], GaussianMlpPolicy):
                        ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                    else:
                        ac, _ = locals['pi'].act(False, ob)
                    ob, rew, new, _ = env.step(ac)
                    images.append(render_frames(env))
                    if plot_rewards:
                        rewards.append(rew)
                        max_reward = max(rew, max_reward)
                    if new:
                        break

                orange = np.array([255, 163, 0])
                red = np.array([255, 0, 0])
                video = []
                width_factor = 1. / episode_length * video_width
                for i, imgs in enumerate(images):
                    for img in imgs:
                        img[-lower_part, :10] = orange
                        img[-lower_part, -10:] = orange
                        if episode_length < video_width:
                            p_rew_x = 0
                            for j, r in enumerate(rewards[:i]):
                                rew_x = int(j * width_factor)
                                if r < 0:
                                    img[-1:, p_rew_x:rew_x] = red
                                else:
                                    rew_y = int(r / max_reward * lower_part)
                                    img[-rew_y - 1:, p_rew_x:rew_x] = orange
                                p_rew_x = rew_x
                        else:
                            for j, r in enumerate(rewards[:i]):
                                rew_x = int(j * width_factor)
                                if r < 0:
                                    img[-1:, rew_x] = red
                                else:
                                    rew_y = int(r / max_reward * lower_part)
                                    img[-rew_y - 1:, rew_x] = orange
                    video.append(np.hstack(imgs))

                imageio.mimsave(
                    os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" %
                                 (that.environment, that.method, locals[iter_name])),
                    video,
                    fps=60)
                env.reset()

                if that.method != "ddpg":
                    utils.save_state(os.path.join(that.folder, "checkpoints", "%s_%i" %
                                                 (that.environment, locals[iter_name])))

        if self.method == "ppo":
            pposgd_simple.learn(
                env,
                policy_fn,
                max_timesteps=int(num_timesteps),
                timesteps_per_actorbatch=1024,  # 256
                clip_param=0.2,
                entcoeff=0.01,
                optim_epochs=4,
                optim_stepsize=1e-3,  # 1e-3
                optim_batchsize=64,
                gamma=0.99,
                lam=0.95,
                schedule='linear',  # 'linear'
                callback=callback)
        elif self.method == "trpo":
            trpo_mpi.learn(
                env,
                policy_fn,
                max_timesteps=int(num_timesteps),
                timesteps_per_batch=1024,
                max_kl=0.1,  # 0.01
                cg_iters=10,
                cg_damping=0.1,
                gamma=0.99,
                lam=0.98,
                vf_iters=5,
                vf_stepsize=1e-3,
                callback=callback)
        elif self.method == "acktr":
            from algos.acktr import acktr
            with tf.Session(config=tf.ConfigProto()):
                ob_dim = env.observation_space.shape[0]
                ac_dim = env.action_space.shape[0]
                with tf.variable_scope("vf"):
                    vf = NeuralNetValueFunction(ob_dim, ac_dim)
                with tf.variable_scope("pi"):
                    policy = GaussianMlpPolicy(ob_dim, ac_dim)
                acktr.learn(
                    env,
                    pi=policy,
                    vf=vf,
                    gamma=0.99,
                    lam=0.97,
                    timesteps_per_batch=1024,
                    desired_kl=0.01,  # 0.002
                    num_timesteps=num_timesteps,
                    animate=False,
                    callback=callback)
        elif self.method == "ddpg":
            from algos.ddpg import ddpg
            # Parse noise_type
            action_noise = None
            param_noise = None
            nb_actions = env.action_space.shape[-1]
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    from baselines.ddpg.noise import AdaptiveParamNoiseSpec
                    param_noise = AdaptiveParamNoiseSpec(
                        initial_stddev=float(stddev),
                        desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    from baselines.ddpg.noise import NormalActionNoise
                    action_noise = NormalActionNoise(
                        mu=np.zeros(nb_actions),
                        sigma=float(stddev) * np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
                    _, stddev = current_noise_type.split('_')
                    action_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(nb_actions),
                        sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError(
                        'unknown noise type "{}"'.format(current_noise_type))

            # Configure components.
            memory = Memory(
                limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
            critic = Critic(layer_norm=layer_norm)
            actor = Actor(nb_actions, layer_norm=layer_norm)

            ddpg.train(
                env=env,
                eval_env=None,
                param_noise=param_noise,
                render=False,
                render_eval=False,
                action_noise=action_noise,
                actor=actor,
                critic=critic,
                memory=memory,
                callback=callback,
                **kwargs)
        elif self.method == "sql":
            from softqlearning.algorithms import SQL
            from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel
            from softqlearning.misc.utils import timestamp
            from softqlearning.replay_buffers import SimpleReplayBuffer
            from softqlearning.value_functions import NNQFunction
            from softqlearning.policies import StochasticNNPolicy

            from rllab.envs.gym_env import GymEnv

            env = GymEnv(env)

            variant = {
                'seed': [1, 2, 3],
                'policy_lr': 3E-4,
                'qf_lr': 3E-4,
                'discount': 0.99,
                'layer_size': 128,
                'batch_size': 128,
                'max_pool_size': 1E6,
                'n_train_repeat': 1,
                'epoch_length': 1000,
                'snapshot_mode': 'last',
                'snapshot_gap': 100,
            }

            pool = SimpleReplayBuffer(
                env_spec=env.spec,
                max_replay_buffer_size=variant['max_pool_size'],
            )

            base_kwargs = dict(
                min_pool_size=episode_length,
                epoch_length=episode_length,
                n_epochs=num_timesteps,
                max_path_length=episode_length,
                batch_size=variant['batch_size'],
                n_train_repeat=variant['n_train_repeat'],
                eval_render=False,
                eval_n_episodes=1,
                iter_callback=callback
            )

            qf = NNQFunction(
                env_spec=env.spec,
                hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers),
            )

            pi_layers = tuple([pi_hid_size] * pi_num_hid_layers)
            policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers)

            algorithm = SQL(
                base_kwargs=base_kwargs,
                env=env,
                pool=pool,
                qf=qf,
                policy=policy,
                kernel_fn=adaptive_isotropic_gaussian_kernel,
                kernel_n_particles=32,
                kernel_update_ratio=0.5,
                value_n_particles=16,
                td_target_update_interval=1000,
                qf_lr=variant['qf_lr'],
                policy_lr=variant['policy_lr'],
                discount=variant['discount'],
                reward_scale=1,
                save_full_state=False,
            )

            algorithm.train()
        else:
            print('ERROR: Invalid "method" argument provided.', file=sys.stderr)
        env.close()
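
Note: every DDPG-style example in this collection parses the same noise_type convention, a comma-separated list such as 'adaptive-param_0.2,ou_0.1' where the suffix after '_' is the stddev. A minimal standalone sketch of that parser, assuming the baselines.ddpg.noise classes already used above (the helper name parse_noise is illustrative only, not part of the example):

import numpy as np
from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise


def parse_noise(noise_type, nb_actions):
    """Map a spec string like 'adaptive-param_0.2,ou_0.1' to noise objects."""
    action_noise, param_noise = None, None
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            continue
        name, _, stddev = spec.partition('_')
        stddev = float(stddev)
        if name == 'adaptive-param':
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=stddev, desired_action_stddev=stddev)
        elif name == 'normal':
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=stddev * np.ones(nb_actions))
        elif name == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=stddev * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))
    return action_noise, param_noise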
Example #9
    def __init__(self, env, network,
                 param_noise_n=None,
                 action_noise_n=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.,
                 shared_critic=False,
                 **network_kwargs):

        # todo clean the init process later

        nb_actions_n = [action_space.shape[-1] for action_space in env.action_space]

        memory = Memory(limit=int(1e5), action_shape=env.action_space_n_shape,
                        observation_shape=env.observation_space_n_shape, reward_shape=env.reward_shape,
                        terminal_shape=env.terminal_shape)
        if not shared_critic:
            critic_n = [Critic(name='critic_%d' % i, network=network, **network_kwargs)
                        for i in range(env.n)]
        else:
            critic_n = [Critic(network=network, **network_kwargs)]
        actor_n = [Actor(nb_actions_n[i], name='actor_%d' % i, network=network, **network_kwargs) for i in range(env.n)]

        self.n = env.n
        self.observation_shape = env.observation_space
        self.action_shape = env.action_space
        self.observation_shape_n = env.observation_space_n_shape
        self.action_shape_n = env.action_space_n_shape
        self.reward_scale = reward_scale
        self.memory = memory
        self.batch_size = batch_size
        self.agents = []
        self.normalize_returns = normalize_returns
        self.enable_popart = enable_popart
        self.normalize_observations = normalize_observations
        if shared_critic:
            for i, (actor, param_noise, action_noise, obs_shape, act_shape) in enumerate(
                    zip(actor_n, param_noise_n, action_noise_n, self.observation_shape, self.action_shape)):
                self.agents.append(
                    Agent(actor, critic_n[0], memory, obs_shape.shape, act_shape.shape, self.observation_shape_n,
                          self.action_shape_n, param_noise,
                          action_noise,
                          gamma, tau, normalize_returns, enable_popart, normalize_observations,
                          batch_size, observation_range, action_range, return_range, critic_l2_reg,
                          actor_lr, critic_lr, clip_norm, reward_scale, id=i))
        else:
            for i, (actor, critic, param_noise, action_noise, obs_shape, act_shape) in enumerate(
                    zip(actor_n, critic_n, param_noise_n, action_noise_n, self.observation_shape, self.action_shape)):
                self.agents.append(
                    Agent(actor, critic, memory, obs_shape.shape, act_shape.shape, self.observation_shape_n,
                          self.action_shape_n,
                          param_noise,
                          action_noise,
                          gamma, tau, normalize_returns, enable_popart, normalize_observations,
                          batch_size, observation_range, action_range, return_range, critic_l2_reg,
                          actor_lr, critic_lr, clip_norm, reward_scale, id=i))

        # self.sess_n = [U.single_threaded_session() for _ in self.agents]
        self.observation_range = observation_range

        self.obs0_n = tf.placeholder(tf.float32, shape=(None,) + self.observation_shape_n, name='obs0_n')
        self.obs1_n = tf.placeholder(tf.float32, shape=(None,) + self.observation_shape_n, name='obs1_n')
        self.actions_n = tf.placeholder(tf.float32, shape=(None,) + self.action_shape_n, name='actions_n')
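
The constructor above relies on a multi-agent environment that exposes per-agent spaces plus joint (concatenated) shapes. A dummy sketch of that interface, inferred from the attribute names used above; the concrete dimensions and the Box spaces are assumptions for illustration only:

import numpy as np
from gym import spaces


class DummyMultiAgentEnv(object):
    """Illustrative stand-in for the env interface the constructor expects."""

    def __init__(self, n=2, obs_dim=4, act_dim=2):
        self.n = n  # number of agents
        self.observation_space = [spaces.Box(-np.inf, np.inf, shape=(obs_dim,), dtype=np.float32) for _ in range(n)]
        self.action_space = [spaces.Box(-1., 1., shape=(act_dim,), dtype=np.float32) for _ in range(n)]
        # Joint (concatenated) shapes used by the shared replay memory.
        self.observation_space_n_shape = (n * obs_dim,)
        self.action_space_n_shape = (n * act_dim,)
        self.reward_shape = (n,)
        self.terminal_shape = (n,)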
Example #10
def run(seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the opensim env.
    train_env = prosthetics_env.Wrapper(
        osim_env.ProstheticsEnv(visualize=kwargs['render']),
        frameskip=kwargs['frameskip'],
        reward_shaping=kwargs['reward_shaping'],
        reward_shaping_x=kwargs['reward_shaping_x'],
        feature_embellishment=kwargs['feature_embellishment'],
        relative_x_pos=kwargs['relative_x_pos'],
        relative_z_pos=kwargs['relative_z_pos'])
    train_env.change_model(model=kwargs['model'].upper(),
                           prosthetic=kwargs['prosthetic'],
                           difficulty=kwargs['difficulty'],
                           seed=seed)
    if rank == 0:
        train_env = bench.Monitor(train_env, None)
    else:
        train_env = bench.Monitor(
            train_env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation:
        eval_env = prosthetics_env.EvaluationWrapper(
            osim_env.ProstheticsEnv(visualize=kwargs['render_eval']),
            frameskip=kwargs['eval_frameskip'],
            reward_shaping=kwargs['reward_shaping'],
            reward_shaping_x=kwargs['reward_shaping_x'],
            feature_embellishment=kwargs['feature_embellishment'],
            relative_x_pos=kwargs['relative_x_pos'],
            relative_z_pos=kwargs['relative_z_pos'])
        eval_env.change_model(model=kwargs['model'].upper(),
                              prosthetic=kwargs['prosthetic'],
                              difficulty=kwargs['difficulty'],
                              seed=seed)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
    else:
        eval_env = None

    # training.train() doesn't like the extra keyword args added for controlling the prosthetics env, so remove them.
    del kwargs['model']
    del kwargs['prosthetic']
    del kwargs['difficulty']
    del kwargs['reward_shaping_x']
    del kwargs['frameskip']
    del kwargs['eval_frameskip']
    del kwargs['crowdai_submit']
    del kwargs['eval_only']

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = train_env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    actor_layer_sizes = [
        int(x) for x in kwargs['actor_layer_sizes'].replace('[', '').replace(
            ']', '').split(',')
    ]
    critic_layer_sizes = [
        int(x) for x in kwargs['critic_layer_sizes'].replace('[', '').replace(
            ']', '').split(',')
    ]
    del kwargs['actor_layer_sizes']
    del kwargs['critic_layer_sizes']
    logger.info('actor_layer_sizes: {}'.format(actor_layer_sizes))
    logger.info('critic_layer_sizes: {}'.format(critic_layer_sizes))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=train_env.action_space.shape,
                    observation_shape=train_env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm,
                    activation=kwargs['activation'],
                    layer_sizes=critic_layer_sizes)
    actor = Actor(nb_actions,
                  layer_norm=layer_norm,
                  activation=kwargs['activation'],
                  layer_sizes=actor_layer_sizes)

    del kwargs['activation']

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    train_env.seed(seed)
    if eval_env:
        eval_env.seed(seed)

    # Record the start time on the rank-0 process.
    if rank == 0:
        start_time = time.time()
    training.train(env=train_env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    train_env.close()
    if eval_env:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
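
The layer-size arguments above arrive as strings such as '[64,64]' and are parsed by stripping the brackets and splitting on commas. An optional, slightly more robust alternative (a sketch, not what the example uses) is ast.literal_eval:

import ast


def parse_layer_sizes(arg):
    """Parse '[64,64]' (or '[64, 64]') into a list of ints."""
    return [int(x) for x in ast.literal_eval(arg)]

# parse_layer_sizes('[64,64]') -> [64, 64]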
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    print(
        "********************************************* Starting RL algorithm *********************************************"
    )
    # Configure the logger only for the rank-0 (main) process.
    # MPI = Message Passing Interface, used for parallel computing; rank identifies a process within the group.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    # 4 action dimensions: roll & pitch angle of the robot arm are kept fixed.
    nb_actions = env.action_space.shape[-1] - 2
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components. (initialize memory, critic & actor objects)
    # print(env.action_space) # Box(6,)
    # print(env.observation_space) # Box(220, 300)
    # print(env.observation_space.shape) # (220, 300)
    # print(env.observation_space.shape[0]) # 220
    memory = Memory(limit=int(1e3),
                    action_shape=(env.action_space.shape[0] - 2, ),
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train the RL algorithm
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)

    # Training is done
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))

    return True
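
This variant trains on nb_actions = action_space.shape[-1] - 2 dimensions because roll and pitch of the robot arm are kept fixed. How the two fixed angles are re-inserted before env.step() is not shown in the snippet; a purely hypothetical padding helper could look like this (the position and values of the fixed entries are assumptions):

import numpy as np


def pad_action(learned_action, fixed_roll=0.0, fixed_pitch=0.0):
    """Append the two fixed angles to the learned action vector (hypothetical layout)."""
    return np.concatenate([learned_action, [fixed_roll, fixed_pitch]])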
Example #12
        pass
    elif 'adaptive-param' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    elif 'normal' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    elif 'ou' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    else:
        raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

# Configure components of DDPG
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=args.layer_norm)
actor = Actor(nb_actions, layer_norm=args.layer_norm)
# Seed everything to make things reproducible.
seed = args.seed + 1000000 * rank
logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
# tf.reset_default_graph()
set_global_seeds(seed)
env.seed(seed)
if eval_env is not None:
    eval_env.seed(seed)

# Record the start time on the rank-0 process.
if rank == 0:
    start_time = time.time()

# Derive the different numbers for the training process
Example #13
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    ######################################### DEFAULT DATA #######################################
    history, abbreviation = read_stock_history(filepath='utils/datasets/stocks_history_target.h5')
    history = history[:, :, :4]
    history[:, 1:, 0] = history[:, 0:-1, 3] # correct opens
    target_stocks = abbreviation
    num_training_time = 1095

    # get target history
    target_history = np.empty(shape=(len(target_stocks), num_training_time, history.shape[2]))
    for i, stock in enumerate(target_stocks):
        target_history[i] = history[abbreviation.index(stock), :num_training_time, :]
    print("target:", target_history.shape)

    testing_stocks = abbreviation
    test_history = np.empty(shape=(len(testing_stocks), history.shape[1] - num_training_time,
                                   history.shape[2]))
    for i, stock in enumerate(testing_stocks):
        test_history[i] = history[abbreviation.index(stock), num_training_time:, :]
    print("test:", test_history.shape)

    window_length = kwargs['window_length']
    max_rollout_steps = kwargs['nb_rollout_steps']

    ###############################################################################################

    train_env = PortfolioEnv(target_history, 
                             target_stocks, 
                             steps=min(max_rollout_steps, target_history.shape[1]-window_length-2), 
                             window_length=window_length)
    infer_train_env = PortfolioEnv(target_history, 
                                   target_stocks, 
                                   steps=target_history.shape[1]-window_length-2,
                                   window_length=window_length)
    infer_test_env = PortfolioEnv(test_history, 
                                  testing_stocks, 
                                  steps=test_history.shape[1]-window_length-2, 
                                  window_length=window_length)
    kwargs['nb_eval_steps'] = infer_train_env.steps    
    kwargs['nb_eval_test_steps'] = infer_test_env.steps

    print("SPACE:", train_env.observation_space.shape)

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = train_env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=train_env.action_space.shape, observation_shape=train_env.observation_space.shape)
    critic = Critic(nb_actions, layer_norm=layer_norm, asset_features_shape=train_env.asset_features_shape)
    actor = Actor(nb_actions, layer_norm=layer_norm, asset_features_shape=train_env.asset_features_shape)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    train_env.seed(seed)
    infer_train_env.seed(seed)
    infer_test_env.seed(seed)

    # Record the start time on the rank-0 process.
    if rank == 0:
        start_time = time.time()
    training.train(env=train_env, train_eval_env=infer_train_env, test_eval_env=infer_test_env,
                   param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    train_env.close()
    infer_train_env.close()
    infer_test_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
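
For reference, the data preparation above splits each stock's history along the time axis: the first num_training_time steps feed the training environment, the remainder the test environment, and an episode can run for at most time_steps - window_length - 2 steps. A tiny sketch with dummy shapes:

import numpy as np

history = np.random.rand(16, 1500, 4)  # (stocks, time steps, OHLC features) -- dummy data
num_training_time = 1095
window_length = 50

train_part = history[:, :num_training_time, :]   # shape (16, 1095, 4)
test_part = history[:, num_training_time:, :]    # shape (16, 405, 4)
max_train_steps = train_part.shape[1] - window_length - 2  # longest possible training episode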
Example #14
def run(env_id, seed, noise_type, layer_norm, evaluation, outdir, no_hyp,
        **kwargs):
    params = locals()
    # Configure things.
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0: logger.set_level(logger.DISABLED)
    rank = 0
    # Create envs.
    env = make_env(env_id)
    weight_file = kwargs.pop('weight_file')
    if not weight_file:
        outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id)
    else:
        outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id, 'eval')
    logger.configure(outdir)
    os.makedirs(outdir, exist_ok=True)

    env = bench.Monitor(env, os.path.join(outdir, "%i.monitor.json" % rank))
    gym.logger.setLevel(logging.WARN)
    logger.info('Output directory:{}, env:{}, no_hyp:{}'.format(
        outdir, env_id, no_hyp))
    if evaluation:
        eval_env = make_env(env_id)
        eval_env.seed(42)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        # env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)

    # critic = models.ConvCritic(layer_norm=layer_norm)
    # actor = models.ConvActor(nb_actions, layer_norm=layer_norm, no_hyp=no_hyp)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm, no_hyp=no_hyp)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    # set_global_seeds(seed)
    # env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Record the start time on the rank-0 process.
    if rank == 0:
        start_time = time.time()

    if weight_file:
        evaluate(
            env,
            nb_episodes=kwargs.get('nb_epochs', 100),
            reward_scale=kwargs.get('reward_scale'),
            render=kwargs.get('render'),
            param_noise=None,
            action_noise=None,
            actor=actor,
            critic=critic,
            critic_l2_reg=kwargs.get('critic_l2_reg'),
            memory=memory,
            weight_file=weight_file,
        )
    else:
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       outdir=outdir,
                       no_hyp=no_hyp,
                       **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #15
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    # env = gym.make(env_id)
    # env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        # Note: the training env is created below (TradingStateModel), so it is not wrapped in bench.Monitor here.
    else:
        eval_env = None

    #dc = TestContainer(num_assets=3, num_samples=20000)
    dc = BitcoinTestContainer(csv_file_name='../../../data/csvs/output.csv')
    env = TradingStateModel(datacontainer=dc,
                            episode_length=kwargs['nb_rollout_steps'],
                            is_training=True,
                            commission_percentage=COMMISSION_PERCENTAGE)

    # Parse noise_type
    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.datacontainer.num_assets
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(num_asset_features=env.datacontainer.total_asset_features,
                    num_actions=env.datacontainer.num_assets,
                    asset_features_shape=env.asset_features_shape,
                    portfolio_features_shape=env.portfolio_features_shape,
                    layer_norm=layer_norm)
    actor = Actor(nb_actions,
                  num_asset_features=env.datacontainer.total_asset_features,
                  num_actions=env.datacontainer.num_assets,
                  asset_features_shape=env.asset_features_shape,
                  portfolio_features_shape=env.portfolio_features_shape,
                  layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    # env.seed(seed)
    # if eval_env is not None:
    #     eval_env.seed(seed)

    # Record the start time on the rank-0 process.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   tensorboard_directory='./tensorboard_' +
                   str(COMMISSION_PERCENTAGE),
                   infer_directory='./infer_ims_' + str(COMMISSION_PERCENTAGE),
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
                agent.update_target_net()
    return agent


seed = 8902077161928034768
env = gym.make("MountainCarContinuous-v0")
env.seed(seed)
sess = U.make_session(num_cpu=1).__enter__()
nb_actions = env.action_space.shape[-1]
layer_norm = True
param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                     desired_action_stddev=float(0.2))
memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)

agent = train_return(env=env,
                     actor=actor,
                     critic=critic,
                     memory=memory,
                     param_noise=param_noise)
max_action = env.action_space.high

from gym import spaces


def compute_traj(**kwargs):
    env.reset()
    if 'init_state' in kwargs:
Example #17
def main():
    args = parse_args()
    logger.configure()
    gamma = 0.99
    tau = 0.01
    normalize_returns = False
    normalize_observations = True
    batch_size = 64
    action_noise = None
    stddev = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                         desired_action_stddev=float(stddev))
    critic_l2_reg = 1e-2
    actor_lr = 1e-4
    critic_lr = 1e-3
    popart = False
    clip_norm = None
    reward_scale = 1.

    env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False),
                                  frameskip=4,
                                  reward_shaping=True,
                                  reward_shaping_x=1,
                                  feature_embellishment=True,
                                  relative_x_pos=True,
                                  relative_z_pos=True)

    top_model_dir = 'top-models/'

    # create tf sessions and graphs
    sess_list = []
    graph_list = []
    for i in range(len(args.model_files)):
        graph_list.append(tf.Graph())
        sess_list.append(tf.Session(graph=graph_list[i]))
    ddpg_agents = []
    for i in range(len(args.model_files)):
        model_name = args.model_files[i]
        sess = sess_list[i]
        graph = graph_list[i]
        l_size = args.layer_sizes[i]
        with sess.as_default():
            # with U.make_session(num_cpu=1, graph=g) as sess:
            with graph.as_default():
                # tf.global_variables_initializer()

                # restore agents from model files and store in ddpg_agents
                print("Restoring from..." + model_name)

                # Configure components.
                memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                                observation_shape=env.observation_space.shape)
                critic = Critic(layer_norm=True, activation='relu', layer_sizes=[l_size, l_size])
                actor = Actor(env.action_space.shape[-1], layer_norm=True,
                              activation='relu', layer_sizes=[l_size, l_size])
                agent = DDPG(actor, critic, memory, env.observation_space.shape,
                             env.action_space.shape, gamma=gamma, tau=tau,
                             normalize_returns=normalize_returns,
                             normalize_observations=normalize_observations,
                             batch_size=batch_size, action_noise=action_noise,
                             param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                             actor_lr=actor_lr, critic_lr=critic_lr,
                             enable_popart=popart, clip_norm=clip_norm,
                             reward_scale=reward_scale)

                # restore adam state and param noise
                restore_model_path = top_model_dir + model_name
                saver = tf.train.Saver(max_to_keep=500)

                # restore network weights
                saver.restore(sess, restore_model_path)

                adam_optimizer_store = pickle.load(open(restore_model_path
                                                        + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']

                # Initialize and prepare the agent session.
                agent.initialize(sess)
                #sess.graph.finalize()
                agent.reset()

                ddpg_agents.append(agent)

    agent = BlendedAgent(ddpg_agents, sess_list, graph_list)

    if args.evaluation:
        # setup eval env
        eval_env = prosthetics_env.EvaluationWrapper(osim_env.ProstheticsEnv(visualize=False),
                                                     frameskip=4,
                                                     reward_shaping=True,
                                                     reward_shaping_x=1,
                                                     feature_embellishment=True,
                                                     relative_x_pos=True,
                                                     relative_z_pos=True)
        eval_env.change_model(model=('3D').upper(), prosthetic=True, difficulty=0, seed=0)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))

        nb_eval_steps = 1000
        # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents, sess_list, graph_list,
        #                                                    nb_eval_steps=nb_eval_steps,
        #                                                    render=False)
        reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent, nb_eval_steps, render=False)
        print("Reward: " + str(reward))
        print("Mean Q: " + str(mean_q))
        print("Final num steps: " + str(final_steps))

    # Submit to crowdai competition. What a hack. :)
    # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
    crowdai_submit_count = 0
    if args.crowdai_submit:
        remote_base = "http://grader.crowdai.org:1729"
        crowdai_client = Client(remote_base)
        eval_obs_dict = crowdai_client.env_create(args.crowdai_token, env_id="ProstheticsEnv")
        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
            eval_obs_dict,
            reward_shaping=True,
            reward_shaping_x=1.,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        while True:
            action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
            submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
            clipped_submit_action = np.clip(submit_action, 0., 1.)
            actions_equal = clipped_submit_action == submit_action
            if not np.all(actions_equal):
                logger.debug("crowdai_submit_count:", crowdai_submit_count)
                logger.debug("  openai-action:", action)
                logger.debug("  submit-action:", submit_action)
            crowdai_submit_count += 1
            [eval_obs_dict, reward, done, info] = crowdai_client.env_step(clipped_submit_action.tolist(), True)
            # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                eval_obs_dict,
                reward_shaping=True,
                reward_shaping_x=1.,
                feature_embellishment=True,
                relative_x_pos=True,
                relative_z_pos=True)
            if done:
                logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                eval_obs_dict = crowdai_client.env_reset()
                if not eval_obs_dict:
                    break
                logger.debug("done: eval_obs_dict exists after reset")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=True,
                    reward_shaping_x=1.,
                    feature_embellishment=True,
                    relative_x_pos=True,
                    relative_z_pos=True)
        crowdai_client.submit()

    for i in range(len(sess_list)):
        sess_list[i].close()
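
BlendedAgent itself is not defined in this snippet. Based only on how agent.pi() is called above, a minimal sketch of such an ensemble wrapper might average the restored agents' actions; this is an assumption, not the actual class:

import numpy as np


class BlendedAgentSketch(object):
    """Hypothetical ensemble wrapper: averages the actions of several restored DDPG agents."""

    def __init__(self, agents, sessions, graphs):
        self.agents, self.sessions, self.graphs = agents, sessions, graphs

    def pi(self, obs, apply_noise=False, compute_Q=False):
        actions, qs = [], []
        for agent, sess, graph in zip(self.agents, self.sessions, self.graphs):
            with sess.as_default(), graph.as_default():
                action, q = agent.pi(obs, apply_noise=apply_noise, compute_Q=compute_Q)
            actions.append(action)
            qs.append(q)
        mean_q = np.mean(qs, axis=0) if compute_Q else None
        return np.mean(actions, axis=0), mean_q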
Example #18
    def __init__(
            self,
            env,
            gamma,
            total_timesteps,
            network='mlp',
            nb_rollout_steps=100,
            reward_scale=1.0,
            noise_type='adaptive-param_0.2',
            normalize_returns=False,
            normalize_observations=False,
            critic_l2_reg=1e-2,
            actor_lr=1e-4,
            critic_lr=1e-3,
            popart=False,
            clip_norm=None,
            nb_train_steps=50,  # per epoch cycle and MPI worker,  <- HERE!
            nb_eval_steps=100,
            buffer_size=1000000,
            batch_size=64,  # per MPI worker
            tau=0.01,
            param_noise_adaption_interval=50,
            **network_kwargs):

        # Adjust hyper-parameters to account for the number of option policies to learn.
        num_options = env.get_number_of_options()
        buffer_size = num_options * buffer_size
        batch_size = num_options * batch_size

        observation_space = env.option_observation_space
        action_space = env.option_action_space

        nb_actions = action_space.shape[-1]
        assert (np.abs(action_space.low) == action_space.high
                ).all()  # we assume symmetric actions.

        memory = Memory(limit=buffer_size,
                        action_shape=action_space.shape,
                        observation_shape=observation_space.shape)
        critic = Critic(network=network, **network_kwargs)
        actor = Actor(nb_actions, network=network, **network_kwargs)

        action_noise = None
        param_noise = None
        if noise_type is not None:
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    param_noise = AdaptiveParamNoiseSpec(
                        initial_stddev=float(stddev),
                        desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                     sigma=float(stddev) *
                                                     np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    action_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(nb_actions),
                        sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError(
                        'unknown noise type "{}"'.format(current_noise_type))

        max_action = action_space.high
        logger.info(
            'scaling actions by {} before executing in env'.format(max_action))

        agent = DDPG(actor,
                     critic,
                     memory,
                     observation_space.shape,
                     action_space.shape,
                     gamma=gamma,
                     tau=tau,
                     normalize_returns=normalize_returns,
                     normalize_observations=normalize_observations,
                     batch_size=batch_size,
                     action_noise=action_noise,
                     param_noise=param_noise,
                     critic_l2_reg=critic_l2_reg,
                     actor_lr=actor_lr,
                     critic_lr=critic_lr,
                     enable_popart=popart,
                     clip_norm=clip_norm,
                     reward_scale=reward_scale)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        sess = U.get_session()
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()

        # Variables that are used during learning
        self.agent = agent
        self.memory = memory
        self.max_action = max_action
        self.batch_size = batch_size
        self.nb_train_steps = nb_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.param_noise_adaption_interval = param_noise_adaption_interval
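
The attributes stored above mirror the rollout/train loop of the learn() function shown later in this collection. A condensed sketch (an illustration under those assumptions, not a method of this class) of how such a wrapper might consume them for one epoch cycle:

def run_one_cycle(wrapper, env, obs):
    """One rollout + training cycle using the attributes stored by the wrapper above."""
    for _ in range(wrapper.nb_rollout_steps):
        action, q, _, _ = wrapper.agent.step(obs, apply_noise=True, compute_Q=True)
        new_obs, reward, done, _ = env.step(wrapper.max_action * action)
        wrapper.agent.store_transition(obs, action, reward, new_obs, done)
        obs = new_obs
    for t_train in range(wrapper.nb_train_steps):
        if (wrapper.memory.nb_entries >= wrapper.batch_size
                and t_train % wrapper.param_noise_adaption_interval == 0):
            wrapper.agent.adapt_param_noise()
        wrapper.agent.train()
        wrapper.agent.update_target_net()
    return obs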
Example #19
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    if env_id == 'navigate':
        env = NavigateEnv(use_camera=False, continuous_actions=True, neg_reward=False, max_steps=500)
    elif env_id == 'toy':
        #env = continuous_gridworld.ContinuousGridworld('', max_steps=1000, obstacle_mode=continuous_gridworld.NO_OBJECTS)
        from toy_environment import room_obstacle_list
        env = gridworld.Gridworld(room_obstacle_list.obstacle_list, step_size=0.2)
    elif env_id == 'arm2pos':
        env = Arm2PosEnv(continuous=True, max_steps=500, neg_reward=False)
    elif env_id == 'pick-and-place':
        env = PickAndPlaceEnv(max_steps=500)
    else:
        env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    # env = gym.wrappers.Monitor(env, '/tmp/ddpg/', force=True)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None

    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)



    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Record the start time on the rank-0 process.
    if rank == 0:
        start_time = time.time()
    del kwargs['tb_dir']
    del kwargs['save_path']
    hindsight_mode = kwargs['hindsight_mode']
    del kwargs['hindsight_mode']
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic, memory=memory,
                   hindsight_mode=hindsight_mode, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #20
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # When simulating multiple envs in parallel, we cannot reset the
                # agent at the end of each individual episode, so we reset it
                # once per cycle here instead.
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                new_obs, r, done, info = env.step(
                    max_action * action
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv
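                # e.g. with nenvs=4 and an action dim of 6: max_action.shape == (6,),
                # action.shape == (4, 6), and max_action * action is again (4, 6).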

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = np.mean(eval_episode_rewards)
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = np.mean(eval_qs)
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    return agent
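The stats block above sums every logged scalar across MPI workers with allreduce and then divides by mpi_size, i.e. it reports the per-key mean over workers. A minimal, self-contained mpi4py sketch of that averaging step (the variable names and the two example values are illustrative, not taken from the code above):

import numpy as np
from mpi4py import MPI  # run with e.g. `mpirun -np 4 python demo.py`

comm = MPI.COMM_WORLD
# Pretend per-worker scalars, one entry per stat key.
local_stats = np.array([float(comm.Get_rank()), 10.0])
# allreduce with the default SUM op adds the arrays elementwise across workers...
summed = comm.allreduce(local_stats)
# ...so dividing by the world size gives the mean of each stat over workers.
mean_stats = summed / comm.Get_size()
if comm.Get_rank() == 0:
    print(mean_stats)  # with 4 workers, the first entry is mean(0, 1, 2, 3) = 1.5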
Example #21
def run(env_id, seed, noise_type, num_cpu, layer_size, nb_layers, layer_norm,
        logdir, gym_monitor, evaluation, bind_to_core, portnum, max_to_keep,
        **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    utils.portnum = portnum + rank
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env,
                                       os.path.join(logdir, 'gym_train'),
                                       force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env,
                                                os.path.join(
                                                    logdir, 'gym_eval'),
                                                force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_size=layer_size,
                    nb_layers=nb_layers,
                    layer_norm=layer_norm)
    actor = Actor(nb_actions,
                  layer_size=layer_size,
                  nb_layers=nb_layers,
                  layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 10000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only the master worker tracks wall-clock time for the final runtime report.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   max_to_keep=max_to_keep,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
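The noise_type parsing loop above is shared by all of these examples: the argument is a comma-separated list of `<name>_<stddev>` specs. A self-contained sketch of that convention, with the baselines noise classes replaced by plain dicts so it runs without any dependencies (parse_noise_spec is an illustrative name, not part of the original code):

def parse_noise_spec(noise_type, nb_actions):
    action_noise, param_noise = None, None
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            continue
        name, _, stddev = current.partition('_')
        if name == 'adaptive-param':
            param_noise = {'kind': name, 'stddev': float(stddev)}
        elif name in ('normal', 'ou'):
            action_noise = {'kind': name, 'sigma': float(stddev), 'dim': nb_actions}
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current))
    return action_noise, param_noise

# e.g. parse_noise_spec('adaptive-param_0.2,ou_0.3', nb_actions=6) returns
# ({'kind': 'ou', 'sigma': 0.3, 'dim': 6}, {'kind': 'adaptive-param', 'stddev': 0.2})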
Example #22
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
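    # e.g. with a base seed of 0, worker ranks 0, 1, 2 use seeds 0, 1000000 and
    # 2000000, so every MPI worker is reproducible but draws a different stream.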
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only the master worker tracks wall-clock time for the final runtime report.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
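Both action-noise options used in these examples are small stateful processes. As a reference, here is a minimal sketch of an Ornstein-Uhlenbeck process in the spirit of the OrnsteinUhlenbeckActionNoise constructed above; the class name and the theta/dt defaults below are assumptions for illustration, not taken from this code:

import numpy as np

class SimpleOUNoise:
    """Temporally correlated noise: x <- x + theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, I)."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def __call__(self):
        # Mean-reverting drift towards mu plus a scaled Gaussian increment.
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x

    def reset(self):
        self.x = np.zeros_like(self.mu)

# Mirrors the construction above: mu = zeros, sigma = stddev per action dimension.
noise = SimpleOUNoise(mu=np.zeros(6), sigma=0.2 * np.ones(6))
perturbation = noise()  # one noise vector to add to the actor's action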
Example #23
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        actor_l2_reg=0.0,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=1000,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        rb_size=1e6,
        save_interval=1,
        pretrain_epochs=0,
        load_path=None,
        demos_path=None,
        bc_teacher_lambda=0.0,
        use_qfilter=False,
        **network_kwargs):
    """Learns policy using DDPG, with vectorized environments.

    If we pass other arguments that aren't specified here, they are considered
    as network_kwargs.

    Parameters
    ----------
    noise_type: noise to be added to the behavior policy. The default is NOT
        the noise type from the paper but 'AdaptiveParamNoiseSpec'. I _think_
        that if one uses the OU process, we get action noise but not parameter
        noise. Also, be sure to use the `<name>_<stddev>` convention (e.g.
        'ou_0.2'), as the code splits the argument at the underscore.

    actor_lr: 1e-4  (matches paper)

    critic_lr: 1e-3  (matches paper)

    critic_l2: 1e-2  (matches paper)

    gamma: 0.99  (matches paper)

    batch_size: 64  (matches paper for lower-dim env obs/states)

    tau: 0.01 for soft target updates of actor and critic nets. Paper used 0.001.

    nb_epoch_cycles: number of times we go through this cycle of: (1) get
        rollouts with noise added to policy and apply to replay buffer, (2)
        gradient updates for actor/critic, (3) evaluation rollouts (if any).
        AFTER all of these cycles happen, THEN we log statistics.

    nb_rollout_steps: number of steps in each parallel env we take with the
        exploration policy without training, so this is just to populate the
        replay buffer. More parallel envs *should* mean that we get more
        samples into the buffer between gradient updates of the network, so
        this might need to be environment *and* machine (# of CPUs) specific.
        (A small arithmetic sketch of this accounting follows the listing.)

    nb_train_steps: after doing `nb_rollout_steps` in each parallel env, we do
        this many updates; each involves sampling from the replay buffer and
        updating the actor and critic (via lagged target updates).

    nb_eval_steps: 1000, changed from the default of 100. Using 1000 ensures
        that fixed-length envs like Ant-v2 can get one full episode (assuming
        one parallel env) during evaluation stages.

    eval_env: A separate environment for evaluation only, where no noise is
        applied, similar to how rlkit does it.

    save_interval: how often to save model checkpoints (with the default of 1,
        once per epoch).
    """
    set_global_seeds(seed)

    # Daniel: this helps to maintain compatibility with PPO2 code. For now
    # we're ignoring it, but we should check that we're always clipping. I
    # changed the nb_epochs to match with PPO2 in that we divide by nenvs.
    if 'limit_act_range' in network_kwargs:
        network_kwargs.pop('limit_act_range')
    nenvs = env.num_envs

    nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // nbatchsize
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    # we assume symmetric actions.
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()

    # Form XP (1M steps, same as in paper), and critic/actor networks.
    # Daniel: force dtype here so we can use uint8 type images.
    assert env.observation_space.low.dtype == env.observation_space.high.dtype
    memory = Memory(limit=int(rb_size),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape,
                    dtype=env.observation_space.low.dtype)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                #action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                #                                 sigma=float(stddev)*np.ones(nb_actions))
                #if nenvs > 1:
                # Daniel: adding this to replace the former.
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions),
                                                 shape=(nenvs, nb_actions))
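                # The extra `shape` argument is a fork-specific extension; it
                # presumably draws independent noise for each of the nenvs envs.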
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    # The `learn` defaults above have priority over defaults in DDPG class.
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 actor_l2_reg=actor_l2_reg,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 bc_teacher_lambda=bc_teacher_lambda,
                 use_qfilter=use_qfilter)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Prepare everything.
    sess = U.get_session()
    agent.initialize(sess)
    # --------------------------------------------------------------------------
    # Daniel: similar as PPO2 code as `agent` is similar to `model` but has to
    # be initialized explicitly above. Must call after `agent.load` gets
    # created.  Not sure if this works with parameter space noise or with
    # normalization, but I don't plan to resume training (for now). It also has
    # to be *before* the `graph.finalize()` because otherwise we get an error.
    # --------------------------------------------------------------------------
    if load_path is not None:
        logger.info("\nInside ddpg, loading model from: {}".format(load_path))
        agent.load(load_path)
    # --------------------------------------------------------------------------
    sess.graph.finalize()
    agent.reset()

    # --------------------------------------------------------------------------
    # Daniel: populate replay buffer, followed by pretraining stage.
    # But if load_path is not None, then doesn't make sense -- we want to load.
    # We also don't need to do this if timesteps is 0 (e.g., for playing policy).
    # --------------------------------------------------------------------------
    if total_timesteps == 0:
        return agent
    assert seed == 1500, 'We normally want seed 1500, yet: {}'.format(seed)

    if (demos_path is not None and load_path is None):
        _ddpg_demos(demos_path, agent, memory)
        assert memory.nb_entries == memory.nb_teach_entries, memory.nb_entries
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)

        # Pretrain, based on their training code for some # of minibatches.
        pt_actor_losses = []
        pt_critic_losses = []
        batches_per_ep = int(memory.nb_entries / batch_size)
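        # e.g. 50,000 demo transitions with batch_size=64 give int(50000 / 64) = 781
        # minibatches per pretraining epoch (the 50,000 figure is illustrative).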
        logger.info(
            'Running pre-training for {} epochs'.format(pretrain_epochs))
        logger.info('  data size in memory: {}'.format(memory.nb_entries))
        logger.info('  each batch: {}, epoch mbs: {}'.format(
            batch_size, batches_per_ep))
        pt_start = time.time()

        for epoch in range(1, pretrain_epochs + 1):
            c_losses = []
            a_losses = []
            for _ in range(batches_per_ep):
                cl, al = agent.train(during_pretrain=True)
                agent.update_target_net()
                c_losses.append(cl)
                a_losses.append(al)
            pt_critic_losses.append(np.mean(c_losses))
            pt_actor_losses.append(np.mean(a_losses))

            # Check and save model occasionally.
            if epoch == 1 or epoch % 5 == 0:
                pt_time = (time.time() - pt_start) / 60.
                logger.info(
                    '  epoch done: {}, loss over past epoch: {:.4f}'.format(
                        str(epoch).zfill(4), pt_actor_losses[-1]))
                logger.info('  critic loss over past epoch: {:.4f}'.format(
                    pt_critic_losses[-1]))
                logger.info('  elapsed time: {:.1f}m'.format(pt_time))
                savepath = osp.join(
                    checkdir, 'pretrain_epoch_{}'.format(str(epoch).zfill(4)))
                logger.info('Saving model checkpoint to: ', savepath)
                agent.save(savepath)

        pt_time = (time.time() - pt_start) / 60.
        logger.info('losses a: {}'.format(np.array(pt_actor_losses)))
        logger.info('losses c: {}'.format(np.array(pt_critic_losses)))
        logger.info('Finished loading teacher samples + pre-training.')
        logger.info('Pre-training took {:.1f}m.\n'.format(pt_time))
    # --------------------------------------------------------------------------

    # Back to their code. For cloth, `env.reset()` takes a while so we put it here.
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    # Daniel: Debugging/sanity checks.
    _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    U.display_var_info(_variables)
    logger.info("\nInside DDPG, about to start epochs")
    logger.info(
        "nbatchsize: {}, get this in buffer before DDPG updates".format(
            nbatchsize))
    logger.info("  i.e.: (nenv {}) * (cycles {}) * (nsteps {})".format(
        nenvs, nb_epoch_cycles, nb_rollout_steps))
    logger.info("nb_epochs: {}, number of cycles to use".format(nb_epochs))
    logger.info("eval_env None? {}".format(eval_env is None))
    logger.info("(end of debugging messages)\n")

    # File paths.
    checkdir = osp.join(logger.get_dir(), 'checkpoints')
    action_dir = osp.join(logger.get_dir(), 'actions')
    episode_dir = osp.join(logger.get_dir(), 'ep_all_infos')
    os.makedirs(checkdir, exist_ok=True)
    os.makedirs(action_dir, exist_ok=True)
    os.makedirs(episode_dir, exist_ok=True)

    # Daniel: use these two to store past 100 episode history. Report these stats!
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    all_eval_episode_rewards = []

    # reward/step: cumulative quantities for each episode in vecenv.
    # epoch_{actions,qs} will grow without bound, fyi.
    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    epoch = 0
    start_time = time.time()
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        mb_actions = []
        mb_epinfos = []
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # When simulating multiple envs in parallel, we cannot reset the
                # agent at the end of each individual episode, so we reset it
                # once per cycle here instead.
                agent.reset()

            # Daniel: pure data collection (noise added) to populate replay buffer.
            # No training until after this, and note the parallel stepping (VecEnv).
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                # action: (#_parallel_envs, ac_dim), q: (#_parallel_envs, 1)
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension
                # (nenvs, A) - the multiplication gets broadcasted to the batch
                # scale for execution in env (as far as DDPG is concerned,
                # every action is in [-1, 1])
                new_obs, r, done, info = env.step(max_action * action)
                r = r.astype(np.float32)
                # note these outputs are batched from vecenv (first dim = batch).
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping. (Daniel: agent.train() doesn't require these two lists)
                epoch_actions.append(action)
                epoch_qs.append(q)
                # Daniel: Same as PPO2 code.
                mb_actions.append(action)
                for inf in info:
                    maybeepinfo = inf.get('episode')
                    if maybeepinfo:
                        mb_epinfos.append(inf)
                #the batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(
                            episode_reward[d])  # Entire history
                        episode_rewards_history.append(
                            episode_reward[d])  # Last 100 only
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate. (Daniel: note that no noise is applied here.)
            # Also it seems like episodes do not naturally reset before this starts?
            # Also, unlike epoch_episode_reward, eval_episode_reward is created here ...
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                logger.info('Now on the eval_env for {} steps...'.format(
                    nb_eval_steps))
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # scale for execution in env (for DDPG, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            all_eval_episode_rewards.append(
                                eval_episode_reward[d])
                            eval_episode_reward[
                                d] = 0.0  # Daniel: reset for next episode.

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['memory/nb_entries'] = memory.nb_entries
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics. (Daniel: use eval/return_history for plots)
        if eval_env is not None:
            combined_stats['eval/return'] = np.mean(eval_episode_rewards)
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = np.mean(eval_qs)
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = np.array(
            [np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps_per_env'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(osp.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
            # Daniel: arguable, we can save all episodes but hard if we don't know the steps.
            #if eval_env:
            #    with open(os.path.join(logdir, 'all_eval_episode_rewards.pkl'), 'wb') as f:
            #        pickle.dump(all_eval_episode_rewards, f)

        # Daniel: we can use cycle or epoch for this if condition ... kind of annoying but w/e.
        if cycle % save_interval == 0:
            logger.info('We are now saving stuff!!')
            savepath = osp.join(checkdir, '%.5i' % epoch)
            logger.info('Saving model checkpoint to: ', savepath)
            agent.save(savepath)
            # ------------------------------------------------------------------
            # Daniel: extra stuff for debugging PPO on cloth, actions and infos for each episode.
            mb_actions = _sf01(np.asarray(mb_actions))
            act_savepath = osp.join(action_dir, 'actions_%.5i.pkl' % epoch)
            epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl' % epoch)
            with open(act_savepath, 'wb') as fh:
                pickle.dump(mb_actions, fh)
            with open(epi_savepath, 'wb') as fh:
                pickle.dump(mb_epinfos, fh)

        # Daniel: we were not resetting earlier. Actually there are other
        # epoch_stats which we might consider resetting here?
        epoch_episodes = 0

    return agent
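To make the epoch/cycle bookkeeping from the docstring concrete, a small arithmetic sketch (pure Python; nenvs and total_timesteps are illustrative values, the rest are the defaults of learn() above):

nenvs = 4
total_timesteps = 1000000
nb_epoch_cycles, nb_rollout_steps, nb_train_steps = 20, 100, 50

# Env steps pushed into the replay buffer per epoch, all parallel envs combined.
nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps      # 4 * 20 * 100 = 8000
nb_epochs = total_timesteps // nbatchsize                    # 1000000 // 8000 = 125
grad_updates = nb_epochs * nb_epoch_cycles * nb_train_steps  # 125 * 20 * 50 = 125000

print(nb_epochs, grad_updates)  # 125 125000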