Example #1
class Expert:
    def __init__(self, limit, env):
        self.limit = limit
        self.env = env
        self.memory = Memory(
            limit=self.limit,
            action_shape=self.env.action_space.shape,
            observation_shape=self.env.observation_space.shape)
        self.file_dir = None

    def load_file(self, file_dir):
        self.file_dir = file_dir
        expert_file = open(self.file_dir, 'rb')
        expert_data = pickle.load(expert_file)
        expert_file.close()
        for episode_sample in expert_data:
            for step_sample in episode_sample:
                self.memory.append(step_sample[0], step_sample[1],
                                   step_sample[2], step_sample[3],
                                   step_sample[4])

    def sample(self, batch_size):
        return self.memory.sample(batch_size)

    def set_tf(self, actor, critic, obs_rms, ret_rms, observation_range,
               return_range):
        self.expert_state = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape,
                                           name='expert_state')
        self.expert_action = tf.placeholder(tf.float32,
                                            shape=(None, ) +
                                            self.env.action_space.shape,
                                            name='expert_action')
        normalized_state = tf.clip_by_value(
            normalize(self.expert_state, obs_rms), observation_range[0],
            observation_range[1])
        expert_actor = actor(normalized_state, reuse=True)
        normalized_q_with_expert_data = critic(normalized_state,
                                               self.expert_action,
                                               reuse=True)
        normalized_q_with_expert_actor = critic(normalized_state,
                                                expert_actor,
                                                reuse=True)
        self.Q_with_expert_data = denormalize(
            tf.clip_by_value(normalized_q_with_expert_data, return_range[0],
                             return_range[1]), ret_rms)
        self.Q_with_expert_actor = denormalize(
            tf.clip_by_value(normalized_q_with_expert_actor, return_range[0],
                             return_range[1]), ret_rms)
        self.critic_loss = tf.reduce_mean(
            tf.nn.softplus(self.Q_with_expert_actor - self.Q_with_expert_data))
        self.actor_loss = -tf.reduce_mean(self.Q_with_expert_actor)
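
A hypothetical usage sketch of the Expert buffer above (it assumes a Gym-style env, the Memory class used in __init__, and a pickle file holding a list of episodes, each a list of (obs, action, reward, next_obs, done) tuples):

expert = Expert(limit=int(1e5), env=env)      # env is assumed to exist already
expert.load_file('expert_trajectories.pkl')   # hypothetical file path
batch = expert.sample(batch_size=64)          # dict of arrays, e.g. batch['obs0'], batch['actions']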
Example #2
class DQNModel(TensorflowBasedModel):
    key_list = Config.load_json(file_path=None)

    def __init__(self, config, action_bound):
        super(DQNModel, self).__init__(config=config)
        self.proposed_action_list = []
        self.action_bound = action_bound
        action_list = []
        for i in range(len(action_bound[0])):
            low = action_bound[0][i]
            high = action_bound[1][i]
            action_list.append(
                np.arange(start=low,
                          stop=high,
                          step=(high - low) /
                          self.config.config_dict['ACTION_SPLIT_COUNT']))
        action_iterator = itertools.product(*action_list)
        self.action_selection_list = []
        for action_sample in action_iterator:
            self.action_selection_list.append(
                tf.constant(action_sample, dtype=tf.float32))

        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)

        self.state_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['STATE_SPACE']),
            dtype=tf.float32)
        self.next_state_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['STATE_SPACE']),
            dtype=tf.float32)
        self.action_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['ACTION_SPACE']),
            dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        self.input = tf.concat([self.state_input, self.action_input], axis=1)
        self.done = tf.cast(self.done_input, dtype=tf.float32)

        self.q_value_list = []
        var_list = None
        for action_sample in self.action_selection_list:
            # Tile the constant candidate action across the batch so it can be
            # concatenated with the state along the feature axis.
            tiled_action = tf.tile(tf.reshape(action_sample, (1, -1)),
                                   [tf.shape(self.state_input)[0], 1])
            q_net, q_output, var_list = NetworkCreator.create_network(
                input=tf.concat([self.state_input, tiled_action], axis=1),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name=self.config.config_dict['NAME'])
            self.q_value_list.append(q_output)
        self.var_list = var_list

        self.target_q_value_list = []
        for action_sample in self.action_selection_list:
            tiled_action = tf.tile(tf.reshape(action_sample, (1, -1)),
                                   [tf.shape(self.next_state_input)[0], 1])
            q_net, q_output, var_list = NetworkCreator.create_network(
                input=tf.concat([self.next_state_input, tiled_action], axis=1),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name='TARGET' + self.config.config_dict['NAME'])
            self.target_q_value_list.append(q_output)
        self.target_var_list = var_list

        self.loss, self.optimizer, self.optimize = self.create_training_method(
        )
        self.update_target_q_op = self.create_target_q_update()
        self.memory = Memory(
            limit=1e100,
            action_shape=self.config.config_dict['ACTION_SPACE'],
            observation_shape=self.config.config_dict['STATE_SPACE'])
        self.sess = tf.get_default_session()

    def update(self):
        for i in range(self.config.config_dict['ITERATION_EVER_EPOCH']):
            batch_data = self.memory.sample(
                batch_size=self.config.config_dict['BATCH_SIZE'])
            loss, _ = self.sess.run(fetches=[self.loss, self.optimize],
                                    feed_dict={
                                        self.reward_input: batch_data['rewards'],
                                        self.action_input: batch_data['actions'],
                                        self.state_input: batch_data['obs0'],
                                        self.done_input: batch_data['terminals1']
                                    })

    def predict(self, obs, q_value):
        pass

    def print_log_queue(self, status):
        self.status = status
        while self.log_queue.qsize() > 0:
            log = self.log_queue.get()
            print("%s: Critic loss %f: " %
                  (self.name, log[self.name + '_CRITIC']))
            log['INDEX'] = self.log_print_count
            self.log_file_content.append(log)
            self.log_print_count += 1

    def create_training_method(self):
        # Note: self.predict_q_value and self.q_output are assumed to be defined
        # elsewhere (e.g. via create_predict_q_value_op and the per-action Q
        # outputs); they are not set in the code shown here.
        l1_l2 = tfcontrib.layers.l1_l2_regularizer()
        loss = tf.reduce_sum((self.predict_q_value - self.q_output) ** 2) + \
               tfcontrib.layers.apply_regularization(l1_l2, weights_list=self.var_list)
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=self.config.config_dict['LEARNING_RATE'])
        optimize_op = optimizer.minimize(loss=loss, var_list=self.var_list)
        return loss, optimizer, optimize_op

    def create_predict_q_value_op(self):
        # Bellman target: r + (1 - done) * DISCOUNT * Q_target.
        # Note: self.target_q_output is assumed to be derived from
        # self.target_q_value_list elsewhere; it is not set in the code shown here.
        predict_q_value = (1. - self.done) * self.config.config_dict['DISCOUNT'] * self.target_q_output \
                          + self.reward_input
        return predict_q_value

    def create_target_q_update(self):
        # Soft update: target <- DECAY * target + (1 - DECAY) * online weights.
        op = []
        for var, target_var in zip(self.var_list, self.target_var_list):
            new_val = self.config.config_dict['DECAY'] * target_var + (
                1.0 - self.config.config_dict['DECAY']) * var
            op.append(tf.assign(target_var, new_val))
        return op

    def store_one_sample(self, state, next_state, action, reward, done, *arg,
                         **kwargs):
        self.memory.append(obs0=state,
                           obs1=next_state,
                           action=action,
                           reward=reward,
                           terminal1=done)
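
The DQNModel constructor above discretizes each continuous action dimension with np.arange and then takes the Cartesian product across dimensions. A standalone sketch of just that step, with made-up bounds and split count:

import itertools
import numpy as np

# Hypothetical bounds: two action dimensions, lows first, highs second.
action_bound = ([-1.0, 0.0], [1.0, 2.0])
split_count = 3   # stands in for config_dict['ACTION_SPLIT_COUNT']
axes = []
for low, high in zip(action_bound[0], action_bound[1]):
    # np.arange excludes the upper bound, exactly as in the constructor.
    axes.append(np.arange(start=low, stop=high, step=(high - low) / split_count))
grid = list(itertools.product(*axes))   # mirrors itertools.product(*action_list)
print(len(grid))                        # 3 ** 2 = 9 candidate actions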
Example #3
class Model(object):
    def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6,
                 normalize_observations=True, normalize_returns=False, enable_popart=False,
                 noise_type='adaptive-param_0.2', clip_norm=None, reward_scale=1.,
                 batch_size=128, l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3,
                 observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 **network_kwargs):
        # logger.info('Using agent with the following configuration:')
        # logger.info(str(self.__dict__.items()))
        observation_shape = env.observation_space.shape
        action_shape = env.action_space.shape

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.total_timesteps = total_timesteps
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.enable_popart = enable_popart
        self.clip_norm = clip_norm
        self.reward_scale = reward_scale
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.l2_reg_coef = l2_reg_coef

        self.stats_sample = None

        self.action_noise = None
        self.param_noise = None
        nb_actions = self.env.action_space.shape[-1]
        if noise_type is not None:
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                         desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                                sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

        assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
        self.memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)
        self.critic = Critic(network=network, **network_kwargs)
        self.actor = Actor(nb_actions, network=network, **network_kwargs)

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet
        self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars,
                                                                      self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]
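
get_target_updates is imported from the surrounding DDPG utilities; it is assumed to return one op that hard-copies the online weights into the target network and one op that applies the soft update target <- (1 - tau) * target + tau * online. A TF1-style sketch of that assumed behaviour:

def get_target_updates_sketch(online_vars, target_vars, tau):
    # Assumed behaviour only; the real helper is imported, not defined here.
    init_updates, soft_updates = [], []
    for var, target_var in zip(online_vars, target_vars):
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var,
                                      (1. - tau) * target_var + tau * var))
    return tf.group(*init_updates), tf.group(*soft_updates)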

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adoption.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor,
                                                                       self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.l2_reg_coef > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if
                               var.name.endswith('/w:0') and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.l2_reg_coef))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.l2_reg_coef),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]
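
The two assign ops above implement the Pop-Art rescaling: the output layer is adjusted so that denormalized Q-values stay identical when the return statistics move from (old_mean, old_std) to the new ones. A small numpy check of that identity with made-up numbers:

import numpy as np

W, b = np.array([[0.5], [1.5]]), np.array([0.3])   # hypothetical output-layer kernel and bias
x = np.array([2.0, -1.0])                          # hypothetical penultimate-layer activations
old_mean, old_std = 1.0, 2.0
new_mean, new_std = 3.0, 5.0

q_before = (x @ W + b) * old_std + old_mean           # denormalized Q under old stats
W2 = W * old_std / new_std                            # M.assign(M * old_std / new_std)
b2 = (b * old_std + old_mean - new_mean) / new_std    # bias op from the loop above
q_after = (x @ W2 + b2) * new_std + new_mean          # denormalized Q under new stats
assert np.allclose(q_before, q_after)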

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def train_step(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def step(self, obs, compute_Q=True):
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([self.actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(self.actor_tf, feed_dict=feed_dict)
            q = None

        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                                                        feed_dict={
                                                            self.obs1: batch['obs1'],
                                                            self.rewards: batch['rewards'],
                                                            self.terminals1: batch['terminals1'].astype('float32'),
                                                        })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance
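
AdaptiveParamNoiseSpec is imported from the DDPG utilities; its adapt method is assumed to follow the usual parameter-space-noise scheme: shrink the perturbation stddev when the measured action-space distance exceeds the desired value, grow it otherwise. A sketch of that assumed behaviour:

class AdaptiveParamNoiseSpecSketch:
    # Assumed behaviour only; the real class is imported, not defined here.
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
                 adoption_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adoption_coefficient   # perturbation too strong
        else:
            self.current_stddev *= self.adoption_coefficient   # perturbation too weak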

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    def learn(self,
              total_timesteps=None,
              seed=None,
              nb_epochs=None,  # with default settings, perform 1M steps total
              nb_epoch_cycles=20,
              nb_rollout_steps=100,
              render=False,
              nb_train_steps=50,  # per epoch cycle and MPI worker,
              batch_size=64,  # per MPI worker
              param_noise_adaption_interval=50,):

        set_global_seeds(seed)

        if total_timesteps is not None:
            assert nb_epochs is None
            nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
        else:
            nb_epochs = 500

        if MPI is not None:
            rank = MPI.COMM_WORLD.Get_rank()
        else:
            rank = 0

        # eval_episode_rewards_history = deque(maxlen=100)
        episode_rewards_history = deque(maxlen=100)
        sess = U.get_session()
        # Prepare everything.
        self.initialize(sess)
        sess.graph.finalize()
        self.reset()

        obs = self.env.reset()
        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        nenvs = obs.shape[0]

        episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
        episode_step = np.zeros(nenvs, dtype=int)  # vector
        episodes = 0  # scalar
        t = 0  # scalar

        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                if nenvs > 1:
                    # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                    # of the environments, so resetting here instead
                    self.reset()
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q, _, _ = self.train_step(obs, apply_noise=True, compute_Q=True)

                    # Execute next action.
                    if rank == 0 and render:
                        self.env.render()

                    # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                    # new_obs, r, done, info = self.env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = self.env.step(action)
                    # note these outputs are batched from vecenv

                    t += 1
                    if rank == 0 and render:
                        self.env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    # the batched data will be unrolled in memory.py's append.
                    self.store_transition(obs, action, r, new_obs, done)

                    obs = new_obs

                    for d in range(len(done)):
                        if done[d]:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward[d])
                            episode_rewards_history.append(episode_reward[d])
                            epoch_episode_steps.append(episode_step[d])
                            episode_reward[d] = 0.
                            episode_step[d] = 0
                            epoch_episodes += 1
                            episodes += 1
                            if nenvs == 1:
                                self.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = self.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = self.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    self.update_target_net()
            #
            # # Evaluate.
            # eval_episode_rewards = []
            # eval_qs = []
            # if eval_env is not None:
            #     eval_obs = eval_env.reset()
            #     nenvs_eval = eval_obs.shape[0]
            #     eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
            #     for t_rollout in range(nb_eval_steps):
            #         eval_action, eval_q, _, _ = self.train_step(eval_obs, apply_noise=False, compute_Q=True)
            #         # eval_obs, eval_r, eval_done, eval_info = eval_env.step(
            #         #     max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
            #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
            #
            #         if render_eval:
            #             eval_env.render()
            #         eval_episode_reward += eval_r
            #
            #         eval_qs.append(eval_q)
            #         for d in range(len(eval_done)):
            #             if eval_done[d]:
            #                 eval_episode_rewards.append(eval_episode_reward[d])
            #                 eval_episode_rewards_history.append(eval_episode_reward[d])
            #                 eval_episode_reward[d] = 0.0

            if MPI is not None:
                mpi_size = MPI.COMM_WORLD.Get_size()
            else:
                mpi_size = 1

            # save trainable variables
            file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time()))
            model_save_path = self.def_path_pre + file_name
            self.save(model_save_path)

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = self.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            # combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            # combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            # combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)

            combined_stats_sums = np.array([np.array(x).flatten()[0] for x in combined_stats.values()])
            if MPI is not None:
                combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(self.env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(self.env.get_state(), f)
                # if eval_env and hasattr(eval_env, 'get_state'):
                #     with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                #         pickle.dump(eval_env.get_state(), f)
        self.sess.graph._unsafe_unfinalize()
        return self

    def save(self, save_path=None):
        save_variables(save_path=save_path, sess=self.sess)
        print('save model variables to', save_path)

    def load_newest(self, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)))
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[-1])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)

    def load_index(self, index, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)), reverse=True)
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[index])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)
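
A hypothetical end-to-end driver for the Model class above; make_vec_env, the env id, and the 'mlp' network name are assumptions, not part of this snippet:

env = make_vec_env('Pendulum-v0', num_env=1)     # assumed vectorized-env helper
model = Model(network='mlp', env=env, noise_type='ou_0.2', batch_size=64)
model.learn(total_timesteps=int(1e5))            # runs the rollout/train loop above
model.save(model.def_path_pre + 'final_model')   # writes variables under ./tmp/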
Example #4
class Expert:
    def __init__(self, limit, env):
        self.limit = limit
        self.env = env
        self.memory = Memory(
            limit=self.limit,
            action_shape=self.env.action_space.shape,
            observation_shape=self.env.observation_space.shape)
        self.file_dir = None

    def load_file(self, file_dir, print_reward=False):
        self.file_dir = file_dir
        expert_file = open(self.file_dir, 'rb')
        expert_data = pickle.load(expert_file)
        expert_file.close()
        k = 0
        if print_reward:
            total_rew = 0.
            ep_rew = 0.
            nep = 1.
        for episode_sample in expert_data:
            for step_sample in episode_sample:
                k = k + 1
                if k <= self.limit:
                    if print_reward:
                        ep_rew += step_sample[2]
                        if step_sample[4]:
                            nep += 1
                            total_rew += ep_rew
                            ep_rew = 0
                    self.memory.append(step_sample[0], step_sample[1],
                                       step_sample[2], step_sample[3],
                                       step_sample[4])
                else:
                    if print_reward:
                        print(
                            'Successfully loaded expert files, average reward ',
                            total_rew / nep)
                    return
        if print_reward:
            print('Successfully loaded expert files, average reward ',
                  total_rew / nep)

    def load_file_trpo(self, file_dir):
        self.file_dir = file_dir
        traj_data = np.load(file_dir)
        if self.limit is None:
            obs = traj_data["obs"][:]
            acs = traj_data["acs"][:]
        else:
            obs = traj_data["obs"][:self.limit]
            acs = traj_data["acs"][:self.limit]
        episode_num = len(acs)
        '''
        step_num = 0
        for i in range(episode_num):
            step_num += len(acs[i])
        print("Total Step is:", step_num, "\nTotal_Episode is:", episode_num)
        '''
        for i in range(episode_num):
            episode_len = len(acs[i])
            for j in range(episode_len):
                done = True if (j == episode_len - 1) else False
                self.memory.append(obs[i][j], acs[i][j], 0., 0., done)

    def sample(self, batch_size):
        return self.memory.sample(batch_size)

    def set_tf(self,
               actor,
               critic,
               obs0,
               actions,
               obs_rms,
               ret_rms,
               observation_range,
               return_range,
               supervise=False,
               critic_only=False,
               actor_only=False,
               both_ours_sup=False,
               gail=False,
               pofd=False):
        self.expert_state = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape,
                                           name='expert_state')
        self.expert_action = tf.placeholder(tf.float32,
                                            shape=(None, ) +
                                            self.env.action_space.shape,
                                            name='expert_action')
        normalized_state = tf.clip_by_value(
            normalize(self.expert_state, obs_rms), observation_range[0],
            observation_range[1])
        expert_actor = actor(normalized_state, reuse=True)
        normalized_q_with_expert_data = critic(normalized_state,
                                               self.expert_action,
                                               reuse=True)
        normalized_q_with_expert_actor = critic(normalized_state,
                                                expert_actor,
                                                reuse=True)
        self.Q_with_expert_data = denormalize(
            tf.clip_by_value(normalized_q_with_expert_data, return_range[0],
                             return_range[1]), ret_rms)
        self.Q_with_expert_actor = denormalize(
            tf.clip_by_value(normalized_q_with_expert_actor, return_range[0],
                             return_range[1]), ret_rms)
        if supervise:
            self.actor_loss = tf.nn.l2_loss(self.expert_action - expert_actor)
            self.critic_loss = 0
        else:
            self.critic_loss = tf.reduce_mean(
                tf.nn.relu(self.Q_with_expert_actor - self.Q_with_expert_data))
            self.actor_loss = -tf.reduce_mean(self.Q_with_expert_actor)
            if critic_only:
                self.actor_loss = 0
            if actor_only:
                self.critic_loss = 0
        #self.dist = tf.reduce_mean(self.Q_with_expert_data - self.Q_with_expert_actor)
        if both_ours_sup:
            self.actor_loss = tf.nn.l2_loss(self.expert_action -
                                            expert_actor) - tf.reduce_mean(
                                                self.Q_with_expert_actor)
            self.critic_loss = tf.reduce_mean(
                tf.nn.relu(self.Q_with_expert_actor - self.Q_with_expert_data))

        if gail or pofd:
            discriminator = Discriminator()
            d_with_expert_data = discriminator(normalized_state,
                                               self.expert_action)
            d_with_gen_data = discriminator(obs0, actions, reuse=True)
            self.discriminator_loss = tf.reduce_mean(
                tf.log(d_with_gen_data)) + tf.reduce_mean(
                    tf.log(1 - d_with_expert_data))
            self.actor_loss = -tf.reduce_mean(tf.log(d_with_gen_data))
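
Discriminator is defined elsewhere; a minimal TF1 sketch of the kind of state-action classifier assumed by the GAIL/POfD branch above (the architecture is an assumption):

def discriminator_sketch(state, action, name='discriminator_sketch', reuse=False):
    # Assumed architecture only; the real Discriminator class is imported elsewhere.
    with tf.variable_scope(name, reuse=reuse):
        x = tf.concat([state, action], axis=-1)
        x = tf.layers.dense(x, 64, activation=tf.nn.tanh)
        x = tf.layers.dense(x, 64, activation=tf.nn.tanh)
        return tf.layers.dense(x, 1, activation=tf.nn.sigmoid)   # probability the pair is expert-like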
Example #5
def learn(
        network,
        env,
        data_path='',
        model_path='./model/',
        model_name='ddpg_none_fuzzy_150',
        file_name='test',
        model_based=False,
        memory_extend=False,
        model_type='linear',
        restore=False,
        dyna_learning=False,
        seed=None,
        nb_epochs=5,  # with default settings, perform 1M steps total
        nb_sample_cycle=5,
        nb_epoch_cycles=150,
        nb_rollout_steps=400,
        nb_model_learning=10,
        nb_sample_steps=50,
        nb_samples_extend=5,
        reward_scale=1.0,
        noise_type='normal_0.2',  #'adaptive-param_0.2',  ou_0.2, normal_0.2
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        batch_size=32,  # per MPI worker
        tau=0.01,
        param_noise_adaption_interval=50,
        **network_kwargs):

    nb_actions = env.action_space.shape[0]
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape[0],
                    observation_shape=env.observation_space.shape)

    if model_based:
        """ store fake_data"""
        fake_memory = Memory(limit=int(1e5),
                             action_shape=env.action_space.shape[0],
                             observation_shape=env.observation_space.shape)
        """ select model or not """
        if model_type == 'gp':
            kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
            dynamic_model = GaussianProcessRegressor(kernel=kernel)
            reward_model = GaussianProcessRegressor(kernel=kernel)
        elif model_type == 'linear':
            dynamic_model = LinearRegression()
            reward_model = LinearRegression()
        elif model_type == 'mlp':
            dynamic_model = MLPRegressor(hidden_layer_sizes=(100, ),
                                         activation='relu',
                                         solver='adam',
                                         alpha=0.0001,
                                         batch_size='auto',
                                         learning_rate='constant',
                                         learning_rate_init=0.001,
                                         power_t=0.5,
                                         max_iter=200,
                                         shuffle=True,
                                         random_state=None,
                                         tol=0.0001,
                                         verbose=False,
                                         warm_start=False,
                                         momentum=0.9,
                                         nesterovs_momentum=True,
                                         early_stopping=False,
                                         validation_fraction=0.1,
                                         beta_1=0.9,
                                         beta_2=0.999,
                                         epsilon=1e-08)
            reward_model = MLPRegressor(hidden_layer_sizes=(100, ),
                                        activation='relu',
                                        solver='adam',
                                        alpha=0.0001,
                                        batch_size='auto',
                                        learning_rate='constant',
                                        learning_rate_init=0.001,
                                        power_t=0.5,
                                        max_iter=200,
                                        shuffle=True,
                                        random_state=None,
                                        tol=0.0001,
                                        verbose=False,
                                        warm_start=False,
                                        momentum=0.9,
                                        nesterovs_momentum=True,
                                        early_stopping=False,
                                        validation_fraction=0.1,
                                        beta_1=0.9,
                                        beta_2=0.999,
                                        epsilon=1e-08)
        else:
            logger.info(
                "You need to give the model_type to fit the dynamic and reward!!!"
            )

    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)
    """ set noise """
    action_noise = None
    param_noise = None

    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))
    """action scale"""
    max_action = env.action_high_bound
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    """ agent ddpg """
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape[0],
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()

    if restore:
        agent.restore(sess, model_path, model_name)
    else:
        agent.initialize(sess)
        sess.graph.finalize()
    agent.reset()

    episodes = 0
    epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_states = []
    for epoch in range(nb_epochs):
        logger.info(
            "======================== The {} epoch start !!! ========================="
            .format(epoch))
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_times = []
        epoch_actions = []
        epoch_episode_states = []
        epoch_qs = []
        epoch_episodes = 0
        for cycle in range(nb_epoch_cycles):
            start_time = time.time()
            obs, state, done = env.reset()
            obs_reset = cp.deepcopy(obs)
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            logger.info(
                "================== The {} episode start !!! ==================="
                .format(cycle))
            for t_rollout in range(nb_rollout_steps):
                logger.info(
                    "================== The {} steps finish  !!! ==================="
                    .format(t_rollout))
                """ Predict next action """
                action, q, _, _ = agent.step(obs,
                                             stddev,
                                             apply_noise=True,
                                             compute_Q=True)

                new_obs, next_state, r, done, safe_or_not, final_action = env.step(
                    max_action * action, t_rollout)

                if safe_or_not is False:
                    break

                episode_reward += r
                episode_step += 1
                episode_states.append([
                    cp.deepcopy(state),
                    cp.deepcopy(final_action),
                    np.array(cp.deepcopy(r)),
                    cp.deepcopy(next_state)
                ])

                epoch_actions.append(action)
                epoch_qs.append(q)

                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs
                state = next_state

                if done:
                    break
                """ extend the memory """
                if model_based and cycle > (nb_model_learning +
                                            1) and memory_extend:
                    pred_x = np.zeros((1, 18), dtype=np.float32)
                    for j in range(nb_samples_extend):
                        m_action, _, _, _ = agent.step(obs,
                                                       stddev,
                                                       apply_noise=True,
                                                       compute_Q=False)
                        pred_x[:, :12] = obs
                        pred_x[:, 12:] = m_action
                        m_new_obs = dynamic_model.predict(pred_x)[0]
                        """ get real reward """
                        # state = env.inverse_state(m_new_obs)
                        # m_reward = env.get_reward(state, m_action)
                        m_reward = reward_model.predict(pred_x)[0]
                        agent.store_transition(obs, m_action, m_reward,
                                               m_new_obs, done)
            """ generate new data and fit model"""
            if model_based and cycle > nb_model_learning:
                logger.info(
                    "==============================  Model Fit !!! ==============================="
                )
                input_x = np.concatenate(
                    (memory.observations0.data[:memory.nb_entries],
                     memory.actions.data[:memory.nb_entries]),
                    axis=1)
                input_y_obs = memory.observations1.data[:memory.nb_entries]
                input_y_reward = memory.rewards.data[:memory.nb_entries]
                dynamic_model.fit(input_x, input_y_obs)
                reward_model.fit(input_x, input_y_reward)

                if dyna_learning:
                    logger.info(
                        "=========================  Collect data !!! ================================="
                    )
                    pred_obs = np.zeros((1, 18), dtype=np.float32)
                    for sample_index in range(nb_sample_cycle):
                        fake_obs = obs_reset
                        for t_episode in range(nb_sample_steps):
                            fake_action, _, _, _ = agent.step(fake_obs,
                                                              stddev,
                                                              apply_noise=True,
                                                              compute_Q=False)
                            pred_obs[:, :12] = fake_obs
                            pred_obs[:, 12:] = fake_action
                            next_fake_obs = dynamic_model.predict(pred_obs)[0]
                            fake_reward = reward_model.predict(pred_obs)[0]
                            # next_fake_obs = dynamic_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            # fake_reward = reward_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            fake_obs = next_fake_obs
                            fake_terminals = False
                            fake_memory.append(fake_obs, fake_action,
                                               fake_reward, next_fake_obs,
                                               fake_terminals)
            """ noise decay """
            stddev = float(stddev) * 0.95

            duration = time.time() - start_time
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            epoch_episode_times.append(cp.deepcopy(duration))
            epoch_episode_states.append(cp.deepcopy(episode_states))

            epochs_rewards[epoch, cycle] = episode_reward
            epochs_steps[epoch, cycle] = episode_step
            epochs_times[epoch, cycle] = cp.deepcopy(duration)

            logger.info(
                "============================= The Episode_Rewards:: {}!!! ============================"
                .format(epoch_episode_rewards))
            logger.info(
                "============================= The Episode_Times:: {}!!! ============================"
                .format(epoch_episode_times))

            epoch_episodes += 1
            episodes += 1
            """ Training process """
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                logger.info("")
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()
            """ planning training """
            if model_based and cycle > (nb_model_learning +
                                        1) and dyna_learning:
                for t_train in range(nb_train_steps):
                    # setting for adapt param noise, if necessary.
                    if fake_memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    batch = fake_memory.sample(batch_size=batch_size)
                    fake_cl, fake_al = agent.train_fake_data(batch)
                    epoch_critic_losses.append(fake_cl)
                    epoch_actor_losses.append(fake_al)
                    agent.update_target_net()

        epochs_states.append(cp.deepcopy(epoch_episode_states))

        # save data
        np.save(
            data_path + 'train_reward_' + algorithm_name + '_' + noise_type +
            file_name, epochs_rewards)
        np.save(
            data_path + 'train_step_' + algorithm_name + '_' + noise_type +
            file_name, epochs_steps)
        np.save(
            data_path + 'train_states_' + algorithm_name + '_' + noise_type +
            file_name, epochs_states)
        np.save(
            data_path + 'train_times_' + algorithm_name + '_' + noise_type +
            file_name, epochs_times)

    # agent save
    agent.store(model_path + 'train_model_' + algorithm_name + '_' +
                noise_type + file_name)
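
# A minimal sketch (not part of the original example) of how one of the arrays
# saved above might be reloaded for offline inspection; the path components mirror
# the np.save calls in the training loop, and file_name is assumed to already carry
# the '.npy' extension.
def load_train_rewards(data_path, algorithm_name, noise_type, file_name):
    return np.load(data_path + 'train_reward_' + algorithm_name + '_' +
                   noise_type + file_name)
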
Example No. 6
0
class DDPG(object):
    def __init__(self, **params):
        for k in params:
            setattr(self, k, params[k])
        self.init_args = copy(params)

        if self.her:
            # self.obs_to_goal = None
            # self.goal_idx = None
            # self.reward_fn = None
            self.memory = HERBuffer(limit=int(self.buffer_size),
                                    action_shape=self.action_shape,
                                    observation_shape=self.observation_shape,
                                    obs_to_goal=self.obs_to_goal,
                                    goal_slice=self.goal_idx,
                                    reward_fn=self.reward_fn)
        else:
            self.memory = Memory(limit=int(self.buffer_size),
                                 action_shape=self.action_shape,
                                 observation_shape=self.observation_shape)

        self.critic = Critic(layer_norm=self.layer_norm)
        self.actor = Actor(self.action_shape[-1], layer_norm=self.layer_norm)

        self.action_noise = NormalActionNoise(mu=np.zeros(self.action_shape),
                                              sigma=float(self.noise_sigma) *
                                              np.ones(self.action_shape))
        self.param_noise = None
        # Fixed batch used by get_stats(); initialized here and populated lazily on
        # the first call so the attribute always exists.
        self.stats_sample = None

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + self.observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + self.observation_shape,
                                   name='obs1')
        # self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + self.action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=self.observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0,
                                                           self.actor_tf,
                                                           reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
        # self.target_Q = self.rewards + (1. - self.terminals1) * self.gamma * Q_obs1
        self.target_Q = self.rewards + self.gamma * Q_obs1
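        # The terminal mask is dropped here (see the commented-out line above), so the
        # target bootstraps from Q_obs1 even on the last step of an episode.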

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
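        # Hard (init) and Polyak-averaged (soft) copies of the online weights into the
        # target networks: target <- tau * online + (1 - tau) * target.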
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
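        # Deterministic policy gradient objective: maximize Q(s, pi(s)) by minimizing
        # its negative mean over the batch.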
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
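        # Standard TD regression: clip the (normalized) bootstrap target fed in through
        # self.critic_target and minimize the squared error against the critic output.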
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
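        # Rescale the critic's output layer so the *denormalized* Q-values are unchanged
        # when the return statistics move:
        #   W <- W * old_std / new_std
        #   b <- (b * old_std + old_mean - new_mean) / new_std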
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

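        # With Pop-Art enabled, first refresh the return statistics with the new TD
        # targets and rescale the critic outputs; otherwise just compute the targets.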
        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    # self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(
                self.target_Q,
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    # self.terminals1: batch['terminals1'].astype('float32'),
                })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
        self.flush()

    def flush(self):
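        # Only the HER buffer keeps per-episode state that needs flushing at episode
        # end (presumably where hindsight goals are written back into replay memory).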
        if self.her:
            self.memory.flush()

    def get_save_tf(self):
        all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        return self.sess.run(all_variables)

    def restore_tf(self, save):
        all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        restore_ops = []
        for x, y in zip(all_variables, save):
            restore_ops.append(tf.assign(x, y))
        self.sess.run(restore_ops)

    def __getstate__(self):
        exclude_vars = set(["env"])
        args = {}
        for k in self.init_args:
            if k not in exclude_vars:
                args[k] = self.init_args[k]
        return {'tf': self.get_save_tf(), 'init': args}

    def __setstate__(self, state):
        self.__init__(**state['init'])

        self.sess = tf.InteractiveSession()  # for now just make ourselves a session
        self.sess.run(tf.global_variables_initializer())
        self.restore_tf(state['tf'])
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
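
# A minimal usage sketch (not part of the original example), assuming a Gym-style
# environment object and illustrative values for the keyword arguments consumed by
# DDPG.__init__ above; names such as `example_run`, `nb_steps` and `warmup` are
# hypothetical.
def example_run(env, nb_steps=1000, warmup=100):
    agent = DDPG(observation_shape=env.observation_space.shape,
                 action_shape=env.action_space.shape,
                 action_range=(env.action_space.low, env.action_space.high),
                 observation_range=(-5., 5.),
                 return_range=(-np.inf, np.inf),
                 buffer_size=1e6,
                 batch_size=64,
                 gamma=0.99,
                 tau=0.001,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 critic_l2_reg=1e-2,
                 layer_norm=True,
                 noise_sigma=0.2,
                 normalize_observations=True,
                 normalize_returns=False,
                 enable_popart=False,
                 her=False,
                 obs_to_goal=None,
                 goal_idx=None,
                 reward_fn=None)
    with tf.Session() as sess:
        agent.initialize(sess)
        obs = env.reset()
        for step in range(nb_steps):
            # Act with exploration noise, store the transition, and train once the
            # replay buffer has warmed up.
            action, _ = agent.pi(obs, apply_noise=True, compute_Q=False)
            new_obs, reward, done, _ = env.step(action)
            agent.store_transition(obs, action, reward, new_obs, done)
            obs = new_obs
            if done:
                agent.reset()
                obs = env.reset()
            if step >= warmup:
                critic_loss, actor_loss = agent.train()
                agent.update_target_net()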