Example #1
class PpoPlayerDiscrete(BasePlayer):
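    """Inference-time player for a discrete-action PPO policy (TensorFlow 1.x graph)."""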
    def __init__(self, sess, config):
        BasePlayer.__init__(self, sess, config)
        self.network = config['network']
        self.use_action_masks = config.get('use_action_masks', False)
        self.obs_ph = tf.placeholder(self.obs_space.dtype,
                                     (None, ) + self.obs_space.shape,
                                     name='obs')
        self.actions_num = self.action_space.n
        if self.use_action_masks:
            print('using masks for action')
            self.action_mask_ph = tf.placeholder('int32',
                                                 (None, self.actions_num),
                                                 name='actions_mask')
        else:
            self.action_mask_ph = None
        self.mask = [False] * self.num_agents
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)

        self.normalize_input = self.config['normalize_input']
        self.input_obs = self.obs_ph
        if self.obs_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0

        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=self.obs_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=False)

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': self.num_agents,
            'games_num': self.num_agents,
            'actions_num': self.actions_num,
            'prev_actions_ph': None,
            'action_mask_ph': self.action_mask_ph
        }
        self.last_state = None
        if self.network.is_rnn():
            self.neglop, self.value, self.action, _, self.states_ph, self.masks_ph, self.lstm_state, self.initial_state, self.logits = self.network(
                self.run_dict, reuse=False)
            self.last_state = self.initial_state * self.num_agents
        else:
            self.neglop, self.value, self.action, _, self.logits = self.network(
                self.run_dict, reuse=False)

        self.variables = TensorFlowVariables(
            [self.neglop, self.value, self.action], self.sess)

        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, obs, is_determenistic=True):
        ret_action = self.action

        if self.network.is_rnn():
            action, self.last_state, logits = self.sess.run(
                [ret_action, self.lstm_state, self.logits], {
                    self.obs_ph: obs,
                    self.states_ph: self.last_state,
                    self.masks_ph: self.mask
                })
        else:
            action, logits = self.sess.run([ret_action, self.logits],
                                           {self.obs_ph: obs})

        if is_determenistic:
            return np.argmax(logits, axis=-1).astype(np.int32)
        else:
            return int(np.squeeze(action))

    def get_masked_action(self, obs, mask, is_determenistic=False):
        ret_action = self.action

        if self.network.is_rnn():
            action, self.last_state, logits = self.sess.run(
                [ret_action, self.lstm_state, self.logits], {
                    self.action_mask_ph: mask,
                    self.obs_ph: obs,
                    self.states_ph: self.last_state,
                    self.masks_ph: self.mask
                })
        else:
            action, logits = self.sess.run([ret_action, self.logits], {
                self.action_mask_ph: mask,
                self.obs_ph: obs
            })
        if is_determenistic:
            logits = np.array(logits)
            return np.argmax(logits, axis=-1).astype(np.int32)
        else:
            return np.squeeze(action).astype(np.int32)

    def restore(self, fn):
        self.saver.restore(self.sess, fn)

    def reset(self):
        if self.network.is_rnn():
            self.last_state = self.initial_state
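
For reference, a minimal numpy-only sketch of the action-selection rule used by get_action above: the deterministic branch takes the argmax of the logits, while the stochastic branch uses the action sampled inside the TF graph (approximated here by sampling from the softmax of the logits; all numbers are made up).

import numpy as np

# Made-up logits for a batch of 2 observations and 4 discrete actions.
logits = np.array([[0.1, 2.0, -1.0, 0.3],
                   [1.5, 0.2, 0.0, 0.1]])

# Deterministic path: highest-scoring action per row, as in get_action(is_determenistic=True).
deterministic_actions = np.argmax(logits, axis=-1).astype(np.int32)

# Stochastic path: sample from the softmax over the logits.
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)
sampled_actions = np.array([np.random.choice(len(p), p=p) for p in probs])

print(deterministic_actions, sampled_actions)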
Example #2
class PpoPlayerContinuous(BasePlayer):
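    """Inference-time player for a continuous-action PPO policy (TensorFlow 1.x graph)."""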
    def __init__(self, sess, config):
        BasePlayer.__init__(self, sess, config)
        self.network = config['network']
        self.obs_ph = tf.placeholder('float32',
                                     (None, ) + self.obs_space.shape,
                                     name='obs')
        self.actions_num = self.action_space.shape[0]
        self.actions_low = self.action_space.low
        self.actions_high = self.action_space.high
        self.mask = [False]
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)

        self.normalize_input = self.config['normalize_input']
        self.input_obs = self.obs_ph

        if self.obs_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0

        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=self.obs_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=False)

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': 1,
            'games_num': 1,
            'actions_num': self.actions_num,
            'prev_actions_ph': None
        }
        self.last_state = None
        if self.network.is_rnn():
            self.neglop, self.value, self.action, _, self.mu, _, self.states_ph, self.masks_ph, self.lstm_state, self.initial_state = self.network(
                self.run_dict, reuse=False)
            self.last_state = self.initial_state
        else:
            self.neglop, self.value, self.action, _, self.mu, _ = self.network(
                self.run_dict, reuse=False)

        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, obs, is_determenistic=True):
        if is_determenistic:
            ret_action = self.mu
        else:
            ret_action = self.action

        if self.network.is_rnn():
            action, self.last_state = self.sess.run(
                [ret_action, self.lstm_state], {
                    self.obs_ph: obs,
                    self.states_ph: self.last_state,
                    self.masks_ph: self.mask
                })
        else:
            action = self.sess.run([ret_action], {self.obs_ph: obs})
        action = np.squeeze(action)
        return rescale_actions(self.actions_low, self.actions_high,
                               np.clip(action, -1.0, 1.0))

    def restore(self, fn):
        self.saver.restore(self.sess, fn)

    def reset(self):
        if self.network.is_rnn():
            self.last_state = self.initial_state
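
rescale_actions is imported from a helper module that is not part of this snippet. A minimal sketch of what such a helper is assumed to do, namely a linear map from the clipped [-1, 1] network output onto the action_space bounds; the function name and formula below are an assumption rather than the library's actual definition.

import numpy as np

def rescale_actions_sketch(low, high, action):
    # Linear map from [-1, 1] to [low, high], element-wise per action dimension.
    half_range = (high - low) / 2.0
    midpoint = (high + low) / 2.0
    return action * half_range + midpoint

# Example with a 3-dimensional Box action space.
low = np.array([-2.0, 0.0, -1.0])
high = np.array([2.0, 1.0, 1.0])
print(rescale_actions_sketch(low, high, np.array([0.5, -1.0, 0.0])))  # -> [1. 0. 0.]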
Example #3
class A2CAgent:
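    """Continuous-action actor-critic (A2C/PPO) agent: builds the TF1 graph in
    __init__, collects rollouts in play_steps() and optimizes them in train()."""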
    def __init__(self, sess, base_name, observation_space, action_space,
                 config):
        self.name = base_name
        self.actions_low = action_space.low
        self.actions_high = action_space.high
        self.env_name = config['env_name']
        self.ppo = config['ppo']
        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
        self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
        self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)

        self.e_clip = config['e_clip']
        self.clip_value = config['clip_value']
        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_actors = config['num_actors']
        self.env_config = config.get('env_config', {})
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors,
                                             **self.env_config)
        self.num_agents = self.vec_env.get_number_of_agents()
        self.steps_num = config['steps_num']
        self.normalize_advantage = config['normalize_advantage']
        self.config = config
        self.state_shape = observation_space.shape
        self.critic_coef = config['critic_coef']
        self.writer = SummaryWriter('runs/' + config['name'] +
                                    datetime.now().strftime("_%d-%H-%M-%S"))

        self.sess = sess
        self.grad_norm = config['grad_norm']
        self.gamma = self.config['gamma']
        self.tau = self.config['tau']
        self.normalize_input = self.config['normalize_input']
        self.seq_len = self.config['seq_length']
        self.dones = np.asarray([False] * self.num_actors, dtype=np.bool)

        self.current_rewards = np.asarray([0] * self.num_actors,
                                          dtype=np.float32)
        self.current_lengths = np.asarray([0] * self.num_actors,
                                          dtype=np.float32)
        self.game_rewards = deque([], maxlen=100)
        self.game_lengths = deque([], maxlen=100)

        self.obs_ph = tf.placeholder('float32', (None, ) + self.state_shape,
                                     name='obs')
        self.target_obs_ph = tf.placeholder('float32',
                                            (None, ) + self.state_shape,
                                            name='target_obs')
        self.actions_num = action_space.shape[0]
        self.actions_ph = tf.placeholder('float32',
                                         (None, ) + action_space.shape,
                                         name='actions')
        self.old_mu_ph = tf.placeholder('float32',
                                        (None, ) + action_space.shape,
                                        name='old_mu_ph')
        self.old_sigma_ph = tf.placeholder('float32',
                                           (None, ) + action_space.shape,
                                           name='old_sigma_ph')
        self.old_neglogp_actions_ph = tf.placeholder('float32', (None, ),
                                                     name='old_logpactions')
        self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
        self.old_values_ph = tf.placeholder('float32', (None, ),
                                            name='old_values')
        self.advantages_ph = tf.placeholder('float32', (None, ),
                                            name='advantages')
        self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)
        self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
        self.current_lr = self.learning_rate_ph

        self.bounds_loss_coef = config.get('bounds_loss_coef', None)

        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
        if self.is_polynom_decay_lr:
            self.lr_multiplier = tf.train.polynomial_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=config['max_epochs'],
                end_learning_rate=0.001,
                power=config.get('decay_power', 1.0))
        if self.is_exp_decay_lr:
            self.lr_multiplier = tf.train.exponential_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=config['max_epochs'],
                decay_rate=config['decay_rate'])

        self.input_obs = self.obs_ph
        self.input_target_obs = self.target_obs_ph

        if observation_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0
            self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0

        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=observation_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=True)
            self.input_target_obs = self.moving_mean_std.normalize(
                self.input_target_obs, train=False)

        # games_num is only used by the current RNN implementation.
        games_num = self.config['minibatch_size'] // self.seq_len

        self.train_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': self.config['minibatch_size'],
            'games_num': games_num,
            'actions_num': self.actions_num,
            'prev_actions_ph': self.actions_ph,
        }

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_target_obs,
            'batch_num': self.num_actors,
            'games_num': self.num_actors,
            'actions_num': self.actions_num,
            'prev_actions_ph': None,
        }

        self.states = None
        if self.network.is_rnn():
            self.neglogp_actions, self.state_values, self.action, self.entropy, self.mu, self.sigma, self.states_ph, self.masks_ph, self.lstm_state, self.initial_state = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.target_mu, self.target_sigma, self.target_states_ph, self.target_masks_ph, self.target_lstm_state, self.target_initial_state = self.network(
                self.run_dict, reuse=True)
            self.states = self.target_initial_state
        else:
            self.neglogp_actions, self.state_values, self.action, self.entropy, self.mu, self.sigma = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.target_mu, self.target_sigma = self.network(
                self.run_dict, reuse=True)

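        # The PPO clipping range is annealed with the same multiplier as the learning rate.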
        curr_e_clip = self.e_clip * self.lr_multiplier
        if (self.ppo):
            self.prob_ratio = tf.exp(self.old_neglogp_actions_ph -
                                     self.neglogp_actions)
            self.prob_ratio = tf.clip_by_value(self.prob_ratio, 0.0, 16.0)
            self.pg_loss_unclipped = -tf.multiply(self.advantages_ph,
                                                  self.prob_ratio)
            self.pg_loss_clipped = -tf.multiply(
                self.advantages_ph,
                tf.clip_by_value(self.prob_ratio, 1. - curr_e_clip,
                                 1. + curr_e_clip))
            self.actor_loss = tf.reduce_mean(
                tf.maximum(self.pg_loss_unclipped, self.pg_loss_clipped))
        else:
            self.actor_loss = tf.reduce_mean(self.neglogp_actions *
                                             self.advantages_ph)

        self.c_loss = (tf.squeeze(self.state_values) - self.rewards_ph)**2

        if self.clip_value:
            self.cliped_values = self.old_values_ph + tf.clip_by_value(
                tf.squeeze(self.state_values) - self.old_values_ph,
                -curr_e_clip, curr_e_clip)
            self.c_loss_clipped = tf.square(self.cliped_values -
                                            self.rewards_ph)
            self.critic_loss = tf.reduce_mean(
                tf.maximum(self.c_loss, self.c_loss_clipped))
        else:
            self.critic_loss = tf.reduce_mean(self.c_loss)

        self._calc_kl_dist()

        self.loss = self.actor_loss + 0.5 * self.critic_coef * self.critic_loss - self.config[
            'entropy_coef'] * self.entropy
        self._apply_bound_loss()
        self.reg_loss = tf.losses.get_regularization_loss()
        self.loss += self.reg_loss
        self.train_step = tf.train.AdamOptimizer(self.current_lr *
                                                 self.lr_multiplier)
        self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='agent')

        grads = tf.gradients(self.loss, self.weights)
        if self.config['truncate_grads']:
            grads, _ = tf.clip_by_global_norm(grads, self.grad_norm)
        grads = list(zip(grads, self.weights))

        self.train_op = self.train_step.apply_gradients(grads)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

    def _calc_kl_dist(self):
        self.kl_dist = policy_kl_tf(self.mu, self.sigma, self.old_mu_ph,
                                    self.old_sigma_ph)
        if self.is_adaptive_lr:
            self.current_lr = tf.where(
                self.kl_dist > (2.0 * self.lr_threshold),
                tf.maximum(self.current_lr / 1.5, 1e-6), self.current_lr)
            self.current_lr = tf.where(
                self.kl_dist < (0.5 * self.lr_threshold),
                tf.minimum(self.current_lr * 1.5, 1e-2), self.current_lr)

    def _apply_bound_loss(self):
        if self.bounds_loss_coef:
            soft_bound = 1.1
            mu_loss_high = tf.square(tf.maximum(0.0, self.mu - soft_bound))
            mu_loss_low = tf.square(tf.maximum(0.0, -soft_bound - self.mu))
            self.bounds_loss = tf.reduce_sum(mu_loss_high + mu_loss_low,
                                             axis=1)
            self.loss += self.bounds_loss * self.bounds_loss_coef
        else:
            self.bounds_loss = None

    def update_epoch(self):
        return self.sess.run([self.update_epoch_op])[0]

    def get_action_values(self, obs):
        run_ops = [
            self.target_action, self.target_state_values, self.target_neglogp,
            self.target_mu, self.target_sigma
        ]
        if self.network.is_rnn():
            run_ops.append(self.target_lstm_state)
            return self.sess.run(
                run_ops, {
                    self.target_obs_ph: obs,
                    self.target_states_ph: self.states,
                    self.target_masks_ph: self.dones
                })
        else:
            return (*self.sess.run(run_ops, {self.target_obs_ph: obs}), None)

    def get_values(self, obs):
        if self.network.is_rnn():
            return self.sess.run(
                [self.target_state_values], {
                    self.target_obs_ph: obs,
                    self.target_states_ph: self.states,
                    self.target_masks_ph: self.dones
                })
        else:
            return self.sess.run([self.target_state_values],
                                 {self.target_obs_ph: obs})

    def play_steps(self):
        # Initialize the per-step buffers that will hold this rollout's experience.
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs, mb_mus, mb_sigmas = [], [], [], [], [], [], [], []
        mb_states = []
        epinfos = []
        # Collect steps_num environment steps.
        for _ in range(self.steps_num):
            if self.network.is_rnn():
                mb_states.append(self.states)
            actions, values, neglogpacs, mu, sigma, self.states = self.get_action_values(
                self.obs)
            values = np.squeeze(values)
            neglogpacs = np.squeeze(neglogpacs)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones.copy())
            mb_mus.append(mu)
            mb_sigmas.append(sigma)

            self.obs[:], rewards, self.dones, infos = self.vec_env.step(
                rescale_actions(self.actions_low, self.actions_high,
                                np.clip(actions, -1.0, 1.0)))
            self.current_rewards += rewards
            self.current_lengths += 1

            for reward, length, done in zip(self.current_rewards,
                                            self.current_lengths, self.dones):
                if done:
                    self.game_rewards.append(reward)
                    self.game_lengths.append(length)

            shaped_rewards = self.rewards_shaper(rewards)
            epinfos.append(infos)
            mb_rewards.append(shaped_rewards)

            self.current_rewards = self.current_rewards * (1.0 - self.dones)
            self.current_lengths = self.current_lengths * (1.0 - self.dones)

        # Convert the collected lists to arrays (OpenAI Baselines-style rollout batching).
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions, dtype=np.float32)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_mus = np.asarray(mb_mus, dtype=np.float32)
        mb_sigmas = np.asarray(mb_sigmas, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        mb_states = np.asarray(mb_states, dtype=np.float32)

        last_values = self.get_values(self.obs)
        last_values = np.squeeze(last_values)

        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0

        for t in reversed(range(self.steps_num)):
            if t == self.steps_num - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]

            delta = mb_rewards[
                t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[
                t] = lastgaelam = delta + self.gamma * self.tau * nextnonterminal * lastgaelam

        mb_returns = mb_advs + mb_values
        if self.network.is_rnn():
            result = (*map(
                swap_and_flatten01,
                (mb_obs, mb_returns, mb_dones, mb_actions, mb_values,
                 mb_neglogpacs, mb_mus, mb_sigmas, mb_states)), epinfos)
        else:
            result = (*map(swap_and_flatten01,
                           (mb_obs, mb_returns, mb_dones, mb_actions,
                            mb_values, mb_neglogpacs, mb_mus, mb_sigmas)),
                      None, epinfos)

        return result

    def save(self, fn):
        self.saver.save(self.sess, fn)

    def restore(self, fn):
        self.saver.restore(self.sess, fn)

    def train(self):
        max_epochs = self.config.get('max_epochs', 1e6)
        self.obs = self.vec_env.reset()
        batch_size = self.steps_num * self.num_actors * self.num_agents
        minibatch_size = self.config['minibatch_size']
        mini_epochs_num = self.config['mini_epochs']
        num_minibatches = batch_size // minibatch_size
        last_lr = self.config['learning_rate']

        self.last_mean_rewards = -100500

        epoch_num = 0
        frame = 0
        update_time = 0
        play_time = 0
        start_time = time.time()
        total_time = 0

        while True:
            play_time_start = time.time()
            epoch_num = self.update_epoch()
            frame += batch_size
            obses, returns, dones, actions, values, neglogpacs, mus, sigmas, lstm_states, _ = self.play_steps(
            )
            advantages = returns - values

            if self.normalize_advantage:
                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)

            a_losses = []
            c_losses = []
            b_losses = []
            entropies = []
            kls = []
            play_time_end = time.time()
            play_time = play_time_end - play_time_start
            update_time_start = time.time()

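            # RNN case: shuffle whole sequences ("games") so hidden states stay
            # aligned with their trajectories.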
            if self.network.is_rnn():
                total_games = batch_size // self.seq_len
                num_games_batch = minibatch_size // self.seq_len
                game_indexes = np.arange(total_games)
                flat_indexes = np.arange(total_games * self.seq_len).reshape(
                    total_games, self.seq_len)
                lstm_states = lstm_states[::self.seq_len]

                for _ in range(0, mini_epochs_num):
                    np.random.shuffle(game_indexes)

                    for i in range(0, num_minibatches):
                        batch = range(i * num_games_batch,
                                      (i + 1) * num_games_batch)
                        mb_indexes = game_indexes[batch]
                        mbatch = flat_indexes[mb_indexes].ravel()

                        feed_dict = {}
                        feed_dict[self.old_values_ph] = values[mbatch]
                        feed_dict[self.old_neglogp_actions_ph] = neglogpacs[mbatch]
                        feed_dict[self.advantages_ph] = advantages[mbatch]
                        feed_dict[self.rewards_ph] = returns[mbatch]
                        feed_dict[self.actions_ph] = actions[mbatch]
                        feed_dict[self.obs_ph] = obses[mbatch]
                        feed_dict[self.old_mu_ph] = mus[mbatch]
                        feed_dict[self.old_sigma_ph] = sigmas[mbatch]
                        feed_dict[self.masks_ph] = dones[mbatch]
                        feed_dict[self.states_ph] = lstm_states[mb_indexes]

                        feed_dict[self.learning_rate_ph] = last_lr
                        run_ops = [
                            self.actor_loss, self.critic_loss, self.entropy,
                            self.kl_dist, self.current_lr, self.mu, self.sigma,
                            self.lr_multiplier
                        ]
                        if self.bounds_loss is not None:
                            run_ops.append(self.bounds_loss)

                        run_ops.append(self.train_op)
                        run_ops.append(
                            tf.get_collection(tf.GraphKeys.UPDATE_OPS))

                        res_dict = self.sess.run(run_ops, feed_dict)
                        a_loss = res_dict[0]
                        c_loss = res_dict[1]
                        entropy = res_dict[2]
                        kl = res_dict[3]
                        last_lr = res_dict[4]
                        cmu = res_dict[5]
                        csigma = res_dict[6]
                        lr_mul = res_dict[7]
                        if self.bounds_loss is not None:
                            b_loss = res_dict[8]
                            b_losses.append(b_loss)

                        mus[mbatch] = cmu
                        sigmas[mbatch] = csigma
                        a_losses.append(a_loss)
                        c_losses.append(c_loss)
                        kls.append(kl)
                        entropies.append(entropy)
            else:
                for _ in range(0, mini_epochs_num):
                    permutation = np.random.permutation(batch_size)

                    obses = obses[permutation]
                    returns = returns[permutation]
                    actions = actions[permutation]
                    values = values[permutation]
                    neglogpacs = neglogpacs[permutation]
                    advantages = advantages[permutation]
                    mus = mus[permutation]
                    sigmas = sigmas[permutation]

                    for i in range(0, num_minibatches):
                        batch = range(i * minibatch_size,
                                      (i + 1) * minibatch_size)
                        feed_dict = {
                            self.obs_ph: obses[batch],
                            self.actions_ph: actions[batch],
                            self.rewards_ph: returns[batch],
                            self.advantages_ph: advantages[batch],
                            self.old_neglogp_actions_ph: neglogpacs[batch],
                            self.old_values_ph: values[batch]
                        }

                        feed_dict[self.old_mu_ph] = mus[batch]
                        feed_dict[self.old_sigma_ph] = sigmas[batch]
                        feed_dict[self.learning_rate_ph] = last_lr
                        run_ops = [
                            self.actor_loss, self.critic_loss, self.entropy,
                            self.kl_dist, self.current_lr, self.mu, self.sigma,
                            self.lr_multiplier
                        ]
                        if self.bounds_loss is not None:
                            run_ops.append(self.bounds_loss)

                        run_ops.append(self.train_op)
                        run_ops.append(
                            tf.get_collection(tf.GraphKeys.UPDATE_OPS))

                        res_dict = self.sess.run(run_ops, feed_dict)
                        a_loss = res_dict[0]
                        c_loss = res_dict[1]
                        entropy = res_dict[2]
                        kl = res_dict[3]
                        last_lr = res_dict[4]
                        cmu = res_dict[5]
                        csigma = res_dict[6]
                        lr_mul = res_dict[7]
                        if self.bounds_loss is not None:
                            b_loss = res_dict[8]
                            b_losses.append(b_loss)
                        mus[batch] = cmu
                        sigmas[batch] = csigma
                        a_losses.append(a_loss)
                        c_losses.append(c_loss)
                        kls.append(kl)
                        entropies.append(entropy)

            update_time_end = time.time()
            update_time = update_time_end - update_time_start
            sum_time = update_time + play_time

            total_time = update_time_end - start_time

            if self.rank == 0:
                scaled_time = sum_time  # self.num_agents *
                scaled_play_time = play_time  # self.num_agents *
                if self.print_stats:
                    fps_step = batch_size / scaled_play_time
                    fps_total = batch_size / scaled_time
                    print(
                        f'fps step: {fps_step:.1f} fps total: {fps_total:.1f}')

                # performance
                self.writer.add_scalar('performance/total_fps',
                                       batch_size / sum_time, frame)
                self.writer.add_scalar('performance/step_fps',
                                       batch_size / play_time, frame)
                self.writer.add_scalar('performance/play_time', play_time,
                                       frame)
                self.writer.add_scalar('performance/update_time', update_time,
                                       frame)

                # losses
                self.writer.add_scalar('losses/a_loss', np.mean(a_losses),
                                       frame)
                self.writer.add_scalar('losses/c_loss', np.mean(c_losses),
                                       frame)
                if len(b_losses) > 0:
                    self.writer.add_scalar('losses/bounds_loss',
                                           np.mean(b_losses), frame)
                self.writer.add_scalar('losses/entropy', np.mean(entropies),
                                       frame)

                # info
                self.writer.add_scalar('info/last_lr', last_lr * lr_mul, frame)
                self.writer.add_scalar('info/lr_mul', lr_mul, frame)
                self.writer.add_scalar('info/e_clip', self.e_clip * lr_mul,
                                       frame)
                self.writer.add_scalar('info/kl', np.mean(kls), frame)
                self.writer.add_scalar('info/epochs', epoch_num, frame)

                if len(self.game_rewards) > 0:
                    mean_rewards = np.mean(self.game_rewards)
                    mean_lengths = np.mean(self.game_lengths)
                    self.writer.add_scalar('rewards/frame', mean_rewards,
                                           frame)
                    self.writer.add_scalar('rewards/time', mean_rewards,
                                           total_time)
                    self.writer.add_scalar('episode_lengths/frame',
                                           mean_lengths, frame)
                    self.writer.add_scalar('episode_lengths/time',
                                           mean_lengths, total_time)

                    if mean_rewards > self.last_mean_rewards:
                        print('saving next best rewards: ', mean_rewards)
                        self.last_mean_rewards = mean_rewards
                        self.save("./nn/" + self.name)
                        if self.last_mean_rewards > self.config['score_to_win']:
                            self.save("./nn/" + self.config['name'] + 'ep=' +
                                      str(epoch_num) + 'rew=' +
                                      str(mean_rewards))
                            return self.last_mean_rewards, epoch_num

                if epoch_num > max_epochs:
                    print('MAX EPOCHS NUM!')
                    self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' +
                              str(epoch_num) + 'rew=' + str(mean_rewards))
                    return self.last_mean_rewards, epoch_num
                update_time = 0
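
The advantage and return computation at the end of play_steps() is standard GAE(lambda), with tau playing the role of lambda. Below is a self-contained numpy sketch of the same backward recursion; the function name and the toy inputs are illustrative only.

import numpy as np

def gae_sketch(rewards, values, dones, last_values, last_dones, gamma, tau):
    # Walk the rollout backwards, accumulating discounted TD residuals,
    # exactly as the loop in play_steps() does.
    steps_num = rewards.shape[0]
    advs = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(steps_num)):
        if t == steps_num - 1:
            nextnonterminal = 1.0 - last_dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * tau * nextnonterminal * lastgaelam
    return advs + values, advs  # returns, advantages

# Toy rollout: 4 steps, 2 actors, no episode terminations.
rewards = np.random.randn(4, 2).astype(np.float32)
values = np.random.randn(4, 2).astype(np.float32)
dones = np.zeros((4, 2), dtype=np.float32)
returns, advantages = gae_sketch(rewards, values, dones,
                                 last_values=np.zeros(2, dtype=np.float32),
                                 last_dones=np.zeros(2, dtype=np.float32),
                                 gamma=0.99, tau=0.95)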
Example #4
    def __init__(self, sess, base_name, observation_space, action_space,
                 config):
        observation_shape = observation_space.shape
        self.use_action_masks = config.get('use_action_masks', False)
        self.is_train = config.get('is_train', True)
        self.self_play = config.get('self_play', False)
        self.name = base_name
        self.config = config
        self.env_name = config['env_name']
        self.ppo = config['ppo']
        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
        self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
        self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)

        self.e_clip = config['e_clip']
        self.clip_value = config['clip_value']
        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_actors = config['num_actors']
        self.env_config = self.config.get('env_config', {})
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors,
                                             **self.env_config)
        self.num_agents = self.vec_env.get_number_of_agents()
        self.steps_num = config['steps_num']
        self.seq_len = self.config['seq_length']
        self.normalize_advantage = config['normalize_advantage']
        self.normalize_input = self.config['normalize_input']

        self.state_shape = observation_shape
        self.critic_coef = config['critic_coef']
        self.writer = SummaryWriter('runs/' + config['name'] +
                                    datetime.now().strftime("_%d-%H-%M-%S"))
        self.sess = sess
        self.grad_norm = config['grad_norm']
        self.gamma = self.config['gamma']
        self.tau = self.config['tau']

        self.ignore_dead_batches = self.config.get('ignore_dead_batches',
                                                   False)

        self.dones = np.asarray([False] * self.num_actors * self.num_agents,
                                dtype=np.bool)
        self.current_rewards = np.asarray([0] * self.num_actors *
                                          self.num_agents,
                                          dtype=np.float32)
        self.current_lengths = np.asarray([0] * self.num_actors *
                                          self.num_agents,
                                          dtype=np.float32)
        self.games_to_track = self.config.get('games_to_track', 100)
        self.game_rewards = deque([], maxlen=self.games_to_track)
        self.game_lengths = deque([], maxlen=self.games_to_track)
        self.game_scores = deque([], maxlen=self.games_to_track)
        self.obs_ph = tf.placeholder(observation_space.dtype,
                                     (None, ) + observation_shape,
                                     name='obs')
        self.target_obs_ph = tf.placeholder(observation_space.dtype,
                                            (None, ) + observation_shape,
                                            name='target_obs')
        self.actions_num = action_space.n
        self.actions_ph = tf.placeholder('int32', (None, ), name='actions')
        if self.use_action_masks:
            self.action_mask_ph = tf.placeholder('int32',
                                                 (None, self.actions_num),
                                                 name='actions_mask')
        else:
            self.action_mask_ph = None

        self.old_logp_actions_ph = tf.placeholder('float32', (None, ),
                                                  name='old_logpactions')
        self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
        self.old_values_ph = tf.placeholder('float32', (None, ),
                                            name='old_values')
        self.advantages_ph = tf.placeholder('float32', (None, ),
                                            name='advantages')
        self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')

        self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
        self.current_lr = self.learning_rate_ph

        self.input_obs = self.obs_ph
        self.input_target_obs = self.target_obs_ph

        if observation_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0
            self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0

        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
        if self.is_polynom_decay_lr:
            self.lr_multiplier = tf.train.polynomial_decay(
                1.0,
                self.epoch_num,
                config['max_epochs'],
                end_learning_rate=0.001,
                power=tr_helpers.get_or_default(config, 'decay_power', 1.0))
        if self.is_exp_decay_lr:
            self.lr_multiplier = tf.train.exponential_decay(
                1.0,
                self.epoch_num,
                config['max_epochs'],
                decay_rate=config['decay_rate'])
        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=observation_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=True)
            self.input_target_obs = self.moving_mean_std.normalize(
                self.input_target_obs, train=False)

        # games_num is only used by the current RNN implementation.
        games_num = self.config['minibatch_size'] // self.seq_len

        self.train_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': self.config['minibatch_size'],
            'games_num': games_num,
            'actions_num': self.actions_num,
            'prev_actions_ph': self.actions_ph,
            'action_mask_ph': None
        }

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_target_obs,
            'batch_num': self.num_actors * self.num_agents,
            'games_num': self.num_actors * self.num_agents,
            'actions_num': self.actions_num,
            'prev_actions_ph': None,
            'action_mask_ph': self.action_mask_ph
        }

        self.states = None
        if self.network.is_rnn():
            self.logp_actions, self.state_values, self.action, self.entropy, self.states_ph, self.masks_ph, self.lstm_state, self.initial_state = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.target_states_ph, self.target_masks_ph, self.target_lstm_state, self.target_initial_state, self.logits = self.network(
                self.run_dict, reuse=True)
            self.states = self.target_initial_state
        else:
            self.logp_actions, self.state_values, self.action, self.entropy = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.logits = self.network(
                self.run_dict, reuse=True)

        self.saver = tf.train.Saver()
        self.variables = TensorFlowVariables([
            self.target_action, self.target_state_values, self.target_neglogp
        ], self.sess)

        if self.is_train:
            self.setup_losses()

        self.sess.run(tf.global_variables_initializer())
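
When use_action_masks is enabled, the extra int32 placeholder holds one 0/1 entry per action for each environment. This snippet does not show how the network consumes the mask; a common scheme, sketched below with made-up numbers, is to push the logits of invalid actions towards minus infinity before the argmax or sampling step.

import numpy as np

# Made-up logits for 2 environments and 4 discrete actions; zeros in the mask
# mark actions that are currently unavailable.
logits = np.array([[0.5, 1.2, -0.3, 2.0],
                   [1.0, 0.0, 0.7, 0.2]])
mask = np.array([[1, 1, 0, 0],
                 [0, 1, 1, 1]], dtype=np.int32)

# Suppress invalid actions, then pick greedily per row.
masked_logits = np.where(mask.astype(bool), logits, -1e9)
print(np.argmax(masked_logits, axis=-1))  # -> [1 2]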