def __init__(self, sess, base_name, observation_space, action_space, config):
    self.name = base_name
    self.actions_low = action_space.low
    self.actions_high = action_space.high
    self.env_name = config['env_name']
    self.ppo = config['ppo']
    self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
    self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
    self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
    self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)
    self.e_clip = config['e_clip']
    self.clip_value = config['clip_value']
    self.network = config['network']
    self.rewards_shaper = config['reward_shaper']
    self.num_actors = config['num_actors']
    self.env_config = config.get('env_config', {})
    self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors, **self.env_config)
    self.num_agents = self.vec_env.get_number_of_agents()
    self.steps_num = config['steps_num']
    self.normalize_advantage = config['normalize_advantage']
    self.config = config
    self.state_shape = observation_space.shape
    self.critic_coef = config['critic_coef']
    self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))
    self.sess = sess
    self.grad_norm = config['grad_norm']
    self.gamma = self.config['gamma']
    self.tau = self.config['tau']
    self.normalize_input = self.config['normalize_input']
    self.seq_len = self.config['seq_length']

    self.dones = np.asarray([False] * self.num_actors, dtype=np.bool)
    self.current_rewards = np.asarray([0] * self.num_actors, dtype=np.float32)
    self.current_lengths = np.asarray([0] * self.num_actors, dtype=np.float32)
    self.game_rewards = deque([], maxlen=100)
    self.game_lengths = deque([], maxlen=100)

    # Placeholders for observations, continuous actions and PPO statistics.
    self.obs_ph = tf.placeholder('float32', (None, ) + self.state_shape, name='obs')
    self.target_obs_ph = tf.placeholder('float32', (None, ) + self.state_shape, name='target_obs')
    self.actions_num = action_space.shape[0]
    self.actions_ph = tf.placeholder('float32', (None, ) + action_space.shape, name='actions')
    self.old_mu_ph = tf.placeholder('float32', (None, ) + action_space.shape, name='old_mu_ph')
    self.old_sigma_ph = tf.placeholder('float32', (None, ) + action_space.shape, name='old_sigma_ph')
    self.old_neglogp_actions_ph = tf.placeholder('float32', (None, ), name='old_logpactions')
    self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
    self.old_values_ph = tf.placeholder('float32', (None, ), name='old_values')
    self.advantages_ph = tf.placeholder('float32', (None, ), name='advantages')
    self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')
    self.epoch_num = tf.Variable(tf.constant(0, shape=(), dtype=tf.float32), trainable=False)
    self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
    self.current_lr = self.learning_rate_ph
    self.bounds_loss_coef = config.get('bounds_loss_coef', None)

    # Learning-rate schedule.
    if self.is_adaptive_lr:
        self.lr_threshold = config['lr_threshold']
    if self.is_polynom_decay_lr:
        self.lr_multiplier = tf.train.polynomial_decay(
            1.0,
            global_step=self.epoch_num,
            decay_steps=config['max_epochs'],
            end_learning_rate=0.001,
            power=config.get('decay_power', 1.0))
    if self.is_exp_decay_lr:
        self.lr_multiplier = tf.train.exponential_decay(
            1.0,
            global_step=self.epoch_num,
            decay_steps=config['max_epochs'],
            decay_rate=config['decay_rate'])

    self.input_obs = self.obs_ph
    self.input_target_obs = self.target_obs_ph
    if observation_space.dtype == np.uint8:
        self.input_obs = tf.to_float(self.input_obs) / 255.0
        self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0
    if self.normalize_input:
        self.moving_mean_std = MovingMeanStd(shape=observation_space.shape, epsilon=1e-5, decay=0.99)
        self.input_obs = self.moving_mean_std.normalize(self.input_obs, train=True)
        self.input_target_obs = self.moving_mean_std.normalize(self.input_target_obs, train=False)

    games_num = self.config['minibatch_size'] // self.seq_len  # used only by the current rnn implementation

    self.train_dict = {
        'name': 'agent',
        'inputs': self.input_obs,
        'batch_num': self.config['minibatch_size'],
        'games_num': games_num,
        'actions_num': self.actions_num,
        'prev_actions_ph': self.actions_ph,
    }

    self.run_dict = {
        'name': 'agent',
        'inputs': self.input_target_obs,
        'batch_num': self.num_actors,
        'games_num': self.num_actors,
        'actions_num': self.actions_num,
        'prev_actions_ph': None,
    }

    self.states = None
    if self.network.is_rnn():
        self.neglogp_actions, self.state_values, self.action, self.entropy, self.mu, self.sigma, \
            self.states_ph, self.masks_ph, self.lstm_state, self.initial_state = self.network(self.train_dict, reuse=False)
        self.target_neglogp, self.target_state_values, self.target_action, _, self.target_mu, self.target_sigma, \
            self.target_states_ph, self.target_masks_ph, self.target_lstm_state, self.target_initial_state = self.network(self.run_dict, reuse=True)
        self.states = self.target_initial_state
    else:
        self.neglogp_actions, self.state_values, self.action, self.entropy, self.mu, self.sigma = self.network(self.train_dict, reuse=False)
        self.target_neglogp, self.target_state_values, self.target_action, _, self.target_mu, self.target_sigma = self.network(self.run_dict, reuse=True)

    curr_e_clip = self.e_clip * self.lr_multiplier

    # Actor loss: clipped PPO surrogate, or plain policy gradient when ppo is disabled.
    if self.ppo:
        self.prob_ratio = tf.exp(self.old_neglogp_actions_ph - self.neglogp_actions)
        self.prob_ratio = tf.clip_by_value(self.prob_ratio, 0.0, 16.0)
        self.pg_loss_unclipped = -tf.multiply(self.advantages_ph, self.prob_ratio)
        self.pg_loss_clipped = -tf.multiply(self.advantages_ph,
                                            tf.clip_by_value(self.prob_ratio, 1. - curr_e_clip, 1. + curr_e_clip))
        self.actor_loss = tf.reduce_mean(tf.maximum(self.pg_loss_unclipped, self.pg_loss_clipped))
    else:
        self.actor_loss = tf.reduce_mean(self.neglogp_actions * self.advantages_ph)

    # Critic loss, optionally clipped around the old value predictions.
    self.c_loss = (tf.squeeze(self.state_values) - self.rewards_ph)**2
    if self.clip_value:
        self.cliped_values = self.old_values_ph + tf.clip_by_value(
            tf.squeeze(self.state_values) - self.old_values_ph, -curr_e_clip, curr_e_clip)
        self.c_loss_clipped = tf.square(self.cliped_values - self.rewards_ph)
        self.critic_loss = tf.reduce_mean(tf.maximum(self.c_loss, self.c_loss_clipped))
    else:
        self.critic_loss = tf.reduce_mean(self.c_loss)

    self._calc_kl_dist()

    self.loss = self.actor_loss + 0.5 * self.critic_coef * self.critic_loss - self.config['entropy_coef'] * self.entropy
    self._apply_bound_loss()
    self.reg_loss = tf.losses.get_regularization_loss()
    self.loss += self.reg_loss

    self.train_step = tf.train.AdamOptimizer(self.current_lr * self.lr_multiplier)
    self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent')

    grads = tf.gradients(self.loss, self.weights)
    if self.config['truncate_grads']:
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm)
    grads = list(zip(grads, self.weights))
    self.train_op = self.train_step.apply_gradients(grads)

    self.saver = tf.train.Saver()
    self.sess.run(tf.global_variables_initializer())
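# Illustrative sketch (not part of the original source): the clipped PPO surrogate
# built in the graph above, written out in plain NumPy for readability. The names
# `advantages`, `old_neglogp`, `neglogp`, and `e_clip` stand in for the corresponding
# placeholders/tensors; shapes and values are assumptions.
#
#     import numpy as np
#
#     def ppo_actor_loss(advantages, old_neglogp, neglogp, e_clip):
#         # ratio = pi_new(a|s) / pi_old(a|s), recovered from negative log-probs
#         ratio = np.exp(old_neglogp - neglogp)
#         ratio = np.clip(ratio, 0.0, 16.0)
#         loss_unclipped = -advantages * ratio
#         loss_clipped = -advantages * np.clip(ratio, 1.0 - e_clip, 1.0 + e_clip)
#         # element-wise max of the two surrogates, then mean over the batch
#         return np.maximum(loss_unclipped, loss_clipped).mean()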
def base_init(self, base_name, config):
    self.env_config = config.get('env_config', {})
    self.num_actors = config.get('num_actors', 1)
    self.env_name = config['env_name']
    print("Env name:", self.env_name)

    self.env_info = config.get('env_info')
    if self.env_info is None:
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors, **self.env_config)
        self.env_info = self.vec_env.get_env_info()

    self.sac_device = config.get('device', 'cuda:0')
    # temporary:
    self.ppo_device = self.sac_device
    print('Env info:')
    print(self.env_info)

    self.rewards_shaper = config['reward_shaper']
    self.observation_space = self.env_info['observation_space']
    self.weight_decay = config.get('weight_decay', 0.0)
    # self.use_action_masks = config.get('use_action_masks', False)
    self.is_train = config.get('is_train', True)

    self.c_loss = nn.MSELoss()
    # self.c2_loss = nn.SmoothL1Loss()

    self.save_best_after = config.get('save_best_after', 500)
    self.print_stats = config.get('print_stats', True)
    self.rnn_states = None
    self.name = base_name

    self.max_epochs = self.config.get('max_epochs', 1e6)

    self.network = config['network']
    self.rewards_shaper = config['reward_shaper']
    self.num_agents = self.env_info.get('agents', 1)
    self.obs_shape = self.observation_space.shape

    self.games_to_track = self.config.get('games_to_track', 100)
    self.game_rewards = torch_ext.AverageMeter(1, self.games_to_track).to(self.sac_device)
    self.game_lengths = torch_ext.AverageMeter(1, self.games_to_track).to(self.sac_device)
    self.obs = None

    self.min_alpha = torch.tensor(np.log(1)).float().to(self.sac_device)

    self.frame = 0
    self.update_time = 0
    self.last_mean_rewards = -100500
    self.play_time = 0
    self.epoch_num = 0

    self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))
    print("Run Directory:", config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))

    self.is_tensor_obses = None
    self.is_rnn = False
    self.last_rnn_indices = None
    self.last_state_indices = None
def __init__(self, base_name, config):
    self.config = config
    self.env_config = config.get('env_config', {})
    self.num_actors = config['num_actors']
    self.env_name = config['env_name']

    self.env_info = config.get('env_info')
    if self.env_info is None:
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors, **self.env_config)
        self.env_info = self.vec_env.get_env_info()

    self.ppo_device = config.get('device', 'cuda:0')
    print('Env info:')
    print(self.env_info)

    self.value_size = self.env_info.get('value_size', 1)
    self.observation_space = self.env_info['observation_space']
    self.weight_decay = config.get('weight_decay', 0.0)
    self.use_action_masks = config.get('use_action_masks', False)
    self.is_train = config.get('is_train', True)

    self.central_value_config = self.config.get('central_value_config', None)
    self.has_central_value = self.central_value_config is not None
    if self.has_central_value:
        self.state_space = self.env_info.get('state_space', None)
        self.state_shape = None
        if self.state_space.shape != None:
            self.state_shape = self.state_space.shape

    self.self_play_config = self.config.get('self_play_config', None)
    self.has_self_play_config = self.self_play_config is not None

    self.self_play = config.get('self_play', False)
    self.save_freq = config.get('save_frequency', 0)
    self.save_best_after = config.get('save_best_after', 100)
    self.print_stats = config.get('print_stats', True)
    self.rnn_states = None
    self.name = base_name

    self.ppo = config['ppo']
    self.max_epochs = self.config.get('max_epochs', 1e6)

    # Learning-rate schedule selection.
    self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
    self.linear_lr = config['lr_schedule'] == 'linear'
    self.schedule_type = config.get('schedule_type', 'legacy')
    if self.is_adaptive_lr:
        self.lr_threshold = config['lr_threshold']
        self.scheduler = schedulers.AdaptiveScheduler(self.lr_threshold)
    elif self.linear_lr:
        self.scheduler = schedulers.LinearScheduler(float(config['learning_rate']),
                                                    max_steps=self.max_epochs,
                                                    apply_to_entropy=config.get('schedule_entropy', False),
                                                    start_entropy_coef=config.get('entropy_coef'))
    else:
        self.scheduler = schedulers.IdentityScheduler()

    self.e_clip = config['e_clip']
    self.clip_value = config['clip_value']
    self.network = config['network']
    self.rewards_shaper = config['reward_shaper']
    self.num_agents = self.env_info.get('agents', 1)
    self.steps_num = config['steps_num']
    self.seq_len = self.config.get('seq_length', 4)
    self.normalize_advantage = config['normalize_advantage']
    self.normalize_input = self.config['normalize_input']
    self.normalize_value = self.config.get('normalize_value', False)

    self.obs_shape = self.observation_space.shape
    self.critic_coef = config['critic_coef']
    self.grad_norm = config['grad_norm']
    self.gamma = self.config['gamma']
    self.tau = self.config['tau']

    self.games_to_track = self.config.get('games_to_track', 100)
    self.game_rewards = torch_ext.AverageMeter(self.value_size, self.games_to_track).to(self.ppo_device)
    self.game_lengths = torch_ext.AverageMeter(1, self.games_to_track).to(self.ppo_device)
    self.obs = None
    self.games_num = self.config['minibatch_size'] // self.seq_len  # used only by the current rnn implementation

    # Batch bookkeeping: the rollout must split evenly into minibatches.
    self.batch_size = self.steps_num * self.num_actors * self.num_agents
    self.batch_size_envs = self.steps_num * self.num_actors
    self.minibatch_size = self.config['minibatch_size']
    self.mini_epochs_num = self.config['mini_epochs']
    self.num_minibatches = self.batch_size // self.minibatch_size
    assert(self.batch_size % self.minibatch_size == 0)

    self.last_lr = self.config['learning_rate']
    self.frame = 0
    self.update_time = 0
    self.last_mean_rewards = -100500
    self.play_time = 0
    self.epoch_num = 0
    self.entropy_coef = self.config['entropy_coef']
    self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))

    if self.normalize_value:
        self.value_mean_std = RunningMeanStd((1,)).to(self.ppo_device)

    self.is_tensor_obses = False
    self.last_rnn_indices = None
    self.last_state_indices = None

    # self_play
    if self.has_self_play_config:
        print('Initializing SelfPlay Manager')
        self.self_play_manager = SelfPlayManager(self.self_play_config, self.writer)

    # features
    self.algo_observer = config['features']['observer']
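# Illustrative sketch (not part of the original source): how the batch bookkeeping
# above resolves for an assumed config of 8 actors, 1 agent per env, 16 rollout
# steps and a minibatch size of 64; all numbers are made-up examples.
#
#     steps_num, num_actors, num_agents, minibatch_size = 16, 8, 1, 64
#     batch_size = steps_num * num_actors * num_agents   # 128 transitions per epoch
#     num_minibatches = batch_size // minibatch_size     # 2 optimizer minibatches
#     assert batch_size % minibatch_size == 0            # mirrors the assert above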
def __init__(self, sess, base_name, observation_space, action_space, config):
    observation_shape = observation_space.shape
    self.use_action_masks = config.get('use_action_masks', False)
    self.is_train = config.get('is_train', True)
    self.self_play = config.get('self_play', False)
    self.name = base_name
    self.config = config
    self.env_name = config['env_name']
    self.ppo = config['ppo']
    self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
    self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
    self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
    self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)
    self.epoch_num = tf.Variable(tf.constant(0, shape=(), dtype=tf.float32), trainable=False)
    self.e_clip = config['e_clip']
    self.clip_value = config['clip_value']
    self.network = config['network']
    self.rewards_shaper = config['reward_shaper']
    self.num_actors = config['num_actors']
    self.env_config = self.config.get('env_config', {})
    self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors, **self.env_config)
    self.num_agents = self.vec_env.get_number_of_agents()
    self.steps_num = config['steps_num']
    self.seq_len = self.config['seq_length']
    self.normalize_advantage = config['normalize_advantage']
    self.normalize_input = self.config['normalize_input']
    self.state_shape = observation_shape
    self.critic_coef = config['critic_coef']
    self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))
    self.sess = sess
    self.grad_norm = config['grad_norm']
    self.gamma = self.config['gamma']
    self.tau = self.config['tau']
    self.ignore_dead_batches = self.config.get('ignore_dead_batches', False)

    self.dones = np.asarray([False] * self.num_actors * self.num_agents, dtype=np.bool)
    self.current_rewards = np.asarray([0] * self.num_actors * self.num_agents, dtype=np.float32)
    self.current_lengths = np.asarray([0] * self.num_actors * self.num_agents, dtype=np.float32)
    self.games_to_track = self.config.get('games_to_track', 100)
    self.game_rewards = deque([], maxlen=self.games_to_track)
    self.game_lengths = deque([], maxlen=self.games_to_track)
    self.game_scores = deque([], maxlen=self.games_to_track)

    # Placeholders for observations, discrete actions and PPO statistics.
    self.obs_ph = tf.placeholder(observation_space.dtype, (None, ) + observation_shape, name='obs')
    self.target_obs_ph = tf.placeholder(observation_space.dtype, (None, ) + observation_shape, name='target_obs')
    self.actions_num = action_space.n
    self.actions_ph = tf.placeholder('int32', (None, ), name='actions')
    if self.use_action_masks:
        self.action_mask_ph = tf.placeholder('int32', (None, self.actions_num), name='actions_mask')
    else:
        self.action_mask_ph = None

    self.old_logp_actions_ph = tf.placeholder('float32', (None, ), name='old_logpactions')
    self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
    self.old_values_ph = tf.placeholder('float32', (None, ), name='old_values')
    self.advantages_ph = tf.placeholder('float32', (None, ), name='advantages')
    self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')
    self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
    self.current_lr = self.learning_rate_ph

    self.input_obs = self.obs_ph
    self.input_target_obs = self.target_obs_ph
    if observation_space.dtype == np.uint8:
        self.input_obs = tf.to_float(self.input_obs) / 255.0
        self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0

    # Learning-rate schedule.
    if self.is_adaptive_lr:
        self.lr_threshold = config['lr_threshold']
    if self.is_polynom_decay_lr:
        self.lr_multiplier = tf.train.polynomial_decay(
            1.0, self.epoch_num, config['max_epochs'],
            end_learning_rate=0.001,
            power=tr_helpers.get_or_default(config, 'decay_power', 1.0))
    if self.is_exp_decay_lr:
        self.lr_multiplier = tf.train.exponential_decay(
            1.0, self.epoch_num, config['max_epochs'],
            decay_rate=config['decay_rate'])

    if self.normalize_input:
        self.moving_mean_std = MovingMeanStd(shape=observation_space.shape, epsilon=1e-5, decay=0.99)
        self.input_obs = self.moving_mean_std.normalize(self.input_obs, train=True)
        self.input_target_obs = self.moving_mean_std.normalize(self.input_target_obs, train=False)

    games_num = self.config['minibatch_size'] // self.seq_len  # used only by the current rnn implementation

    self.train_dict = {
        'name': 'agent',
        'inputs': self.input_obs,
        'batch_num': self.config['minibatch_size'],
        'games_num': games_num,
        'actions_num': self.actions_num,
        'prev_actions_ph': self.actions_ph,
        'action_mask_ph': None
    }

    self.run_dict = {
        'name': 'agent',
        'inputs': self.input_target_obs,
        'batch_num': self.num_actors * self.num_agents,
        'games_num': self.num_actors * self.num_agents,
        'actions_num': self.actions_num,
        'prev_actions_ph': None,
        'action_mask_ph': self.action_mask_ph
    }

    self.states = None
    if self.network.is_rnn():
        self.logp_actions, self.state_values, self.action, self.entropy, self.states_ph, self.masks_ph, \
            self.lstm_state, self.initial_state = self.network(self.train_dict, reuse=False)
        self.target_neglogp, self.target_state_values, self.target_action, _, self.target_states_ph, \
            self.target_masks_ph, self.target_lstm_state, self.target_initial_state, self.logits = self.network(self.run_dict, reuse=True)
        self.states = self.target_initial_state
    else:
        self.logp_actions, self.state_values, self.action, self.entropy = self.network(self.train_dict, reuse=False)
        self.target_neglogp, self.target_state_values, self.target_action, _, self.logits = self.network(self.run_dict, reuse=True)

    self.saver = tf.train.Saver()
    self.variables = TensorFlowVariables([self.target_action, self.target_state_values, self.target_neglogp], self.sess)

    if self.is_train:
        self.setup_losses()

    self.sess.run(tf.global_variables_initializer())
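# Illustrative sketch (not part of the original source): a minimal config dict
# covering the keys this constructor reads. All values are assumptions for the
# sake of the example; 'network' and 'reward_shaper' are objects supplied by the
# surrounding framework, and the env name is a placeholder for any registered
# discrete-action environment.
#
#     config = {
#         'env_name': 'CartPole-v1',          # placeholder env id
#         'num_actors': 8,
#         'ppo': True,
#         'lr_schedule': 'adaptive',
#         'lr_threshold': 0.008,
#         'e_clip': 0.2,
#         'clip_value': True,
#         'steps_num': 16,
#         'seq_length': 4,
#         'minibatch_size': 64,
#         'normalize_advantage': True,
#         'normalize_input': False,
#         'critic_coef': 1.0,
#         'grad_norm': 0.5,
#         'gamma': 0.99,
#         'tau': 0.95,
#         'name': 'cartpole_ppo',
#         'network': network,                 # network builder object
#         'reward_shaper': reward_shaper,     # reward shaping callable
#     }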
def base_init(self, base_name, config):
    self.config = config
    self.env_config = config.get('env_config', {})
    self.num_actors = config.get('num_actors', 1)
    self.env_name = config['env_name']
    print("Env name:", self.env_name)

    self.env_info = config.get('env_info')
    if self.env_info is None:
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors, **self.env_config)
        self.env_info = self.vec_env.get_env_info()

    self.sac_device = config.get('device', 'cuda:0')
    print('Env info:')
    print(self.env_info)

    self.rewards_shaper = config['reward_shaper']
    self.observation_space = self.env_info['observation_space']
    self.weight_decay = config.get('weight_decay', 0.0)
    self.use_action_masks = config.get('use_action_masks', False)
    self.is_train = config.get('is_train', True)

    self.central_value_config = self.config.get('central_value_config', None)
    self.has_central_value = self.central_value_config is not None
    if self.has_central_value:
        self.state_space = self.env_info.get('state_space', None)
        self.state_shape = None
        if self.state_space.shape != None:
            self.state_shape = self.state_space.shape

    self.self_play_config = self.config.get('self_play_config', None)
    self.has_self_play_config = self.self_play_config is not None

    self.self_play = config.get('self_play', False)
    self.save_freq = config.get('save_frequency', 0)
    self.save_best_after = config.get('save_best_after', 500)
    self.print_stats = config.get('print_stats', True)
    self.rnn_states = None
    self.name = base_name

    self.max_epochs = self.config.get('max_epochs', 1e6)

    self.network = config['network']
    self.rewards_shaper = config['reward_shaper']
    self.num_agents = self.env_info.get('agents', 1)
    self.obs_shape = self.observation_space.shape

    self.games_to_track = self.config.get('games_to_track', 100)
    self.game_rewards = torch_ext.AverageMeter(1, self.games_to_track).to(self.sac_device)
    self.game_lengths = torch_ext.AverageMeter(1, self.games_to_track).to(self.sac_device)
    self.obs = None

    self.frame = 0
    self.update_time = 0
    self.last_mean_rewards = -100500
    self.play_time = 0
    self.epoch_num = 0

    # self.writer = SummaryWriter('ant_runs/' + config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))
    self.writer = SummaryWriter('tested_new/' + 'shadowhand')
    # self.writer = SummaryWriter('walker/' + 'fixed_buffer')
    print("Run Directory:", config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))

    self.is_tensor_obses = None

    self.curiosity_config = self.config.get('rnd_config', None)
    self.has_curiosity = self.curiosity_config is not None
    if self.has_curiosity:
        self.curiosity_gamma = self.curiosity_config['gamma']
        self.curiosity_lr = self.curiosity_config['lr']
        self.curiosity_rewards = deque([], maxlen=self.games_to_track)
        self.curiosity_mins = deque([], maxlen=self.games_to_track)
        self.curiosity_maxs = deque([], maxlen=self.games_to_track)
        self.rnd_adv_coef = self.curiosity_config.get('adv_coef', 1.0)

    self.is_rnn = False
    self.last_rnn_indices = None
    self.last_state_indices = None
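# Illustrative sketch (not part of the original source): the optional 'rnd_config'
# block consumed above when curiosity (RND) is enabled. Only the keys read by the
# code are shown; the values are assumptions, not library defaults.
#
#     config['rnd_config'] = {
#         'gamma': 0.99,       # discount applied to curiosity returns
#         'lr': 5e-4,          # learning rate for the curiosity module
#         'adv_coef': 1.0,     # weight of the curiosity advantage term
#     }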