class DistributedPGModel(object):

    default_config = {}

    def __init__(self, config, scope, task_index, cluster_spec, define_network=None):
        """
        A distributed agent must synchronise local and global parameters under
        different scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """
        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda
        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(tf.train.replica_device_setter(
                1, worker_device=self.worker_device, cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")
                self.global_network = NeuralNetwork(self.define_network, [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [], tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)
                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)
                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

        # self.optimizer = config.get('optimizer')
        # self.optimizer_args = config.get('optimizer_args', [])
        # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args, **kwargs)

        self.create_training_operations()

    def set_session(self, session):
        self.session = session
        # Session in policy was still 'None' when we initialised policy,
        # hence need to set again
        self.policy.session = session

    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to
        allow all models to be executed asynchronously/distributed seamlessly.
        """
        # TODO rewrite agent logic so core update logic can be composed into
        # TODO distributed logic
        with tf.device(self.worker_device):
            with tf.variable_scope("local"):
                self.state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')
                self.local_network = NeuralNetwork(self.define_network, [self.state])

                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])
                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)
                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Probability distribution used in the current policy
                self.baseline_value_function = LinearValueFunction()

                self.actions = tf.placeholder(
                    tf.float32, [None, self.action_count], name='actions')
                self.advantage = tf.placeholder(
                    tf.float32, shape=[None, 1], name='advantage')
                self.dist = self.policy.get_distribution()
                self.log_probabilities = self.dist.log_prob(
                    self.policy.get_policy_variables(), self.actions)

                # Concise: Get log likelihood of actions, weigh by advantages,
                # compute gradient on that
                self.loss = -tf.reduce_mean(
                    self.log_probabilities * self.advantage, name="loss_op")
                self.gradients = tf.gradients(self.loss, self.local_network.get_variables())

                grad_var_list = list(zip(self.gradients, self.global_network.get_variables()))
                global_step_inc = self.global_step.assign_add(tf.shape(self.state)[0])

                self.assign_global_to_local = tf.group(*[
                    v1.assign(v2) for v1, v2 in zip(
                        self.local_network.get_variables(),
                        self.global_network.get_variables())
                ])

                # TODO write summaries
                # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)

                if not self.optimizer:
                    self.optimizer = tf.train.AdamOptimizer(self.alpha)
                else:
                    optimizer_cls = get_function(self.optimizer)
                    self.optimizer = optimizer_cls(
                        self.alpha, *self.optimizer_args, **self.optimizer_kwargs)

                self.optimize_op = tf.group(
                    self.optimizer.apply_gradients(grad_var_list),
                    global_step_inc)

    def get_action(self, state, episode=1):
        return self.policy.sample(state)

    def update(self, batch):
        """
        Get global parameters, compute update, then send results to parameter server.

        :param batch:
        :return:
        """
        self.compute_gae_advantage(batch, self.gamma, self.gae_lambda)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        # Merge episode inputs into single arrays
        _, _, actions, batch_advantage, states = self.merge_episodes(batch)

        self.session.run(
            [self.optimize_op, self.global_step],
            {
                self.state: states,
                self.actions: actions,
                self.advantage: batch_advantage
            })

    def get_global_step(self):
        """
        Returns global step to coordinator.

        :return:
        """
        return self.session.run(self.global_step)

    def sync_global_to_local(self):
        """
        Copy shared global weights to local network.
        """
        self.session.run(self.assign_global_to_local)

    def load_model(self, path):
        self.saver.restore(self.session, path)

    def save_model(self, path):
        self.saver.save(self.session, path)

    # TODO remove this duplication, move to util or let distributed agent
    # have a pg agent as a field
    def merge_episodes(self, batch):
        """
        Merge episodes of a batch into single input variables.

        :param batch:
        :return:
        """
        if self.continuous:
            action_log_stds = np.concatenate([path['action_log_stds'] for path in batch])
            action_log_stds = np.expand_dims(action_log_stds, axis=1)
        else:
            action_log_stds = None

        action_means = np.concatenate([path['action_means'] for path in batch])
        actions = np.concatenate([path['actions'] for path in batch])
        batch_advantage = np.concatenate([path["advantage"] for path in batch])

        if self.normalize_advantage:
            batch_advantage = zero_mean_unit_variance(batch_advantage)

        batch_advantage = np.expand_dims(batch_advantage, axis=1)
        states = np.concatenate([path['states'] for path in batch])

        return action_log_stds, action_means, actions, batch_advantage, states

    # TODO duplicate code -> refactor from pg model
    def compute_gae_advantage(self, batch, gamma, gae_lambda, use_gae=False):
        """
        Expects a batch containing at least one episode, sets advantages according to use_gae.

        :param batch: Sequence of observations for at least one episode.
        """
        for episode in batch:
            baseline = self.baseline_value_function.predict(episode)

            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            episode['returns'] = discount(episode['rewards'], gamma)

            if use_gae:
                deltas = episode['rewards'] + gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
                episode['advantage'] = discount(deltas, gamma * gae_lambda)
            else:
                episode['advantage'] = episode['returns'] - baseline
class DistributedPGModel(object):

    default_config = {}

    def __init__(self, config, scope, task_index, cluster_spec, define_network=None):
        """
        A distributed agent must synchronise local and global parameters under
        different scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.generalized_advantage_estimation = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda
        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        self.episode_length = tf.placeholder(tf.int32, (None,), name='episode_length')

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.learning_rate = config.get('learning_rate', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)
        self.state_shape = tuple(self.config.state_shape)

        with tf.device(tf.train.replica_device_setter(
                1, worker_device=self.worker_device, cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32, (None, None) + self.state_shape, name="global_state")
                self.global_network = NeuralNetwork(
                    self.define_network,
                    [self.global_state],
                    episode_length=self.episode_length)
                self.global_step = tf.get_variable(
                    "global_step", [], tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)
                self.global_states = self.global_network.internal_state_inits
                self.global_prev_action_means = tf.placeholder(
                    tf.float32, (None, None, self.action_count), name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, (None, None, self.action_count))
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)
                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

        # self.optimizer = config.get('optimizer')
        # self.optimizer_args = config.get('optimizer_args', [])
        # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args, **kwargs)

        self.create_training_operations()

    def set_session(self, session):
        self.session = session
        # Session in policy was still 'None' when we initialised policy,
        # hence need to set again
        self.policy.session = session

    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to
        allow all models to be executed asynchronously/distributed seamlessly.
        """
        # TODO rewrite agent logic so core update logic can be composed into
        # TODO distributed logic
        with tf.device(self.worker_device):
            with tf.variable_scope("local"):
                self.state = tf.placeholder(
                    tf.float32, (None, None) + self.state_shape, name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, (None, None, self.action_count), name='prev_actions')
                self.local_network = NeuralNetwork(
                    self.define_network,
                    [self.state],
                    episode_length=self.episode_length)
                self.local_states = self.local_network.internal_state_inits

                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, (None, None, self.action_count))
                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)
                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Probability distribution used in the current policy
                self.baseline_value_function = LinearValueFunction()

                self.actions = tf.placeholder(
                    tf.float32, (None, None, self.action_count), name='actions')
                self.advantage = tf.placeholder(
                    tf.float32, shape=(None, None, 1), name='advantage')
                self.dist = self.policy.get_distribution()
                self.log_probabilities = self.dist.log_prob(
                    self.policy.get_policy_variables(), self.actions)

                # Concise: Get log likelihood of actions, weigh by advantages,
                # compute gradient on that
                self.loss = -tf.reduce_mean(
                    self.log_probabilities * self.advantage, name="loss_op")
                self.gradients = tf.gradients(self.loss, self.local_network.variables)

                grad_var_list = list(zip(self.gradients, self.global_network.variables))
                global_step_inc = self.global_step.assign_add(tf.shape(self.state)[0])

                self.assign_global_to_local = tf.group(*[
                    v1.assign(v2) for v1, v2 in zip(
                        self.local_network.variables,
                        self.global_network.variables)
                ])

                # TODO write summaries
                # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)

                if not self.optimizer:
                    self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                else:
                    optimizer_cls = get_function(self.optimizer)
                    self.optimizer = optimizer_cls(
                        self.learning_rate, *self.optimizer_args, **self.optimizer_kwargs)

                self.optimize_op = tf.group(
                    self.optimizer.apply_gradients(grad_var_list),
                    global_step_inc)

    def get_action(self, state, episode=1):
        return self.policy.sample(state)

    def update(self, batch):
        """
        Get global parameters, compute update, then send results to parameter server.

        :param batch:
        :return:
        """
        for episode in batch:
            episode['returns'] = discount(episode['rewards'], self.gamma)
            episode['advantages'] = self.generalised_advantage_estimation(episode)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        fetches = [self.loss, self.optimize_op, self.global_step]
        fetches.extend(self.local_network.internal_state_outputs)

        print(len(batch))
        print(batch[0]['episode_length'])

        # Merge episode inputs into single arrays
        feed_dict = {
            self.episode_length: [episode['episode_length'] for episode in batch],
            self.state: [episode['states'] for episode in batch],
            self.actions: [episode['actions'] for episode in batch],
            self.advantage: [episode['advantages'] for episode in batch]
        }

        for n, internal_state in enumerate(self.local_network.internal_state_inputs):
            feed_dict[internal_state] = self.local_states[n]

        fetched = self.session.run(fetches, feed_dict)

        loss = fetched[0]
        self.local_states = fetched[3:]
        self.logger.debug('Distributed model loss = ' + str(loss))

    def get_global_step(self):
        """
        Returns global step to coordinator.

        :return:
        """
        return self.session.run(self.global_step)

    def sync_global_to_local(self):
        """
        Copy shared global weights to local network.
        """
        self.session.run(self.assign_global_to_local)

    def load_model(self, path):
        self.saver.restore(self.session, path)

    def save_model(self, path):
        self.saver.save(self.session, path)

    # TODO duplicate code -> refactor from pg model
    def generalised_advantage_estimation(self, episode):
        """
        Expects an episode, returns advantages according to config.
        """
        baseline = self.baseline_value_function.predict(episode)

        if self.generalized_advantage_estimation:
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            deltas = episode['rewards'] + self.gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
            advantage = discount(deltas, self.gamma * self.gae_lambda)
        else:
            advantage = episode['returns'] - baseline

        if self.normalize_advantage:
            return zero_mean_unit_variance(advantage)
        else:
            return advantage

    # TODO remove this duplicate
    def zero_episode(self):
        """
        Creates a new episode dict.

        :return:
        """
        zero_episode = {
            'episode_length': 0,
            'terminated': False,
            'states': np.zeros(shape=((self.batch_size,) + self.state_shape)),
            'actions': np.zeros(shape=(self.batch_size, self.action_count)),
            'action_means': np.zeros(shape=(self.batch_size, self.action_count)),
            'rewards': np.zeros(shape=(self.batch_size, 1))
        }

        if self.continuous:
            zero_episode['action_log_stds'] = np.zeros(
                shape=(self.batch_size, self.action_count))

        return zero_episode
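# Illustration only (not part of the model): the assign_global_to_local op built in
# create_training_operations (and run by sync_global_to_local) is a grouped set of
# per-variable assigns. A minimal standalone TensorFlow 1.x sketch of that pattern,
# using two hypothetical demo scopes rather than the library's networks:
import tensorflow as tf

with tf.variable_scope("demo_global"):
    demo_global_vars = [tf.get_variable("w", initializer=tf.ones([2, 2]))]
with tf.variable_scope("demo_local"):
    demo_local_vars = [tf.get_variable("w", initializer=tf.zeros([2, 2]))]

# One grouped op copying every global variable into its local counterpart.
demo_sync_op = tf.group(*[
    local_var.assign(global_var)
    for local_var, global_var in zip(demo_local_vars, demo_global_vars)
])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(demo_sync_op)  # local weights now equal global weights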
class PGModel(Model):

    def __init__(self, config, scope, define_network=None):
        super(PGModel, self).__init__(config, scope)

        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda
        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(
            tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.episode = 0
        self.input_feed = None

        self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')
        self.policy = None

        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.hidden_layers = NeuralNetwork(
            define_network, [self.state], scope=scope + 'value_function')
        self.saver = tf.train.Saver()

        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.prev_action_means = tf.placeholder(
            tf.float32, [None, self.action_count], name='prev_actions')

        # From an API perspective, continuous vs discrete might be easier than
        # requiring to set the concrete policy, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(
                self.hidden_layers, self.session, self.state,
                self.random, self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])
            self.prev_dist = dict(
                policy_output=self.prev_action_means,
                policy_log_std=self.prev_action_log_stds)
        else:
            self.policy = CategoricalOneHotPolicy(
                self.hidden_layers, self.session, self.state,
                self.random, self.action_count, 'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        # TODO configurable value functions
        self.baseline_value_function = MLPValueFunction(self.session, 100, 64)

    def get_action(self, state, episode=1):
        """
        Actions are directly sampled from the policy.

        :param state:
        :param episode:
        :return:
        """
        return self.policy.sample(state)

    def update(self, batch):
        """
        Update needs to be implemented by specific PG algorithm.

        :param batch: Batch of experiences
        :return:
        """
        raise NotImplementedError

    def merge_episodes(self, batch):
        """
        Merge episodes of a batch into single input variables.

        :param batch:
        :return:
        """
        if self.continuous:
            action_log_stds = np.concatenate([path['action_log_stds'] for path in batch])
            action_log_stds = np.expand_dims(action_log_stds, axis=1)
        else:
            action_log_stds = None

        action_means = np.concatenate([path['action_means'] for path in batch])
        actions = np.concatenate([path['actions'] for path in batch])
        batch_advantage = np.concatenate([path["advantage"] for path in batch])

        if self.normalize_advantage:
            batch_advantage = zero_mean_unit_variance(batch_advantage)

        batch_advantage = np.expand_dims(batch_advantage, axis=1)
        states = np.concatenate([path['states'] for path in batch])

        return action_log_stds, action_means, actions, batch_advantage, states

    def compute_gae_advantage(self, batch, gamma, gae_lambda, use_gae=False):
        """
        Expects a batch containing at least one episode, sets advantages according to use_gae.

        :param batch: Sequence of observations for at least one episode.
        :param gamma:
        :param gae_lambda:
        :param use_gae:
        :return:
        """
        for episode in batch:
            baseline = self.baseline_value_function.predict(episode)

            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            episode['returns'] = discount(episode['rewards'], gamma)

            if use_gae:
                deltas = episode['rewards'] + gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
                episode['advantage'] = discount(deltas, gamma * gae_lambda)
            else:
                episode['advantage'] = episode['returns'] - baseline
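# Illustration only (not part of the model): merge_episodes above optionally passes
# the concatenated advantages through the library's zero_mean_unit_variance helper.
# Assuming that helper performs standard score normalisation, a minimal NumPy
# equivalent looks like this; the function name and epsilon guard are assumptions.
import numpy as np


def zero_mean_unit_variance_sketch(x, epsilon=1e-8):
    """Standardise an array to mean 0 and (approximately) standard deviation 1."""
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean()) / (x.std() + epsilon)


# Example on a merged advantage vector such as the one returned by merge_episodes:
batch_advantage = np.array([2.0, -1.0, 0.5, 3.0])
normalised_advantage = zero_mean_unit_variance_sketch(batch_advantage)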
class PGModel(Model):

    def __init__(self, config, scope, network_builder=None):
        super(PGModel, self).__init__(config, scope)

        self.continuous = self.config.continuous
        self.batch_size = self.config.batch_size
        self.max_episode_length = min(self.config.max_episode_length, self.batch_size)
        self.action_count = self.config.actions

        # advantage estimation
        self.gamma = self.config.gamma
        self.generalized_advantage_estimation = self.config.gae
        self.gae_lambda = self.config.gae_lambda
        self.normalize_advantage = self.config.normalize_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(
            tf.float32, (None, None) + self.state_shape, name="state")
        self.actions = tf.placeholder(
            tf.float32, (None, None, self.action_count), name='actions')
        self.prev_action_means = tf.placeholder(
            tf.float32, (None, None, self.action_count), name='prev_actions')
        self.advantage = tf.placeholder(tf.float32, shape=(None, None, 1), name='advantage')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(self.config.network_layers)

        if self.config.tf_scope is None:
            scope = ''
        else:
            scope = self.config.tf_scope + '-'

        self.network = NeuralNetwork(
            network_builder,
            inputs=[self.state],
            episode_length=self.episode_length,
            scope=scope + 'value_function')
        self.internal_states = self.network.internal_state_inits

        # From an API perspective, continuous vs discrete might be easier than
        # requiring to set the concrete policy, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(
                self.network, self.session, self.state,
                self.random, self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(
                tf.float32, (None, None, self.action_count))
            self.prev_dist = dict(
                policy_output=self.prev_action_means,
                policy_log_std=self.prev_action_log_stds)
        else:
            self.policy = CategoricalOneHotPolicy(
                self.network, self.session, self.state,
                self.random, self.action_count, 'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        size = 1
        for dims in self.state_shape:
            size *= dims

        self.baseline_value_function = LinearValueFunction()
        # self.saver = tf.train.Saver()

    def get_action(self, state, episode=1):
        """
        Actions are directly sampled from the policy.

        :param state:
        :param episode:
        :return:
        """
        return self.policy.sample(state)

    def update(self, batch):
        """
        Update needs to be implemented by specific PG algorithm.

        :param batch: Batch of experiences
        :return:
        """
        raise NotImplementedError

    def zero_episode(self):
        """
        Creates a new episode dict.

        :return:
        """
        zero_episode = {
            'episode_length': 0,
            'terminated': False,
            'states': np.zeros(shape=((self.max_episode_length,) + self.state_shape)),
            'actions': np.zeros(shape=(self.max_episode_length, self.action_count)),
            'action_means': np.zeros(shape=(self.max_episode_length, self.action_count)),
            'rewards': np.zeros(shape=(self.max_episode_length, 1))
        }

        if self.continuous:
            zero_episode['action_log_stds'] = np.zeros(
                shape=(self.max_episode_length, self.action_count))

        return zero_episode

    def advantage_estimation(self, episode):
        """
        Expects an episode, returns advantages according to config.
        """
        baseline = self.baseline_value_function.predict(episode)

        if self.generalized_advantage_estimation:
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            deltas = episode['rewards'] + self.gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
            advantage = discount(deltas, self.gamma * self.gae_lambda)
        else:
            advantage = episode['returns'] - baseline

        if self.normalize_advantage:
            return zero_mean_unit_variance(advantage)
        else:
            return advantage