Example #1

    def __init__(self, config, scope, define_network=None):
        super(PGModel, self).__init__(config, scope)
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.episode = 0
        self.input_feed = None

        self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')
        self.policy = None

        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.hidden_layers = NeuralNetwork(define_network, [self.state], scope=scope + 'value_function')

        self.saver = tf.train.Saver()
        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count], name='prev_actions')

        # From an API perspective, a continuous vs. discrete flag might be easier
        # than requiring the concrete policy to be set, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(self.hidden_layers, self.session, self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])

            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.hidden_layers, self.session, self.state, self.random,
                                                  self.action_count, 'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        # TODO configurable value functions
        self.baseline_value_function = MLPValueFunction(self.session, 100, 64)
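
Note: the placeholders built above (actions, advantage, and the policy distribution) exist to form the
standard likelihood-ratio policy gradient. As a reminder of the textbook estimator these examples implement
(standard result, not code from the project),

    \nabla_\theta J(\theta) \approx \frac{1}{N} \sum_{i=1}^{N} \nabla_\theta \log \pi_\theta(a_i \mid s_i) \, \hat{A}_i

which the later examples realise as loss = -tf.reduce_mean(log_probabilities * advantage).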

Example #2

    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")

                self.global_network = NeuralNetwork(self.define_network,
                                                    [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()

Example #3

class DistributedPGModel(object):
    default_config = {}

    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")

                self.global_network = NeuralNetwork(self.define_network,
                                                    [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()

    def set_session(self, session):
        self.session = session

        # The session was still 'None' when the policy was initialised,
        # so it needs to be set again here
        self.policy.session = session

    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to allow
        all models to be executed asynchronously/distributed seamlessly.

        """
        # TODO rewrite the agent logic so the core update logic can be
        # composed into the distributed logic

        with tf.device(self.worker_device):
            with tf.variable_scope("local"):
                self.state = tf.placeholder(tf.float32,
                                            self.batch_shape +
                                            list(self.config.state_shape),
                                            name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                self.local_network = NeuralNetwork(self.define_network,
                                                   [self.state])
                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(self.local_network,
                                                 self.session, self.state,
                                                 self.random,
                                                 self.action_count,
                                                 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)

                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Probability distribution used in the current policy
                self.baseline_value_function = LinearValueFunction()

            self.actions = tf.placeholder(tf.float32,
                                          [None, self.action_count],
                                          name='actions')
            self.advantage = tf.placeholder(tf.float32,
                                            shape=[None, 1],
                                            name='advantage')

            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(
                self.policy.get_policy_variables(), self.actions)

            # In short: take the log-likelihood of the actions, weight it by the advantages,
            # and compute the gradient of that
            self.loss = -tf.reduce_mean(
                self.log_probabilities * self.advantage, name="loss_op")

            self.gradients = tf.gradients(self.loss,
                                          self.local_network.get_variables())

            grad_var_list = list(
                zip(self.gradients, self.global_network.get_variables()))

            global_step_inc = self.global_step.assign_add(
                tf.shape(self.state)[0])

            self.assign_global_to_local = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(self.local_network.get_variables(),
                                  self.global_network.get_variables())
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)
            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.alpha)

            else:
                optimizer_cls = get_function(self.optimizer)
                self.optimizer = optimizer_cls(self.alpha,
                                               *self.optimizer_args,
                                               **self.optimizer_kwargs)

            self.optimize_op = tf.group(
                self.optimizer.apply_gradients(grad_var_list), global_step_inc)

    def get_action(self, state, episode=1):
        return self.policy.sample(state)

    def update(self, batch):
        """
        Get global parameters, compute update, then send results to parameter server.
        :param batch:
        :return:
        """

        self.compute_gae_advantage(batch, self.gamma, self.gae_lambda, self.use_gae)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        # Merge episode inputs into single arrays
        _, _, actions, batch_advantage, states = self.merge_episodes(batch)

        self.session.run(
            [self.optimize_op, self.global_step], {
                self.state: states,
                self.actions: actions,
                self.advantage: batch_advantage
            })

    def get_global_step(self):
        """
        Returns global step to coordinator.
        :return:
        """
        return self.session.run(self.global_step)

    def sync_global_to_local(self):
        """
        Copy shared global weights to local network.

        """
        self.session.run(self.assign_global_to_local)

    def load_model(self, path):
        self.saver.restore(self.session, path)

    def save_model(self, path):
        self.saver.save(self.session, path)

    # TODO remove this duplication, move to util or let distributed agent
    # have a pg agent as a field
    def merge_episodes(self, batch):
        """
        Merge episodes of a batch into single input variables.

        :param batch:
        :return:
        """
        if self.continuous:
            action_log_stds = np.concatenate(
                [path['action_log_stds'] for path in batch])
            action_log_stds = np.expand_dims(action_log_stds, axis=1)
        else:
            action_log_stds = None

        action_means = np.concatenate([path['action_means'] for path in batch])
        actions = np.concatenate([path['actions'] for path in batch])
        batch_advantage = np.concatenate([path["advantage"] for path in batch])

        if self.normalize_advantage:
            batch_advantage = zero_mean_unit_variance(batch_advantage)

        batch_advantage = np.expand_dims(batch_advantage, axis=1)
        states = np.concatenate([path['states'] for path in batch])

        return action_log_stds, action_means, actions, batch_advantage, states

    # TODO duplicate code -> refactor from pg model
    def compute_gae_advantage(self, batch, gamma, gae_lambda, use_gae=False):
        """
        Expects a batch containing at least one episode, sets advantages according to use_gae.

        :param batch: Sequence of observations for at least one episode.
        """
        for episode in batch:
            baseline = self.baseline_value_function.predict(episode)
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            episode['returns'] = discount(episode['rewards'], gamma)

            if use_gae:
                deltas = episode['rewards'] + gamma * adjusted_baseline[
                    1:] - adjusted_baseline[:-1]
                episode['advantage'] = discount(deltas, gamma * gae_lambda)
            else:
                episode['advantage'] = episode['returns'] - baseline
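
Note: compute_gae_advantage above follows the usual generalised advantage estimation recurrence. With V the
baseline prediction, r_t the rewards, \gamma the discount and \lambda the GAE decay (standard definitions,
not project-specific), the code computes

    \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
    \hat{A}_t = \sum_{l \ge 0} (\gamma \lambda)^l \, \delta_{t+l}

via discount(deltas, gamma * gae_lambda); with use_gae disabled it falls back to \hat{A}_t = R_t - V(s_t),
where R_t = \sum_{k \ge 0} \gamma^k r_{t+k} are the discounted returns.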

Example #4

    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to allow
        all models to be executed asynchronously/distributed seamlessly.

        """
        # TODO rewrite the agent logic so the core update logic can be
        # composed into the distributed logic

        with tf.device(self.worker_device):
            with tf.variable_scope("local"):
                self.state = tf.placeholder(tf.float32,
                                            self.batch_shape +
                                            list(self.config.state_shape),
                                            name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                self.local_network = NeuralNetwork(self.define_network,
                                                   [self.state])
                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(self.local_network,
                                                 self.session, self.state,
                                                 self.random,
                                                 self.action_count,
                                                 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)

                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Probability distribution used in the current policy
                self.baseline_value_function = LinearValueFunction()

            self.actions = tf.placeholder(tf.float32,
                                          [None, self.action_count],
                                          name='actions')
            self.advantage = tf.placeholder(tf.float32,
                                            shape=[None, 1],
                                            name='advantage')

            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(
                self.policy.get_policy_variables(), self.actions)

            # In short: take the log-likelihood of the actions, weight it by the advantages,
            # and compute the gradient of that
            self.loss = -tf.reduce_mean(
                self.log_probabilities * self.advantage, name="loss_op")

            self.gradients = tf.gradients(self.loss,
                                          self.local_network.get_variables())

            grad_var_list = list(
                zip(self.gradients, self.global_network.get_variables()))

            global_step_inc = self.global_step.assign_add(
                tf.shape(self.state)[0])

            self.assign_global_to_local = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(self.local_network.get_variables(),
                                  self.global_network.get_variables())
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)
            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.alpha)

            else:
                optimizer_cls = get_function(self.optimizer)
                self.optimizer = optimizer_cls(self.alpha,
                                               *self.optimizer_args,
                                               **self.optimizer_kwargs)

            self.optimize_op = tf.group(
                self.optimizer.apply_gradients(grad_var_list), global_step_inc)
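
Note: the gradient wiring above is essentially the A3C-style local/global split. A stripped-down sketch of
just that pattern, with hypothetical two-variable "networks" standing in for the real ones (my sketch, not
part of the original example):

    import tensorflow as tf

    # Stand-ins for the local copy and the shared (global) parameters.
    global_w = tf.get_variable('global_w', shape=[4], initializer=tf.zeros_initializer())
    local_w = tf.get_variable('local_w', shape=[4], initializer=tf.zeros_initializer())

    # Stand-in for the policy loss computed from the local copy.
    loss = tf.reduce_sum(tf.square(local_w - 1.0))

    # Gradients are taken with respect to the *local* variables ...
    grads = tf.gradients(loss, [local_w])

    # ... but applied to the *global* (shared) variables.
    optimizer = tf.train.AdamOptimizer(0.001)
    apply_op = optimizer.apply_gradients(list(zip(grads, [global_w])))

    # Workers periodically copy the shared weights back into their local copy.
    sync_op = local_w.assign(global_w)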

Example #5

class DistributedPGModel(object):
    default_config = {}

    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.generalized_advantage_estimation = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage
        self.episode_length = tf.placeholder(tf.int32, (None, ),
                                             name='episode_length')

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.learning_rate = config.get('learning_rate', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)
        self.state_shape = tuple(self.config.state_shape)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(tf.float32, (None, None) +
                                                   self.state_shape,
                                                   name="global_state")

                self.global_network = NeuralNetwork(
                    self.define_network, [self.global_state],
                    episode_length=self.episode_length)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)
                self.global_states = self.global_network.internal_state_inits

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, (None, None, self.action_count),
                    name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, (None, None, self.action_count))

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()

    def set_session(self, session):
        self.session = session

        # The session was still 'None' when the policy was initialised,
        # so it needs to be set again here
        self.policy.session = session

    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to allow
        all models to be executed asynchronously/distributed seamlessly.

        """
        # TODO rewrite the agent logic so the core update logic can be
        # composed into the distributed logic

        with tf.device(self.worker_device):
            with tf.variable_scope("local"):
                self.state = tf.placeholder(tf.float32,
                                            (None, None) + self.state_shape,
                                            name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, (None, None, self.action_count),
                    name='prev_actions')

                self.local_network = NeuralNetwork(
                    self.define_network, [self.state],
                    episode_length=self.episode_length)
                self.local_states = self.local_network.internal_state_inits

                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(self.local_network,
                                                 self.session, self.state,
                                                 self.random,
                                                 self.action_count,
                                                 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, (None, None, self.action_count))

                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)

                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Probability distribution used in the current policy
                self.baseline_value_function = LinearValueFunction()

            self.actions = tf.placeholder(tf.float32,
                                          (None, None, self.action_count),
                                          name='actions')
            self.advantage = tf.placeholder(tf.float32,
                                            shape=(None, None, 1),
                                            name='advantage')

            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(
                self.policy.get_policy_variables(), self.actions)

            # In short: take the log-likelihood of the actions, weight it by the advantages,
            # and compute the gradient of that
            self.loss = -tf.reduce_mean(
                self.log_probabilities * self.advantage, name="loss_op")

            self.gradients = tf.gradients(self.loss,
                                          self.local_network.variables)

            grad_var_list = list(
                zip(self.gradients, self.global_network.variables))

            global_step_inc = self.global_step.assign_add(
                tf.shape(self.state)[0])

            self.assign_global_to_local = tf.group(*[
                v1.assign(v2) for v1, v2 in zip(self.local_network.variables,
                                                self.global_network.variables)
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)
            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)

            else:
                optimizer_cls = get_function(self.optimizer)
                self.optimizer = optimizer_cls(self.learning_rate,
                                               *self.optimizer_args,
                                               **self.optimizer_kwargs)

            self.optimize_op = tf.group(
                self.optimizer.apply_gradients(grad_var_list), global_step_inc)

    def get_action(self, state, episode=1):
        return self.policy.sample(state)

    def update(self, batch):
        """
        Get global parameters, compute update, then send results to parameter server.
        :param batch:
        :return:
        """

        for episode in batch:
            episode['returns'] = discount(episode['rewards'], self.gamma)
            episode['advantages'] = self.generalised_advantage_estimation(
                episode)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        fetches = [self.loss, self.optimize_op, self.global_step]
        fetches.extend(self.local_network.internal_state_outputs)

        self.logger.debug('Batch size = ' + str(len(batch)))
        self.logger.debug('First episode length = ' + str(batch[0]['episode_length']))
        # Merge episode inputs into single arrays
        feed_dict = {
            self.episode_length:
            [episode['episode_length'] for episode in batch],
            self.state: [episode['states'] for episode in batch],
            self.actions: [episode['actions'] for episode in batch],
            self.advantage: [episode['advantages'] for episode in batch]
        }
        for n, internal_state in enumerate(
                self.local_network.internal_state_inputs):
            feed_dict[internal_state] = self.local_states[n]

        fetched = self.session.run(fetches, feed_dict)
        loss = fetched[0]
        self.local_states = fetched[3:]

        self.logger.debug('Distributed model loss = ' + str(loss))

    def get_global_step(self):
        """
        Returns global step to coordinator.
        :return:
        """
        return self.session.run(self.global_step)

    def sync_global_to_local(self):
        """
        Copy shared global weights to local network.

        """
        self.session.run(self.assign_global_to_local)

    def load_model(self, path):
        self.saver.restore(self.session, path)

    def save_model(self, path):
        self.saver.save(self.session, path)

    # TODO duplicate code -> refactor from pg model
    def generalised_advantage_estimation(self, episode):
        """
         Expects an episode, returns advantages according to config.
        """
        baseline = self.baseline_value_function.predict(episode)

        if self.generalized_advantage_estimation:
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])
            deltas = episode['rewards'] + self.gamma * adjusted_baseline[
                1:] - adjusted_baseline[:-1]
            advantage = discount(deltas, self.gamma * self.gae_lambda)
        else:
            advantage = episode['returns'] - baseline

        if self.normalize_advantage:
            return zero_mean_unit_variance(advantage)
        else:
            return advantage

    # TODO remove this duplicate
    def zero_episode(self):
        """
        Creates a new episode dict.

        :return: 
        """
        zero_episode = {
            'episode_length': 0,
            'terminated': False,
            'states': np.zeros(shape=((self.batch_size, ) + self.state_shape)),
            'actions': np.zeros(shape=(self.batch_size, self.action_count)),
            'action_means': np.zeros(shape=(self.batch_size, self.action_count)),
            'rewards': np.zeros(shape=(self.batch_size, 1))
        }

        if self.continuous:
            zero_episode['action_log_stds'] = np.zeros(
                shape=(self.batch_size, self.action_count))

        return zero_episode
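
Note: a minimal sketch of how a worker process might drive this model; the cluster addresses, config
contents, max_steps and collect_episodes below are placeholders of mine, not part of the examples:

    import tensorflow as tf

    cluster_spec = tf.train.ClusterSpec({'ps': ['localhost:2222'],
                                         'worker': ['localhost:2223']})
    server = tf.train.Server(cluster_spec, job_name='worker', task_index=0)

    model = DistributedPGModel(config, scope='pg', task_index=0, cluster_spec=cluster_spec)

    with tf.Session(server.target) as session:
        session.run(tf.global_variables_initializer())
        model.set_session(session)

        while model.get_global_step() < max_steps:
            model.sync_global_to_local()     # pull shared weights into the local network
            batch = collect_episodes(model)  # hypothetical rollout helper producing episode dicts
            model.update(batch)              # compute gradients locally, apply them to the global network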

Example #6

class PGModel(Model):
    def __init__(self, config, scope, define_network=None):
        super(PGModel, self).__init__(config, scope)
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.episode = 0
        self.input_feed = None

        self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')
        self.policy = None

        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.hidden_layers = NeuralNetwork(define_network, [self.state], scope=scope + 'value_function')

        self.saver = tf.train.Saver()
        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count], name='prev_actions')

        # From an API perspective, a continuous vs. discrete flag might be easier
        # than requiring the concrete policy to be set, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(self.hidden_layers, self.session, self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])

            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.hidden_layers, self.session, self.state, self.random,
                                                  self.action_count, 'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        # TODO configurable value functions
        self.baseline_value_function = MLPValueFunction(self.session, 100, 64)

    def get_action(self, state, episode=1):
        """
        Actions are directly sampled from the policy.

        :param state: Current state
        :param episode: Current episode number (unused here)
        :return: Sampled action
        """
        return self.policy.sample(state)

    def update(self, batch):
        """
        Update needs to be implemented by specific PG algorithm.

        :param batch: Batch of experiences
        :return:
        """
        raise NotImplementedError

    def merge_episodes(self, batch):
        """
        Merge episodes of a batch into single input variables.

        :param batch:
        :return:
        """
        if self.continuous:
            action_log_stds = np.concatenate([path['action_log_stds'] for path in batch])
            action_log_stds = np.expand_dims(action_log_stds, axis=1)
        else:
            action_log_stds = None

        action_means = np.concatenate([path['action_means'] for path in batch])
        actions = np.concatenate([path['actions'] for path in batch])
        batch_advantage = np.concatenate([path["advantage"] for path in batch])

        if self.normalize_advantage:
            batch_advantage = zero_mean_unit_variance(batch_advantage)

        batch_advantage = np.expand_dims(batch_advantage, axis=1)
        states = np.concatenate([path['states'] for path in batch])

        return action_log_stds, action_means, actions, batch_advantage, states

    def compute_gae_advantage(self, batch, gamma, gae_lambda, use_gae=False):
        """
         Expects a batch containing at least one episode, sets advantages according to use_gae.

        :param batch: Sequence of observations for at least one episode.
        :param batch:
        :param gamma:
        :param gae_lambda:
        :param use_gae:
        :return:
        """

        for episode in batch:
            baseline = self.baseline_value_function.predict(episode)

            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            episode['returns'] = discount(episode['rewards'], gamma)

            if use_gae:
                deltas = episode['rewards'] + gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
                episode['advantage'] = discount(deltas, gamma * gae_lambda)
            else:
                episode['advantage'] = episode['returns'] - baseline
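
Note: merge_episodes above optionally standardises the concatenated advantages via zero_mean_unit_variance,
which is not shown in these examples. A helper with that name presumably does something like the following
(the epsilon is my addition, to guard against division by zero):

    import numpy as np

    def zero_mean_unit_variance(x, epsilon=1e-8):
        # Standardise: subtract the mean and divide by the standard deviation.
        x = np.asarray(x, dtype=np.float64)
        return (x - x.mean()) / (x.std() + epsilon)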

Example #7

    def __init__(self, config, scope, network_builder=None):
        super(PGModel, self).__init__(config, scope)

        self.continuous = self.config.continuous
        self.batch_size = self.config.batch_size
        self.max_episode_length = min(self.config.max_episode_length,
                                      self.batch_size)
        self.action_count = self.config.actions

        # advantage estimation
        self.gamma = self.config.gamma
        self.generalized_advantage_estimation = self.config.gae
        self.gae_lambda = self.config.gae_lambda
        self.normalize_advantage = self.config.normalize_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32,
                                    (None, None) + self.state_shape,
                                    name="state")
        self.actions = tf.placeholder(tf.float32,
                                      (None, None, self.action_count),
                                      name='actions')
        self.prev_action_means = tf.placeholder(
            tf.float32, (None, None, self.action_count), name='prev_actions')
        self.advantage = tf.placeholder(tf.float32,
                                        shape=(None, None, 1),
                                        name='advantage')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(
                self.config.network_layers)
        if self.config.tf_scope is None:
            scope = ''
        else:
            scope = self.config.tf_scope + '-'
        self.network = NeuralNetwork(network_builder,
                                     inputs=[self.state],
                                     episode_length=self.episode_length,
                                     scope=scope + 'value_function')
        self.internal_states = self.network.internal_state_inits

        # From an API perspective, a continuous vs. discrete flag might be easier
        # than requiring the concrete policy to be set, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(self.network, self.session,
                                         self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(
                tf.float32, (None, None, self.action_count))
            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.network, self.session,
                                                  self.state, self.random,
                                                  self.action_count,
                                                  'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        size = 1
        for dims in self.state_shape:
            size *= dims
        self.baseline_value_function = LinearValueFunction()

Example #8

class PGModel(Model):
    def __init__(self, config, scope, network_builder=None):
        super(PGModel, self).__init__(config, scope)

        self.continuous = self.config.continuous
        self.batch_size = self.config.batch_size
        self.max_episode_length = min(self.config.max_episode_length,
                                      self.batch_size)
        self.action_count = self.config.actions

        # advantage estimation
        self.gamma = self.config.gamma
        self.generalized_advantage_estimation = self.config.gae
        self.gae_lambda = self.config.gae_lambda
        self.normalize_advantage = self.config.normalize_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32,
                                    (None, None) + self.state_shape,
                                    name="state")
        self.actions = tf.placeholder(tf.float32,
                                      (None, None, self.action_count),
                                      name='actions')
        self.prev_action_means = tf.placeholder(
            tf.float32, (None, None, self.action_count), name='prev_actions')
        self.advantage = tf.placeholder(tf.float32,
                                        shape=(None, None, 1),
                                        name='advantage')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(
                self.config.network_layers)
        if self.config.tf_scope is None:
            scope = ''
        else:
            scope = self.config.tf_scope + '-'
        self.network = NeuralNetwork(network_builder,
                                     inputs=[self.state],
                                     episode_length=self.episode_length,
                                     scope=scope + 'value_function')
        self.internal_states = self.network.internal_state_inits

        # From an API perspective, a continuous vs. discrete flag might be easier
        # than requiring the concrete policy to be set, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(self.network, self.session,
                                         self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(
                tf.float32, (None, None, self.action_count))
            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.network, self.session,
                                                  self.state, self.random,
                                                  self.action_count,
                                                  'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        size = 1
        for dims in self.state_shape:
            size *= dims
        self.baseline_value_function = LinearValueFunction()

    # self.saver = tf.train.Saver()

    def get_action(self, state, episode=1):
        """
        Actions are directly sampled from the policy.

        :param state: Current state
        :param episode: Current episode number (unused here)
        :return: Sampled action
        """

        return self.policy.sample(state)

    def update(self, batch):
        """
        Update needs to be implemented by specific PG algorithm.

        :param batch: Batch of experiences
        :return:
        """
        raise NotImplementedError

    def zero_episode(self):
        """
        Creates a new episode dict.
        
        :return: 
        """
        zero_episode = {
            'episode_length': 0,
            'terminated': False,
            'states': np.zeros(shape=(self.max_episode_length,) + self.state_shape),
            'actions': np.zeros(shape=(self.max_episode_length, self.action_count)),
            'action_means': np.zeros(shape=(self.max_episode_length, self.action_count)),
            'rewards': np.zeros(shape=(self.max_episode_length, 1))
        }

        if self.continuous:
            zero_episode['action_log_stds'] = np.zeros(
                shape=(self.max_episode_length, self.action_count))

        return zero_episode

    def advantage_estimation(self, episode):
        """
         Expects an episode, returns advantages according to config.
        """
        baseline = self.baseline_value_function.predict(episode)

        if self.generalized_advantage_estimation:
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])
            deltas = episode['rewards'] + self.gamma * adjusted_baseline[
                1:] - adjusted_baseline[:-1]
            advantage = discount(deltas, self.gamma * self.gae_lambda)
        else:
            advantage = episode['returns'] - baseline

        if self.normalize_advantage:
            return zero_mean_unit_variance(advantage)
        else:
            return advantage
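
Note: a hypothetical rollout loop showing how the preallocated dict from zero_episode might be filled; env,
model and the (state, reward, terminal) step interface are placeholders, not part of the example:

    episode = model.zero_episode()
    state = env.reset()

    for t in range(model.max_episode_length):
        action = model.get_action(state)
        next_state, reward, terminal = env.step(action)

        # Write the transition into the preallocated arrays.
        episode['states'][t] = state
        episode['actions'][t] = action
        episode['rewards'][t] = reward
        episode['episode_length'] = t + 1
        state = next_state

        if terminal:
            episode['terminated'] = True
            break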