예제 #1
0
    def __init__(self, config, scope, define_network=None):
        super(PGModel, self).__init__(config, scope)
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.episode = 0
        self.input_feed = None

        self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')
        self.policy = None

        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.hidden_layers = NeuralNetwork(define_network, [self.state], scope=scope + 'value_function')

        self.saver = tf.train.Saver()
        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count], name='prev_actions')

        # From an API perspective, continuous vs discrete might be easier than
        # requiring to set the concrete policy, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(self.hidden_layers, self.session, self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])

            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.hidden_layers, self.session, self.state, self.random,
                                                  self.action_count, 'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        # TODO configurable value functions
        self.baseline_value_function = MLPValueFunction(self.session, 100, 64)
    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")

                self.global_network = NeuralNetwork(self.define_network,
                                                    [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()
    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to allow
        all models to be executed asynchronously/distributed seamlessly.

        """
        # TODO rewrite agent logic so core update logic can be composed into
        # TODO distributed logic

        with tf.device(self.worker_device):
            with tf.variable_scope("local"):
                self.state = tf.placeholder(tf.float32,
                                            self.batch_shape +
                                            list(self.config.state_shape),
                                            name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                self.local_network = NeuralNetwork(self.define_network,
                                                   [self.state])
                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(self.local_network,
                                                 self.session, self.state,
                                                 self.random,
                                                 self.action_count,
                                                 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)

                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Probability distribution used in the current policy
                self.baseline_value_function = LinearValueFunction()

            self.actions = tf.placeholder(tf.float32,
                                          [None, self.action_count],
                                          name='actions')
            self.advantage = tf.placeholder(tf.float32,
                                            shape=[None, 1],
                                            name='advantage')

            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(
                self.policy.get_policy_variables(), self.actions)

            # Concise: Get log likelihood of actions, weigh by advantages, compute gradient on that
            self.loss = -tf.reduce_mean(
                self.log_probabilities * self.advantage, name="loss_op")

            self.gradients = tf.gradients(self.loss,
                                          self.local_network.get_variables())

            grad_var_list = list(
                zip(self.gradients, self.global_network.get_variables()))

            global_step_inc = self.global_step.assign_add(
                tf.shape(self.state)[0])

            self.assign_global_to_local = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(self.local_network.get_variables(),
                                  self.global_network.get_variables())
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)
            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.alpha)

            else:
                optimizer_cls = get_function(self.optimizer)
                self.optimizer = optimizer_cls(self.alpha,
                                               *self.optimizer_args,
                                               **self.optimizer_kwargs)

            self.optimize_op = tf.group(
                self.optimizer.apply_gradients(grad_var_list), global_step_inc)
예제 #4
0
    def __init__(self, config, scope, network_builder=None):
        super(PGModel, self).__init__(config, scope)

        self.continuous = self.config.continuous
        self.batch_size = self.config.batch_size
        self.max_episode_length = min(self.config.max_episode_length,
                                      self.batch_size)
        self.action_count = self.config.actions

        # advantage estimation
        self.gamma = self.config.gamma
        self.generalized_advantage_estimation = self.config.gae
        self.gae_lambda = self.config.gae_lambda
        self.normalize_advantage = self.config.normalize_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32,
                                    (None, None) + self.state_shape,
                                    name="state")
        self.actions = tf.placeholder(tf.float32,
                                      (None, None, self.action_count),
                                      name='actions')
        self.prev_action_means = tf.placeholder(
            tf.float32, (None, None, self.action_count), name='prev_actions')
        self.advantage = tf.placeholder(tf.float32,
                                        shape=(None, None, 1),
                                        name='advantage')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(
                self.config.network_layers)
        if self.config.tf_scope is None:
            scope = ''
        else:
            scope = self.config.tf_scope + '-'
        self.network = NeuralNetwork(network_builder,
                                     inputs=[self.state],
                                     episode_length=self.episode_length,
                                     scope=scope + 'value_function')
        self.internal_states = self.network.internal_state_inits

        # From an API perspective, continuous vs discrete might be easier than
        # requiring to set the concrete policy, at least currently
        if self.continuous:
            self.policy = GaussianPolicy(self.network, self.session,
                                         self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(
                tf.float32, (None, None, self.action_count))
            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.network, self.session,
                                                  self.state, self.random,
                                                  self.action_count,
                                                  'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        size = 1
        for dims in self.state_shape:
            size *= dims
        self.baseline_value_function = LinearValueFunction()