Example #1
    def __init__(self, config, scope):
        """
        Initialize model, build network and tensorflow ops

        :param config: Config object or dict
        :param scope: tensorflow scope name
        """
        super(SimpleQModel, self).__init__(config, scope)
        self.action_count = self.config.actions

        self.random = np.random.RandomState()

        with tf.device(self.config.tf_device):
            # Create state placeholder
            # self.batch_shape is [None] (set in Model.__init__)
            self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")

            # Create neural network
            output_layer = [{"type": "linear", "num_outputs": self.action_count}]
            self.network = NeuralNetwork(self.config.network_layers + output_layer, self.state, scope=self.scope + "network")
            self.network_out = self.network.get_output()

            # Create operations
            self.create_ops()
            self.init_op = tf.global_variables_initializer()

            # Create optimizer
            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.alpha)
Example #2
    def create_net(self, state_size, layer_size):
        with tf.variable_scope("mlp_value_function"):
            self.input = tf.placeholder(
                tf.float32,
                shape=[None, self.get_features_size(state_size)],
                name="input")

            network_builder = NeuralNetwork.layered_network((
                {'type': 'dense', 'num_outputs': layer_size},
                {'type': 'dense', 'num_outputs': 1}
            ))
            network = NeuralNetwork(network_builder=network_builder,
                                    inputs=[self.input])

            # hidden_1 = dense(layer_input=self.input, {'num_outputs': input_shape}, scope='hidden_1')
            # hidden_2 = dense(hidden_1, {'num_outputs': self.layer_size}, scope='hidden_2')
            # out = dense(hidden_2, {'num_outputs': 1}, scope='out')
            self.mlp = tf.reshape(network.output, (-1, 1))

            l2 = tf.nn.l2_loss(self.mlp - self.labels)
            self.update = tf.train.AdamOptimizer().minimize(l2)

            self.session.run(tf.global_variables_initializer())
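
The create_net method above feeds self.labels, which is not defined in this excerpt; it is presumably a regression-target placeholder created elsewhere in the class, alongside self.session. A minimal sketch of that assumed setup (TF1-style, names are illustrative only):

import tensorflow as tf

# Hypothetical setup assumed by create_net above (not from the original source)
session = tf.Session()
labels = tf.placeholder(tf.float32, shape=[None, 1], name="labels")  # regression targets for the L2 loss
# A fit step would then feed both the input features and the targets, e.g.:
#   session.run(update_op, {input_placeholder: feature_batch, labels: target_batch})
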
Example #3
    def __init__(self, config, scope, network_builder=None):
        """
        Training logic for DQN.

        :param config: Configuration dict
        """
        super(DQNModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        # self.batch_size = self.config.batch_size

        self.double_dqn = self.config.double_dqn

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.target_network_update = []

        # output layer
        output_layer_config = [{"type": "linear", "num_outputs": self.config.actions, "trainable": True}]

        # Input placeholders
        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32, (None, None) + self.state_shape, name="state")
        self.next_states = tf.placeholder(tf.float32, (None, None) + self.state_shape,
                                          name="next_states")
        self.terminals = tf.placeholder(tf.float32, (None, None), name='terminals')
        self.rewards = tf.placeholder(tf.float32, (None, None), name='rewards')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(self.config.network_layers + output_layer_config)

        self.training_network = NeuralNetwork(network_builder, [self.state], episode_length=self.episode_length,
                                              scope=self.scope + 'training')
        self.target_network = NeuralNetwork(network_builder, [self.next_states], episode_length=self.episode_length,
                                            scope=self.scope + 'target')

        self.training_internal_states = self.training_network.internal_state_inits
        self.target_internal_states = self.target_network.internal_state_inits

        self.training_output = self.training_network.output
        self.target_output = self.target_network.output

        # Create training operations
        self.create_training_operations()

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs', graph=tf.get_default_graph())
        self.session.run(self.init_op)
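
This constructor only sets self.tau and the empty self.target_network_update list; the actual soft target-network update is built later in create_training_operations, which is not part of this excerpt. A self-contained sketch of that tau-weighted update, modeled on the pattern used by the NAF model in Example #11 below (an assumption, not the library's exact code):

import tensorflow as tf

tau = 0.1
# Toy variables standing in for one weight of the training and target networks
w_train = tf.Variable([1.0, 2.0], name='w_train')
w_target = tf.Variable([0.0, 0.0], name='w_target')

# Soft update: theta_target <- theta_target - tau * (theta_target - theta_train)
target_update = w_target.assign_sub(tau * (w_target - w_train))

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(target_update)
    print(session.run(w_target))  # -> [0.1, 0.2]
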
Example #4
    def __init__(self, config, scope, define_network=None):
        super(PGModel, self).__init__(config, scope)
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.episode = 0
        self.input_feed = None

        self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')
        self.policy = None

        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.hidden_layers = NeuralNetwork(define_network, [self.state], scope=scope + 'value_function')

        self.saver = tf.train.Saver()
        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count], name='prev_actions')

        # From an API perspective, choosing continuous vs discrete is currently easier than
        # requiring the concrete policy to be set explicitly
        if self.continuous:
            self.policy = GaussianPolicy(self.hidden_layers, self.session, self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])

            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.hidden_layers, self.session, self.state, self.random,
                                                  self.action_count, 'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        # TODO configurable value functions
        self.baseline_value_function = MLPValueFunction(self.session, 100, 64)
Example #5
    def __init__(self, config, scope, network_builder=None):
        super(DQFDModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        self.supervised_weight = self.config.supervised_weight
        # l = 0.8 in paper
        self.expert_margin = self.config.expert_margin

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        self.target_network_update = []

        # Output layer
        output_layer_config = [{"type": "linear", "num_outputs": self.config.actions, "trainable": True}]

        # Input placeholders
        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32, (None, None) + self.state_shape, name="state")
        self.next_states = tf.placeholder(tf.float32, (None, None) + self.state_shape,
                                          name="next_states")
        self.terminals = tf.placeholder(tf.float32, (None, None), name='terminals')
        self.rewards = tf.placeholder(tf.float32, (None, None), name='rewards')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(self.config.network_layers + output_layer_config)

        self.training_network = NeuralNetwork(network_builder, [self.state], episode_length=self.episode_length,
                                              scope=self.scope + 'training')
        self.target_network = NeuralNetwork(network_builder, [self.next_states], episode_length=self.episode_length,
                                            scope=self.scope + 'target')

        self.training_internal_states = self.training_network.internal_state_inits
        self.target_internal_states = self.target_network.internal_state_inits

        self.training_output = self.training_network.output
        self.target_output = self.target_network.output

        # Create training operations
        self.create_training_operations()

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs', graph=tf.get_default_graph())
        self.session.run(self.init_op)
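
The supervised_weight and expert_margin values read here feed DQfD's large-margin supervised loss, which is built later in create_training_operations (not shown in this excerpt). A standalone sketch of that loss term as published, J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E); this is an illustration of the formula, not the library's exact code:

import tensorflow as tf

num_actions = 2
expert_margin = 0.8  # "l" in the DQfD paper, as noted in the comment above

q_values = tf.placeholder(tf.float32, [None, num_actions], name='q_values')
expert_actions = tf.placeholder(tf.int32, [None], name='expert_actions')

# Margin function l(a_E, a): 0 for the demonstrated action, expert_margin otherwise
expert_one_hot = tf.one_hot(expert_actions, num_actions)
margin = expert_margin * (1.0 - expert_one_hot)

q_expert = tf.reduce_sum(q_values * expert_one_hot, axis=1)
supervised_loss = tf.reduce_mean(tf.reduce_max(q_values + margin, axis=1) - q_expert)
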
Example #6
    def test_vpg_agent(self):
        config = {
            'batch_size': 8,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2,),
            'actions': 2}
        tf.reset_default_graph()

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 32}])

        agent = VPGAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in range(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
            rewards[n % 100] = reward
            if sum(rewards) == 100.0:
                return
        self.assertTrue(False)
Example #7
    def test_dqn_agent(self):
        config = {
            'seed': 10,
            'batch_size': 16,
            'state_shape': (2, ),
            'actions': 2,
            'action_shape': (),
            'update_rate': 1,
            'update_repeat': 4,
            'min_replay_size': 50,
            'memory_capacity': 50,
            "exploration": "epsilon_decay",
            "exploration_param": {
                "epsilon": 1,
                "epsilon_final": 0,
                "epsilon_states": 50
            },
            'target_network_update_rate': 1.0,
            'use_target_network': True,
            "alpha": 0.0005,
            "gamma": 0.99,
            "tau": 1.0
        }

        tf.reset_default_graph()
        tf.set_random_seed(10)

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(
            layers=[{
                'type': 'dense',
                'num_outputs': 16
            }, {
                'type': 'linear',
                'num_outputs': 2
            }])
        agent = DQNAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in xrange(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = False
            agent.add_observation(state=state,
                                  action=action,
                                  reward=reward,
                                  terminal=terminal)
            rewards[n % 100] = reward

            if sum(rewards) == 100.0:
                return

        assert (sum(rewards) == 100.0)
Example #8
    def __init__(self, config, scope, define_network=None):
        """
        Training logic for NAFs.

        :param config: Configuration parameters
        """
        super(NAFModel, self).__init__(config, scope)
        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.epsilon = self.config.epsilon
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.next_states = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape),
                                          name="next_states")

        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.terminals = tf.placeholder(tf.float32, [None], name='terminals')
        self.rewards = tf.placeholder(tf.float32, [None], name='rewards')
        self.q_targets = tf.placeholder(tf.float32, [None], name='q_targets')
        self.target_network_update = []
        self.episode = 0

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.training_model = NeuralNetwork(define_network, [self.state], scope=scope + 'training')
        self.target_model = NeuralNetwork(define_network, [self.next_states], scope=scope + 'target')

        # Create output fields
        self.training_v, self.mu, self.advantage, self.q, self.training_output_vars = self.create_outputs(
            self.training_model.get_output(), 'outputs_training')
        self.target_v, _, _, _, self.target_output_vars = self.create_outputs(self.target_model.get_output(),
                                                                              'outputs_target')
        self.create_training_operations()
        self.saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())
Example #9
    def test_trpo_agent(self):

        config = {
            'batch_size': 16,
            "override_line_search": False,
            "cg_iterations": 20,
            "use_gae": False,
            "normalize_advantage": False,
            "gae_lambda": 0.97,
            "cg_damping": 0.001,
            "line_search_steps": 20,
            'max_kl_divergence': 0.05,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2,),
            'actions': 2,
            'gamma': 0.99
        }

        config = create_config(config)
        tf.reset_default_graph()

        network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense',
                                                                 'num_outputs': 8}])
        agent = TRPOAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100

        for n in xrange(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
            rewards[n % 100] = reward

            if sum(rewards) == 100.0:
                print('Steps until passed = {:d}'.format(n))

                return
        print('sum = {:f}'.format(sum(rewards)))
Example #10
    def test_trpo_agent(self):
        config = {
            'batch_size': 8,
            "cg_iterations": 20,
            "cg_damping": 0.001,
            "line_search_steps": 20,
            'max_kl_divergence': 0.01,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2, ),
            'actions': 2
        }
        tf.reset_default_graph()

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(
            layers=[{
                'type': 'dense',
                'num_outputs': 32
            }])
        agent = TRPOAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in range(100):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state,
                                  action=action,
                                  reward=reward,
                                  terminal=terminal)
            rewards[n % 100] = reward
            if sum(rewards) == 100.0:
                return
Example #11
class NAFModel(Model):
    default_config = NAFModelConfig

    def __init__(self, config, scope, define_network=None):
        """
        Training logic for NAFs.

        :param config: Configuration parameters
        """
        super(NAFModel, self).__init__(config, scope)
        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.epsilon = self.config.epsilon
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.next_states = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape),
                                          name="next_states")

        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.terminals = tf.placeholder(tf.float32, [None], name='terminals')
        self.rewards = tf.placeholder(tf.float32, [None], name='rewards')
        self.q_targets = tf.placeholder(tf.float32, [None], name='q_targets')
        self.target_network_update = []
        self.episode = 0

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.training_model = NeuralNetwork(define_network, [self.state], scope=scope + 'training')
        self.target_model = NeuralNetwork(define_network, [self.next_states], scope=scope + 'target')

        # Create output fields
        self.training_v, self.mu, self.advantage, self.q, self.training_output_vars = self.create_outputs(
            self.training_model.get_output(), 'outputs_training')
        self.target_v, _, _, _, self.target_output_vars = self.create_outputs(self.target_model.get_output(),
                                                                              'outputs_target')
        self.create_training_operations()
        self.saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())

    def get_action(self, state, episode=1):
        """
        Returns the NAF action(s) given by the mean output of the network.

        :param state: Current state
        :param episode: Current episode
        :return: action
        """
        action = self.session.run(self.mu, {self.state: [state]})[0] + self.exploration(episode, self.total_states)
        self.total_states += 1

        return action

    def update(self, batch):
        """
        Executes a NAF update on a training batch.

        :param batch: Training batch
        :return:
        """
        float_terminals = batch['terminals'].astype(float)

        q_targets = batch['rewards'] + (1. - float_terminals) * self.gamma * np.squeeze(
            self.get_target_value_estimate(batch['next_states']))

        self.session.run([self.optimize_op, self.loss, self.training_v, self.advantage, self.q], {
            self.q_targets: q_targets,
            self.actions: batch['actions'],
            self.state: batch['states']})

    def create_outputs(self, last_hidden_layer, scope):
        """
        Creates NAF specific outputs.

        :param last_hidden_layer: Points to last hidden layer
        :param scope: TF name scope

        :return Output variables and all TF variables created in this scope
        """

        with tf.name_scope(scope):
            # State-value function
            v = linear(last_hidden_layer, {'num_outputs': 1, 'weights_regularizer': self.config.weights_regularizer,
                                           'weights_regularizer_args': [self.config.weights_regularizer_args]}, scope + 'v')

            # Action outputs
            mu = linear(last_hidden_layer, {'num_outputs': self.action_count, 'weights_regularizer': self.config.weights_regularizer,
                                            'weights_regularizer_args': [self.config.weights_regularizer_args]}, scope + 'mu')

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = int(self.action_count * (self.action_count + 1) / 2)
            l_entries = linear(last_hidden_layer, {'num_outputs': lower_triangular_size,
                                                   'weights_regularizer': self.config.weights_regularizer,
                                                   'weights_regularizer_args': [self.config.weights_regularizer_args]},
                               scope + 'l')

            # Iteratively construct the lower triangular matrix L from the flat network output
            l_rows = []
            offset = 0

            for i in xrange(self.action_count):
                # Diagonal elements are exponentiated, otherwise gradient often 0
                # Slice out lower triangular entries from flat representation through moving offset

                diagonal = tf.exp(tf.slice(l_entries, (0, offset), (-1, 1)))

                n = self.action_count - i - 1
                # Slice out non-zero non-diagonal entries, - 1 because we already took the diagonal
                non_diagonal = tf.slice(l_entries, (0, offset + 1), (-1, n))

                # Fill up row with zeros
                row = tf.pad(tf.concat(axis=1, values=(diagonal, non_diagonal)), ((0, 0), (i, 0)))
                offset += (self.action_count - i)
                l_rows.append(row)

            # Stack rows to matrix
            l_matrix = tf.transpose(tf.stack(l_rows, axis=1), (0, 2, 1))

            # P = LL^T
            p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))

            # Need to adjust dimensions to multiply with P.
            action_diff = tf.expand_dims(self.actions - mu, -1)

            # A(s, a) = -0.5 * (a - mu)^T P (a - mu)
            advantage = -0.5 * tf.matmul(tf.transpose(action_diff, [0, 2, 1]),
                                         tf.matmul(p_matrix, action_diff))
            advantage = tf.reshape(advantage, [-1, 1])

            with tf.name_scope('q_values'):
                # Q = A + V
                q_value = v + advantage

        # Get all variables under this scope for target network update
        return v, mu, advantage, q_value, get_variables(scope)

    def create_training_operations(self):
        """
        NAF update logic.
        """

        with tf.name_scope("update"):
            # MSE
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_targets, tf.squeeze(self.q)),
                                       name='loss')
            self.optimize_op = self.optimizer.minimize(self.loss)

        with tf.name_scope("update_target"):
            # Combine hidden layer variables and output layer variables
            self.training_vars = self.training_model.get_variables() + self.training_output_vars
            self.target_vars = self.target_model.get_variables() + self.target_output_vars

            for v_source, v_target in zip(self.training_vars, self.target_vars):
                update = v_target.assign_sub(self.tau * (v_target - v_source))

                self.target_network_update.append(update)

    def get_target_value_estimate(self, next_states):
        """
        Estimate of next state V value through target network.

        :param next_states:
        :return:
        """

        return self.session.run(self.target_v, {self.next_states: next_states})

    def update_target_network(self):
        """
        Updates target network.

        :return:
        """
        self.session.run(self.target_network_update)
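
In create_outputs above, the flat l_entries vector is unpacked row by row, the diagonal is exponentiated, and the resulting lower triangular matrix L yields a positive definite P = L L^T for the quadratic advantage. A small NumPy sketch of the same computation for a single state and two actions, just to make the packing order and shapes concrete (illustrative only):

import numpy as np

mu = np.array([0.3, -0.1])       # action mean output by the network
actions = np.array([0.5, 0.2])   # actions actually taken

# Flat lower-triangular entries in the order sliced above: [d_0, off-diagonal, d_1]
l_entries = np.array([0.1, 0.4, -0.2])
L = np.array([[np.exp(l_entries[0]), 0.0],
              [l_entries[1], np.exp(l_entries[2])]])

P = L.dot(L.T)                             # positive definite by construction
diff = actions - mu
advantage = -0.5 * diff.dot(P).dot(diff)   # A(s, a) = -0.5 * (a - mu)^T P (a - mu)
print(advantage)
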
Example #12
class SimpleQModel(Model):
    # Default config values
    default_config = {
        "alpha": 0.01,
        "gamma": 0.99,
        "network_layers": [{
            "type": "linear",
            "num_outputs": 16
        }]
    }

    def __init__(self, config, scope):
        """
        Initialize model, build network and tensorflow ops

        :param config: Config object or dict
        :param scope: tensorflow scope name
        """
        super(SimpleQModel, self).__init__(config, scope)
        self.action_count = self.config.actions

        self.random = np.random.RandomState()

        with tf.device(self.config.tf_device):
            # Create state placeholder
            # self.batch_shape is [None] (set in Model.__init__)
            self.state = tf.placeholder(tf.float32,
                                        self.batch_shape +
                                        list(self.config.state_shape),
                                        name="state")

            # Create neural network
            output_layer = [{
                "type": "linear",
                "num_outputs": self.action_count
            }]
            self.network = NeuralNetwork(self.config.network_layers +
                                         output_layer,
                                         self.state,
                                         scope=self.scope + "network")
            self.network_out = self.network.get_output()

            # Create operations
            self.create_ops()
            self.init_op = tf.global_variables_initializer()

            # Create optimizer
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.config.alpha)

    def get_action(self, state, episode=1):
        """
        Get action for a given state

        :param state: ndarray containing the state
        :param episode: number of episode (for epsilon decay and alike)
        :return: action
        """

        # self.exploration is initialized in Model.__init__ and provides an API for different exploration methods,
        # such as epsilon greedy.
        epsilon = self.exploration(episode,
                                   self.total_states)  # returns a float

        if self.random.random_sample() < epsilon:
            action = self.random.randint(0, self.action_count)
        else:
            action = self.session.run(self.q_action, {self.state: [state]})[0]

        self.total_states += 1
        return action

    def update(self, batch):
        """
        Update model parameters

        :param batch: memory batch
        :return:
        """
        # Get Q values for next states
        next_q = self.session.run(self.network_out,
                                  {self.state: batch['next_states']})

        # Bellman equation: Q = r + gamma * max Q'
        q_targets = batch['rewards'] + (1. - batch['terminals'].astype(float)) \
                                       * self.config.gamma * np.max(next_q, axis=1)

        self.session.run(
            self.optimize_op, {
                self.state: batch['states'],
                self.actions: batch['actions'],
                self.q_targets: q_targets
            })

    def initialize(self):
        """
        Initialize model variables
        :return:
        """
        self.session.run(self.init_op)

    def create_ops(self):
        """
        Create tensorflow ops

        :return:
        """
        with tf.name_scope(self.scope):
            with tf.name_scope("predict"):
                self.q_action = tf.argmax(self.network_out, axis=1)

            with tf.name_scope("update"):
                # These are the target Q values, i.e. the actual rewards plus the expected values of the next states
                # (Bellman equation).
                self.q_targets = tf.placeholder(tf.float32, [None],
                                                name='q_targets')

                # Actions that have been taken.
                self.actions = tf.placeholder(tf.int32, [None], name='actions')

                # We need the Q values of the current states to calculate the difference ("loss") between the
                # expected values and the new values (q targets). Therefore we do a forward-pass
                # and reduce the results to the actions that have been taken.

                # One_hot tensor of the actions that have been taken.
                actions_one_hot = tf.one_hot(self.actions,
                                             self.action_count,
                                             1.0,
                                             0.0,
                                             name='action_one_hot')

                # Training output, reduced to the actions that have been taken.
                q_values_actions_taken = tf.reduce_sum(self.network_out *
                                                       actions_one_hot,
                                                       axis=1,
                                                       name='q_acted')

                # The loss is the difference between the q_targets and the expected q values.
                self.loss = tf.reduce_sum(
                    tf.square(self.q_targets - q_values_actions_taken))
                self.optimize_op = self.optimizer.minimize(self.loss)
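
A hedged usage sketch for this model, built only from the config keys and methods that appear above; the base Model class may require further settings (e.g. for exploration) that this excerpt does not show:

config = {
    "actions": 2,
    "state_shape": (4,),
    "alpha": 0.01,
    "gamma": 0.99,
    "tf_device": "/cpu:0",
    "network_layers": [{"type": "linear", "num_outputs": 16}],
}

model = SimpleQModel(config, scope="simple_q")
model.initialize()

action = model.get_action(state=[0.0, 0.0, 0.0, 0.0], episode=1)
# update() expects a batch dict of numpy arrays with the keys read above:
# 'states', 'actions', 'rewards', 'next_states', 'terminals'
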
Example #13
    def __init__(self, config, scope, define_network=None):
        """
        Training logic for DQN.

        :param config: Configuration dict
        """
        super(DQNModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        self.double_dqn = self.config.double_dqn

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.target_network_update = []

        # output layer
        output_layer_config = [{
            "type": "linear",
            "num_outputs": self.config.actions,
            "trainable": True
        }]

        self.device = self.config.tf_device
        if self.device == 'replica':
            self.device = tf.train.replica_device_setter(
                ps_tasks=1, worker_device=self.config.tf_worker_device)

        with tf.device(self.device):
            # Input placeholders
            self.state = tf.placeholder(tf.float32,
                                        self.batch_shape +
                                        list(self.config.state_shape),
                                        name="state")
            self.next_states = tf.placeholder(tf.float32,
                                              self.batch_shape +
                                              list(self.config.state_shape),
                                              name="next_states")
            self.terminals = tf.placeholder(tf.float32,
                                            self.batch_shape,
                                            name='terminals')
            self.rewards = tf.placeholder(tf.float32,
                                          self.batch_shape,
                                          name='rewards')

            if define_network is None:
                define_network = NeuralNetwork.layered_network(
                    self.config.network_layers + output_layer_config)

            self.training_model = NeuralNetwork(define_network, [self.state],
                                                scope=self.scope + 'training')
            self.target_model = NeuralNetwork(define_network,
                                              [self.next_states],
                                              scope=self.scope + 'target')

            self.training_output = self.training_model.get_output()
            self.target_output = self.target_model.get_output()

            # Create training operations
            self.create_training_operations()
            self.optimizer = tf.train.RMSPropOptimizer(self.alpha,
                                                       momentum=0.95,
                                                       epsilon=0.01)

        self.training_output = self.training_model.get_output()
        self.target_output = self.target_model.get_output()

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs',
                                            graph=tf.get_default_graph())
Example #14
class DistributedPGModel(object):
    default_config = {}

    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")

                self.global_network = NeuralNetwork(self.define_network,
                                                    [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()

    def set_session(self, session):
        self.session = session

        # Session in policy was still 'None' when
        # we initialised policy, hence need to set again
        self.policy.session = session

    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to allow
        all models to be executed asynchronously/distributed seamlessly.

        """
        # TODO rewrite agent logic so core update logic can be composed into
        # TODO distributed logic

        with tf.device(self.worker_device):
            with tf.variable_scope("local"):
                self.state = tf.placeholder(tf.float32,
                                            self.batch_shape +
                                            list(self.config.state_shape),
                                            name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                self.local_network = NeuralNetwork(self.define_network,
                                                   [self.state])
                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(self.local_network,
                                                 self.session, self.state,
                                                 self.random,
                                                 self.action_count,
                                                 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)

                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Probability distribution used in the current policy
                self.baseline_value_function = LinearValueFunction()

            self.actions = tf.placeholder(tf.float32,
                                          [None, self.action_count],
                                          name='actions')
            self.advantage = tf.placeholder(tf.float32,
                                            shape=[None, 1],
                                            name='advantage')

            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(
                self.policy.get_policy_variables(), self.actions)

            # Get the log likelihood of the actions, weight it by the advantages, and compute the gradient on that
            self.loss = -tf.reduce_mean(
                self.log_probabilities * self.advantage, name="loss_op")

            self.gradients = tf.gradients(self.loss,
                                          self.local_network.get_variables())

            grad_var_list = list(
                zip(self.gradients, self.global_network.get_variables()))

            global_step_inc = self.global_step.assign_add(
                tf.shape(self.state)[0])

            self.assign_global_to_local = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(self.local_network.get_variables(),
                                  self.global_network.get_variables())
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)
            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.alpha)

            else:
                optimizer_cls = get_function(self.optimizer)
                self.optimizer = optimizer_cls(self.alpha,
                                               *self.optimizer_args,
                                               **self.optimizer_kwargs)

            self.optimize_op = tf.group(
                self.optimizer.apply_gradients(grad_var_list), global_step_inc)

    def get_action(self, state, episode=1):
        return self.policy.sample(state)

    def update(self, batch):
        """
        Get global parameters, compute update, then send results to parameter server.
        :param batch:
        :return:
        """

        self.compute_gae_advantage(batch, self.gamma, self.gae_lambda)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        # Merge episode inputs into single arrays
        _, _, actions, batch_advantage, states = self.merge_episodes(batch)

        self.session.run(
            [self.optimize_op, self.global_step], {
                self.state: states,
                self.actions: actions,
                self.advantage: batch_advantage
            })

    def get_global_step(self):
        """
        Returns global step to coordinator.
        :return:
        """
        return self.session.run(self.global_step)

    def sync_global_to_local(self):
        """
        Copy shared global weights to local network.

        """
        self.session.run(self.assign_global_to_local)

    def load_model(self, path):
        self.saver.restore(self.session, path)

    def save_model(self, path):
        self.saver.save(self.session, path)

    # TODO remove this duplication, move to util or let distributed agent
    # have a pg agent as a field
    def merge_episodes(self, batch):
        """
        Merge episodes of a batch into single input variables.

        :param batch:
        :return:
        """
        if self.continuous:
            action_log_stds = np.concatenate(
                [path['action_log_stds'] for path in batch])
            action_log_stds = np.expand_dims(action_log_stds, axis=1)
        else:
            action_log_stds = None

        action_means = np.concatenate([path['action_means'] for path in batch])
        actions = np.concatenate([path['actions'] for path in batch])
        batch_advantage = np.concatenate([path["advantage"] for path in batch])

        if self.normalize_advantage:
            batch_advantage = zero_mean_unit_variance(batch_advantage)

        batch_advantage = np.expand_dims(batch_advantage, axis=1)
        states = np.concatenate([path['states'] for path in batch])

        return action_log_stds, action_means, actions, batch_advantage, states

    # TODO duplicate code -> refactor from pg model
    def compute_gae_advantage(self, batch, gamma, gae_lambda, use_gae=False):
        """
        Expects a batch containing at least one episode, sets advantages according to use_gae.

        :param batch: Sequence of observations for at least one episode.
        """
        for episode in batch:
            baseline = self.baseline_value_function.predict(episode)
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            episode['returns'] = discount(episode['rewards'], gamma)

            if use_gae:
                deltas = episode['rewards'] + gamma * adjusted_baseline[
                    1:] - adjusted_baseline[:-1]
                episode['advantage'] = discount(deltas, gamma * gae_lambda)
            else:
                episode['advantage'] = episode['returns'] - baseline
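
compute_gae_advantage above relies on a discount helper for both the returns and the GAE deltas; a common implementation is a reverse discounted cumulative sum, sketched here under that assumption:

import numpy as np

def discount(rewards, gamma):
    # out[t] = sum_{k >= t} gamma**(k - t) * rewards[k]
    discounted = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted

# Example: discount([1.0, 1.0, 1.0], 0.9) -> [2.71, 1.9, 1.0]
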
Example #16
    def __init__(self, config, scope, define_network=None):
        """
        Training logic for NAFs.

        :param config: Configuration parameters
        """
        super(NAFModel, self).__init__(config, scope)
        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.epsilon = self.config.epsilon
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32,
                                    (None, None) + self.state_shape,
                                    name="state")
        self.next_states = tf.placeholder(tf.float32,
                                          (None, None) + self.state_shape,
                                          name="next_states")
        self.actions = tf.placeholder(tf.float32,
                                      (None, None, self.action_count),
                                      name='actions')
        self.terminals = tf.placeholder(tf.float32, (None, None),
                                        name='terminals')
        self.rewards = tf.placeholder(tf.float32, (None, None), name='rewards')

        self.q_targets = tf.placeholder(tf.float32, (None, None),
                                        name='q_targets')
        self.target_network_update = []
        self.episode = 0

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(
                self.config.network_layers)

        self.training_network = NeuralNetwork(
            define_network, [self.state],
            episode_length=self.episode_length,
            scope=scope + 'training')
        self.target_network = NeuralNetwork(define_network, [self.next_states],
                                            episode_length=self.episode_length,
                                            scope=scope + 'target')

        self.training_internal_states = self.training_network.internal_state_inits
        self.target_internal_states = self.target_network.internal_state_inits

        # Create output fields
        self.training_v, self.mu, self.advantage, self.q, self.training_output_vars = self.create_outputs(
            self.training_network.output, 'outputs_training')
        self.target_v, _, _, _, self.target_output_vars = self.create_outputs(
            self.target_network.output, 'outputs_target')
        self.create_training_operations()
        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs',
                                            graph=tf.get_default_graph())

        self.session.run(tf.global_variables_initializer())
Example #17
    def __init__(self, config, scope, network_builder=None):
        super(PGModel, self).__init__(config, scope)

        self.continuous = self.config.continuous
        self.batch_size = self.config.batch_size
        self.max_episode_length = min(self.config.max_episode_length,
                                      self.batch_size)
        self.action_count = self.config.actions

        # advantage estimation
        self.gamma = self.config.gamma
        self.generalized_advantage_estimation = self.config.gae
        self.gae_lambda = self.config.gae_lambda
        self.normalize_advantage = self.config.normalize_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32,
                                    (None, None) + self.state_shape,
                                    name="state")
        self.actions = tf.placeholder(tf.float32,
                                      (None, None, self.action_count),
                                      name='actions')
        self.prev_action_means = tf.placeholder(
            tf.float32, (None, None, self.action_count), name='prev_actions')
        self.advantage = tf.placeholder(tf.float32,
                                        shape=(None, None, 1),
                                        name='advantage')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(
                self.config.network_layers)
        if self.config.tf_scope is None:
            scope = ''
        else:
            scope = self.config.tf_scope + '-'
        self.network = NeuralNetwork(network_builder,
                                     inputs=[self.state],
                                     episode_length=self.episode_length,
                                     scope=scope + 'value_function')
        self.internal_states = self.network.internal_state_inits

        # From an API perspective, choosing continuous vs discrete is currently easier than
        # requiring the concrete policy to be set explicitly
        if self.continuous:
            self.policy = GaussianPolicy(self.network, self.session,
                                         self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(
                tf.float32, (None, None, self.action_count))
            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.network, self.session,
                                                  self.state, self.random,
                                                  self.action_count,
                                                  'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        size = 1
        for dims in self.state_shape:
            size *= dims
        self.baseline_value_function = LinearValueFunction()
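
The constructor above only stores the advantage-estimation settings (gamma, gae_lambda, the GAE flag); the estimate itself follows the usual recursion A_t = delta_t + gamma * lambda * A_{t+1} with delta_t = r_t + gamma * V(s_{t+1}) - V(s_t). A minimal NumPy sketch of that computation, illustrative only and not part of this codebase:

import numpy as np

def gae_advantages(rewards, values, gamma, gae_lambda):
    """Generalized advantage estimation for a single episode.

    `values` must contain one extra bootstrap entry V(s_T) at the end.
    """
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        # One-step TD error, then the exponentially weighted recursion
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * gae_lambda * gae
        advantages[t] = gae
    return advantages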
Example #18
0
class DQNModel(Model):
    default_config = DQNModelConfig

    def __init__(self, config, scope, network_builder=None):
        """
        Training logic for DQN.

        :param config: Configuration dict
        """
        super(DQNModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        # self.batch_size = self.config.batch_size

        self.double_dqn = self.config.double_dqn

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.target_network_update = []

        # output layer
        output_layer_config = [{"type": "linear", "num_outputs": self.config.actions, "trainable": True}]

        # Input placeholders
        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32, (None, None) + self.state_shape, name="state")
        self.next_states = tf.placeholder(tf.float32, (None, None) + self.state_shape,
                                          name="next_states")
        self.terminals = tf.placeholder(tf.float32, (None, None), name='terminals')
        self.rewards = tf.placeholder(tf.float32, (None, None), name='rewards')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(self.config.network_layers + output_layer_config)

        self.training_network = NeuralNetwork(network_builder, [self.state], episode_length=self.episode_length,
                                              scope=self.scope + 'training')
        self.target_network = NeuralNetwork(network_builder, [self.next_states], episode_length=self.episode_length,
                                            scope=self.scope + 'target')

        self.training_internal_states = self.training_network.internal_state_inits
        self.target_internal_states = self.target_network.internal_state_inits

        self.training_output = self.training_network.output
        self.target_output = self.target_network.output

        # Create training operations
        self.create_training_operations()

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs', graph=tf.get_default_graph())
        self.session.run(self.init_op)

    def get_action(self, state, episode=1):
        """
        Returns the predicted action for a given state.

        :param state: State tensor
        :param episode: Current episode
        :return: action number
        """
        epsilon = self.exploration(episode, self.total_states)

        if self.random.random_sample() < epsilon:
            action = self.random.randint(0, self.action_count)
        else:
            fetches = [self.dqn_action]
            fetches.extend(self.training_internal_states)
            fetches.extend(self.target_internal_states)

            feed_dict = {self.episode_length: [1], self.state: [(state,)]}

            feed_dict.update({internal_state: self.training_network.internal_state_inits[n] for n, internal_state in
                              enumerate(self.training_network.internal_state_inputs)})
            feed_dict.update({internal_state: self.target_network.internal_state_inits[n] for n, internal_state in
                              enumerate(self.target_network.internal_state_inputs)})

            fetched = self.session.run(fetches=fetches, feed_dict=feed_dict)
            # First element of output list is action
            action = fetched[0][0]

            # Update optional internal states, e.g. LSTM cells
            n_training = len(self.training_internal_states)
            self.training_internal_states = fetched[1:1 + n_training]
            self.target_internal_states = fetched[1 + n_training:]

        self.total_states += 1

        return action
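
get_action above delegates the exploration rate to self.exploration(episode, total_states). A minimal linearly decaying schedule of the kind that could be plugged in there; the names and default values are illustrative, not the library's API:

def linear_decay_epsilon(epsilon_start=1.0, epsilon_final=0.1, decay_states=100000):
    """Return a callable mapping (episode, total_states) to an exploration rate."""
    def epsilon(episode, total_states):
        # Interpolate from epsilon_start to epsilon_final over decay_states observed states
        fraction = min(float(total_states) / decay_states, 1.0)
        return epsilon_start + fraction * (epsilon_final - epsilon_start)
    return epsilon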

    def update(self, batch):
        """
        Perform a single training step and updates the target network.

        :param batch: Mini batch to use for training
        :return: void
        """
        self.logger.debug('Updating DQN model..')

        # Compute estimated future value
        float_terminals = batch['terminals'].astype(float)
        y = self.get_target_values(batch['next_states'])

        q_targets = batch['rewards'] + (1. - float_terminals) \
                                       * self.gamma * y

        feed_dict = {
            self.episode_length: [len(batch['rewards'])],
            self.q_targets: q_targets,
            self.actions: [batch['actions']],
            self.state: [batch['states']]
        }

        fetches = [self.optimize_op, self.training_output]
        fetches.extend(self.training_network.internal_state_outputs)
        fetches.extend(self.target_network.internal_state_outputs)

        for n, internal_state in enumerate(self.training_network.internal_state_inputs):
            feed_dict[internal_state] = self.training_internal_states[n]

        for n, internal_state in enumerate(self.target_network.internal_state_inputs):
            feed_dict[internal_state] = self.target_internal_states[n]

        fetched = self.session.run(fetches, feed_dict)

        # Update internal state list, e.g. for LSTM cells
        n_training = len(self.training_internal_states)
        self.training_internal_states = fetched[2:2 + n_training]
        self.target_internal_states = fetched[2 + n_training:]

    def get_variables(self):
        return self.training_network.get_variables()

    def assign_variables(self, values):
        assign_variables_ops = [variable.assign(value) for variable, value in zip(self.get_variables(), values)]
        self.session.run(tf.group(assign_variables_ops))

    def get_gradients(self):
        return self.grads_and_vars

    def apply_gradients(self, grads_and_vars):
        apply_gradients_op = self.optimizer.apply_gradients(grads_and_vars)
        self.session.run(apply_gradients_op)

    def create_training_operations(self):
        """
        Create graph operations for loss computation and
        target network updates.

        """
        with tf.name_scope(self.scope):
            with tf.name_scope("predict"):
                self.dqn_action = tf.argmax(self.training_output, axis=2, name='dqn_action')

            with tf.name_scope("targets"):
                if self.double_dqn:
                    selector = tf.one_hot(self.dqn_action, self.action_count, name='selector')
                    self.target_values = tf.reduce_sum(tf.multiply(self.target_output, selector), axis=2,
                                                       name='target_values')
                else:
                    self.target_values = tf.reduce_max(self.target_output, axis=2,
                                                       name='target_values')

            with tf.name_scope("update"):
                # self.q_targets is fed the observed rewards plus the discounted future value estimates
                self.q_targets = tf.placeholder(tf.float32, (None, None), name='q_targets')

                # self.actions is fed the actions that were actually taken
                self.actions = tf.placeholder(tf.int32, (None, None), name='actions')

                # One_hot tensor of the actions that have been taken
                actions_one_hot = tf.one_hot(self.actions, self.action_count, 1.0, 0.0, name='action_one_hot')

                # Q-values of the actions that were actually taken
                q_values_actions_taken = tf.reduce_sum(self.training_output * actions_one_hot, axis=2,
                                                       name='q_acted')

                # Temporal-difference error between targets and predicted Q-values
                delta = self.q_targets - q_values_actions_taken

                # If gradient clipping is used, compute the Huber loss instead of the squared error
                if self.config.clip_gradients:
                    huber_loss = tf.where(tf.abs(delta) < self.clip_value, 0.5 * tf.square(delta), tf.abs(delta) - 0.5)
                    self.loss = tf.reduce_mean(huber_loss, name='compute_surrogate_loss')
                else:
                    self.loss = tf.reduce_mean(tf.square(delta), name='compute_surrogate_loss')

                self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
                self.optimize_op = self.optimizer.apply_gradients(self.grads_and_vars)

                # Update target network with update weight tau
                with tf.name_scope("update_target"):
                    for v_source, v_target in zip(self.training_network.variables, self.target_network.variables):
                        update = v_target.assign_sub(self.tau * (v_target - v_source))
                        self.target_network_update.append(update)
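
For reference, the piecewise loss built above is quadratic inside the clipping threshold and linear outside it. A NumPy transcription of the exact tf.where expression used in the graph, as a sketch for intuition only:

import numpy as np

def huber_like_loss(delta, clip_value):
    """Mirror of the graph expression: 0.5 * delta**2 where |delta| < clip_value, |delta| - 0.5 otherwise."""
    abs_delta = np.abs(delta)
    return np.where(abs_delta < clip_value, 0.5 * np.square(delta), abs_delta - 0.5)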

    def get_target_values(self, next_states):
        """
        Estimate of next state Q values.
        :param next_states:
        :return:
        """
        if self.double_dqn:
            return self.session.run(self.target_values, {self.state: [next_states], self.next_states: [next_states]})
        else:
            return self.session.run(self.target_values, {self.next_states: [next_states]})
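
When double_dqn is enabled, the graph above selects the greedy actions with the training network and evaluates them with the target network. A small NumPy sketch of that selection rule, illustrative only:

import numpy as np

def double_dqn_target_values(training_q, target_q):
    """Pick argmax actions from the training network, read their values from the target network."""
    best_actions = np.argmax(training_q, axis=-1)
    return np.take_along_axis(target_q, best_actions[..., None], axis=-1).squeeze(-1)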

    def update_target_network(self):
        """
        Updates target network.

        :return:
        """
        self.session.run(self.target_network_update)
Example #19
0
    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        # Device string used to pin this worker's ops in distributed TensorFlow
        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")

                self.global_network = NeuralNetwork(self.define_network,
                                                    [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Linear baseline used for value estimation
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()
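
For context, the task_index and cluster_spec arguments above follow the standard TF1 distributed setup: variables live on a parameter server while each worker runs its own copy of the graph. A minimal, purely illustrative cluster definition and device function (addresses are placeholders):

import tensorflow as tf

# One parameter server and two workers, all on localhost (illustrative addresses)
cluster_spec = tf.train.ClusterSpec({
    "ps": ["localhost:2222"],
    "worker": ["localhost:2223", "localhost:2224"]
})

# Variables created under this device function are placed on the parameter server,
# while ops run on the chosen worker, matching the replica_device_setter call above
device_fn = tf.train.replica_device_setter(
    1, worker_device="/job:worker/task:0/cpu:0", cluster=cluster_spec)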
Example #20
0
class DQNModel(Model):
    default_config = DQNModelConfig

    def __init__(self, config, scope, define_network=None):
        """
        Training logic for DQN.

        :param config: Configuration dict
        """
        super(DQNModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        self.double_dqn = self.config.double_dqn

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.target_network_update = []

        # output layer
        output_layer_config = [{
            "type": "linear",
            "num_outputs": self.config.actions,
            "trainable": True
        }]

        self.device = self.config.tf_device
        if self.device == 'replica':
            self.device = tf.train.replica_device_setter(
                ps_tasks=1, worker_device=self.config.tf_worker_device)

        with tf.device(self.device):
            # Input placeholders
            self.state = tf.placeholder(tf.float32,
                                        self.batch_shape +
                                        list(self.config.state_shape),
                                        name="state")
            self.next_states = tf.placeholder(tf.float32,
                                              self.batch_shape +
                                              list(self.config.state_shape),
                                              name="next_states")
            self.terminals = tf.placeholder(tf.float32,
                                            self.batch_shape,
                                            name='terminals')
            self.rewards = tf.placeholder(tf.float32,
                                          self.batch_shape,
                                          name='rewards')

            if define_network is None:
                define_network = NeuralNetwork.layered_network(
                    self.config.network_layers + output_layer_config)

            self.training_model = NeuralNetwork(define_network, [self.state],
                                                scope=self.scope + 'training')
            self.target_model = NeuralNetwork(define_network,
                                              [self.next_states],
                                              scope=self.scope + 'target')

            self.training_output = self.training_model.get_output()
            self.target_output = self.target_model.get_output()

            # Create the optimizer first, since the training operations use it
            self.optimizer = tf.train.RMSPropOptimizer(self.alpha,
                                                       momentum=0.95,
                                                       epsilon=0.01)
            self.create_training_operations()

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs',
                                            graph=tf.get_default_graph())

    def initialize(self):
        self.session.run(self.init_op)

    def get_action(self, state, episode=1):
        """
        Returns the predicted action for a given state.

        :param state: State tensor
        :param episode: Current episode
        :return: action number
        """

        epsilon = self.exploration(episode, self.total_states)

        if self.random.random_sample() < epsilon:
            action = self.random.randint(0, self.action_count)
        else:
            action = self.session.run(self.dqn_action,
                                      {self.state: [state]})[0]

        self.total_states += 1
        return action

    def update(self, batch):
        """
        Perform a single training step and updates the target network.

        :param batch: Mini batch to use for training
        :return: void
        """

        # Compute estimated future value
        float_terminals = batch['terminals'].astype(float)
        q_targets = batch['rewards'] + (1. - float_terminals) \
                                     * self.gamma * self.get_target_values(batch['next_states'])

        self.session.run(
            [self.optimize_op, self.training_output], {
                self.q_targets: q_targets,
                self.actions: batch['actions'],
                self.state: batch['states']
            })
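
The target computation above is the standard one-step Q-learning target. Spelled out with NumPy arrays, as an illustrative helper rather than part of the model:

import numpy as np

def q_learning_targets(rewards, terminals, next_state_values, gamma):
    """r + (1 - terminal) * gamma * max_a' Q_target(s', a'), with terminals as 0/1 flags."""
    return rewards + (1.0 - terminals.astype(np.float32)) * gamma * next_state_values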

    def get_variables(self):
        return self.training_model.get_variables()

    def assign_variables(self, values):
        assign_variables_ops = [
            variable.assign(value)
            for variable, value in zip(self.get_variables(), values)
        ]
        self.session.run(tf.group(assign_variables_ops))

    def get_gradients(self):
        return self.grads_and_vars

    def apply_gradients(self, grads_and_vars):
        apply_gradients_op = self.optimizer.apply_gradients(grads_and_vars)
        self.session.run(apply_gradients_op)

    def create_training_operations(self):
        """
        Create graph operations for compute_surrogate_loss computation and
        target network updates.

        :return:
        """
        with tf.name_scope(self.scope):
            with tf.name_scope("predict"):
                self.dqn_action = tf.argmax(self.training_output,
                                            axis=1,
                                            name='dqn_action')

            with tf.name_scope("targets"):
                if self.double_dqn:
                    selector = tf.one_hot(self.dqn_action,
                                          self.action_count,
                                          name='selector')
                    self.target_values = tf.reduce_sum(tf.multiply(
                        self.target_output, selector),
                                                       axis=1,
                                                       name='target_values')
                else:
                    self.target_values = tf.reduce_max(self.target_output,
                                                       axis=1,
                                                       name='target_values')

            with tf.name_scope("update"):
                # self.q_targets is fed the observed rewards plus the discounted future value estimates
                self.q_targets = tf.placeholder(tf.float32, [None],
                                                name='q_targets')

                # self.actions is fed the actions that were actually taken
                self.actions = tf.placeholder(tf.int32, [None], name='actions')

                # One_hot tensor of the actions that have been taken
                actions_one_hot = tf.one_hot(self.actions,
                                             self.action_count,
                                             1.0,
                                             0.0,
                                             name='action_one_hot')

                # Q-values of the actions that were actually taken
                q_values_actions_taken = tf.reduce_sum(self.training_output *
                                                       actions_one_hot,
                                                       axis=1,
                                                       name='q_acted')

                # Temporal-difference error between targets and predicted Q-values
                delta = self.q_targets - q_values_actions_taken

                # If gradient clipping is used, compute the Huber loss instead of the squared error
                if self.config.clip_gradients:
                    huber_loss = tf.where(
                        tf.abs(delta) < self.clip_value,
                        0.5 * tf.square(delta),
                        tf.abs(delta) - 0.5)
                    self.loss = tf.reduce_mean(huber_loss,
                                               name='compute_surrogate_loss')
                else:
                    self.loss = tf.reduce_mean(tf.square(delta),
                                               name='compute_surrogate_loss')

                self.grads_and_vars = self.optimizer.compute_gradients(
                    self.loss)
                self.optimize_op = self.optimizer.apply_gradients(
                    self.grads_and_vars)

            # Update target network with update weight tau
            with tf.name_scope("update_target"):
                for v_source, v_target in zip(
                        self.training_model.get_variables(),
                        self.target_model.get_variables()):
                    update = v_target.assign_sub(self.tau *
                                                 (v_target - v_source))
                    self.target_network_update.append(update)

    def get_target_values(self, next_states):
        """
        Estimate of next state Q values.
        :param next_states:
        :return:
        """
        if self.double_dqn:
            return self.session.run(self.target_values, {
                self.state: next_states,
                self.next_states: next_states
            })
        else:
            return self.session.run(self.target_values,
                                    {self.next_states: next_states})

    def update_target_network(self):
        """
        Updates target network.

        :return:
        """
        self.session.run(self.target_network_update)
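
update_target_network applies the soft update v_target <- v_target - tau * (v_target - v_source), i.e. an exponential moving average of the training weights with mixing rate tau. A NumPy sketch of the same rule, illustrative only:

import numpy as np

def soft_update(target_weights, source_weights, tau):
    """Move each target weight a fraction tau of the way towards the corresponding source weight."""
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_weights, source_weights)]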