예제 #1
0
    def _build_value_initial(self):
        """ Builds the value model (initial step)

            Registers the placeholders, computes the state value of the current power with
            `self.get_board_value`, computes the mean squared error against the value target,
            and hands the resulting outputs to `self.add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf, to_float

        # Registering the placeholders (merged into the existing ones when some are already set)
        if self.placeholders:
            self.placeholders.update(self.get_placeholders())
        else:
            self.placeholders = self.get_placeholders()

        # Building the graph in the 'value' scope
        with tf.variable_scope('value', reuse=tf.AUTO_REUSE):

            # Using the worker device of the cluster config (if there is one)
            worker_device = self.cluster_config.worker_device if self.cluster_config else None
            with tf.device(worker_device):

                # Features
                board_state = to_float(self.features['board_state'])    # tf.float32 - (b, NB_NODES, NB_FEATURES)
                current_power = self.features['current_power']          # tf.int32   - (b,)
                value_target = self.features['value_target']            # tf.float32 - (b,)

                # Placeholders
                stop_gradient_all = self.placeholders['stop_gradient_all']

                # Value of the board for the current power
                state_value = self.get_board_value(board_state, current_power)

                # Mean squared error between the target and the predicted value
                # The gradient is blocked when stop_gradient_all is set
                with tf.variable_scope('value_loss'):
                    squared_error = tf.square(value_target - state_value)
                    mean_squared_error = tf.reduce_mean(squared_error)
                    value_loss = tf.cond(stop_gradient_all,
                                         lambda: tf.stop_gradient(mean_squared_error),
                                         lambda: mean_squared_error)

        # Output tags
        outputs = {'tag/value/v001_val_relu_7': True,
                   'state_value': state_value,
                   'value_loss': value_loss}

        # Adding features, placeholders and outputs to graph
        self.add_meta_information(outputs)
예제 #2
0
    def _build_draw_initial(self):
        """ Builds the draw model (initial step)

            Registers the placeholders, embeds the board state with one graph convolution followed by
            a dense layer, predicts a draw probability per power with a small feed-forward network,
            keeps the probability of the current power, and computes its mean squared error against
            the draw target. The outputs are handed to `self.add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.graph_convolution import GraphConvolution, preprocess_adjacency
        from diplomacy_research.utils.tensorflow import to_float

        # Registering the placeholders (merged into the existing ones when some are already set)
        if self.placeholders:
            self.placeholders.update(self.get_placeholders())
        else:
            self.placeholders = self.get_placeholders()

        # Hyper-parameters used by this model
        hparams = self.hparams

        # Building the graph in the 'draw' scope
        with tf.variable_scope('draw', reuse=tf.AUTO_REUSE):

            # Using the worker device of the cluster config (if there is one)
            worker_device = self.cluster_config.worker_device if self.cluster_config else None
            with tf.device(worker_device):

                # Features
                board_state = to_float(self.features['board_state'])    # tf.float32 - (b, NB_NODES, NB_FEATURES)
                current_power = self.features['current_power']          # tf.int32   - (b,)
                draw_target = self.features['draw_target']              # tf.float32 - (b,)

                # Placeholders
                stop_gradient_all = self.placeholders['stop_gradient_all']

                # Normalized adjacency matrix, repeated for every item in the batch
                batch_size = tf.shape(board_state)[0]
                single_adjacency = preprocess_adjacency(get_adjacency_matrix())
                norm_adjacency = tf.tile(tf.expand_dims(single_adjacency, axis=0), [batch_size, 1, 1])

                # Graph embeddings
                with tf.variable_scope('graph_conv_scope'):
                    gcn_output_size = hparams['draw_gcn_1_output_size']

                    # board_state: (b, 81, 35) -> graph_conv_out: (b, 81, 25)
                    graph_conv_out = GraphConvolution(input_dim=NB_FEATURES,
                                                      output_dim=gcn_output_size,
                                                      norm_adjacency=norm_adjacency,
                                                      activation_fn=tf.nn.relu,
                                                      bias=True)(board_state)

                    # flat_graph_conv_out: (b, 2025) -> board_embedding: (b, 128)
                    flat_graph_conv_out = tf.reshape(graph_conv_out, shape=[-1, NB_NODES * gcn_output_size])
                    board_embedding = tf.layers.Dense(units=hparams['draw_embedding_size'],
                                                      activation=tf.nn.relu,
                                                      use_bias=True)(flat_graph_conv_out)

                # Draw probability for all powers, then selecting the one of the current power
                with tf.variable_scope('draw_scope'):
                    power_mask = tf.one_hot(current_power, NB_POWERS, dtype=tf.float32)

                    hidden_1 = tf.layers.Dense(units=hparams['draw_h1_size'],              # (b, 64)
                                               activation=tf.nn.relu,
                                               use_bias=True)(board_embedding)
                    hidden_2 = tf.layers.Dense(units=hparams['draw_h2_size'],              # (b, 64)
                                               activation=tf.nn.relu,
                                               use_bias=True)(hidden_1)
                    draw_probs = tf.layers.Dense(units=NB_POWERS,                          # (b, 7)
                                                 activation=tf.nn.sigmoid,
                                                 use_bias=True)(hidden_2)
                    draw_prob = tf.reduce_sum(draw_probs * power_mask, axis=1)             # (b,)

                # Mean squared error between the target and the predicted draw probability
                # The gradient is blocked when stop_gradient_all is set
                with tf.variable_scope('draw_loss'):
                    squared_error = tf.square(draw_target - draw_prob)
                    mean_squared_error = tf.reduce_mean(squared_error)
                    draw_loss = tf.cond(stop_gradient_all,
                                        lambda: tf.stop_gradient(mean_squared_error),
                                        lambda: mean_squared_error)

        # Output tags
        outputs = {'tag/draw/v001_draw_relu': True,
                   'draw_prob': draw_prob,
                   'draw_loss': draw_loss}

        # Adding features, placeholders and outputs to graph
        self.add_meta_information(outputs)
예제 #3
0
    def build(self):
        """ Builds the RL model using the correct optimizer

            Computes the PPO loss (clipped surrogate loss + weighted baseline, draw and entropy losses),
            creates the optimizer op with `self.model.create_optimizer_op`, stores the losses and the
            optimizer op as outputs, and creates the 'ppo_policy_baseline' and 'ppo_increase_version' queues.
        """
        from diplomacy_research.utils.tensorflow import tf, tfp, normalize, to_float
        from diplomacy_research.models.layers.avg_grad_optimizer import AvgGradOptimizer

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.model.hparams[hparam_name]

        # Bounds used to keep the draw probability strictly inside (0, 1) before taking its log
        draw_prob_epsilon = 1e-7

        # Training loop
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):

                # Placeholders
                stop_gradient_all = self.model.placeholders['stop_gradient_all']

                # Features
                decoder_lengths = self.model.features['decoder_lengths']            # tf.int32   - (b,)
                draw_action = self.model.features['draw_action']                    # tf.bool    - (b,)
                reward_target = self.model.features['reward_target']                # tf.float32 - (b,)
                value_target = self.model.features['value_target']                  # tf.float32 - (b,)
                old_log_probs = self.model.features['old_log_probs']                # tf.float32 - (b, dec_len)
                # current_power = self.model.features['current_power']              # tf.int32   - (b,)

                # Making sure all RNN lengths are at least 1
                # Trimming to the maximum decoder length in the batch
                # The raw lengths are kept for the mask, so items with a length of 0 are fully masked
                raw_decoder_lengths = decoder_lengths
                decoder_lengths = tf.math.maximum(1, decoder_lengths)

                # Retrieving model outputs
                baseline = values = self.model.outputs['state_value']                               # (b,)
                logits = self.model.outputs['logits']                                               # (b, dec, VOCAB)
                sequence_mask = tf.sequence_mask(raw_decoder_lengths,                               # (b, dec)
                                                 maxlen=tf.reduce_max(decoder_lengths),
                                                 dtype=tf.float32)

                # Computing Baseline Mean Square Error Loss
                # Each squared error is capped at 'clip_value_threshold' before being summed over the batch
                with tf.variable_scope('baseline_scope'):
                    baseline_mse_loss = tf.minimum(tf.square(value_target - values), hps('clip_value_threshold'))
                    baseline_mse_loss = tf.reduce_sum(baseline_mse_loss)                                # ()

                # Calculating surrogate loss
                with tf.variable_scope('policy_gradient_scope'):
                    new_policy_log_probs = self.model.outputs['log_probs'] * sequence_mask              # (b, dec_len)
                    old_policy_log_probs = old_log_probs * sequence_mask                                # (b, dec_len)

                    new_sum_log_probs = tf.reduce_sum(new_policy_log_probs, axis=-1)                    # (b,)
                    old_sum_log_probs = tf.reduce_sum(old_policy_log_probs, axis=-1)                    # (b,)

                    # Probability ratio between the new and the old policy, and its clipped version
                    ratio = tf.math.exp(new_sum_log_probs - old_sum_log_probs)                          # (b,)
                    clipped_ratio = tf.clip_by_value(ratio, 1. - hps('epsilon'), 1. + hps('epsilon'))   # (b,)

                    # No gradient flows to the baseline through the advantages
                    advantages = tf.stop_gradient(normalize(reward_target - baseline))                  # (b,)

                    surrogate_loss_1 = ratio * advantages                                               # (b,)
                    surrogate_loss_2 = clipped_ratio * advantages                                       # (b,)
                    surrogate_loss = -tf.reduce_mean(tf.math.minimum(surrogate_loss_1, surrogate_loss_2))   # ()

                # Calculating policy gradient for draw action
                with tf.variable_scope('draw_gradient_scope'):
                    draw_action = to_float(draw_action)                                                 # (b,)

                    # Clipping the probability, otherwise a probability of exactly 0. or 1. gives log(0) = -inf
                    # and the term masked by draw_action becomes 0 * -inf = NaN, which corrupts the total loss
                    draw_prob = tf.clip_by_value(self.model.outputs['draw_prob'],                       # (b,)
                                                 draw_prob_epsilon,
                                                 1. - draw_prob_epsilon)
                    log_prob_of_draw = draw_action * tf.log(draw_prob) + (1. - draw_action) * tf.log(1. - draw_prob)
                    draw_gradient_loss = -1. * log_prob_of_draw * advantages                            # (b,)
                    draw_gradient_loss = tf.reduce_mean(draw_gradient_loss)                             # ()

                # Calculating entropy loss
                with tf.variable_scope('entropy_scope'):
                    entropy = tfp.distributions.Categorical(logits=logits).entropy()
                    entropy_loss = -tf.reduce_mean(entropy)                                             # ()

                # Scopes
                # 'ignored_scope' is a comma-separated string (or an empty value to ignore nothing)
                scope = ['policy', 'value', 'draw']
                global_ignored_scope = None if not hps('ignored_scope') else hps('ignored_scope').split(',')

                # Creating PPO loss
                ppo_loss = surrogate_loss \
                           + hps('value_coeff') * baseline_mse_loss \
                           + hps('draw_coeff') * draw_gradient_loss \
                           + hps('entropy_coeff') * entropy_loss
                ppo_loss = tf.cond(stop_gradient_all,
                                   lambda: tf.stop_gradient(ppo_loss),          # pylint: disable=cell-var-from-loop
                                   lambda: ppo_loss)                            # pylint: disable=cell-var-from-loop
                cost_and_scope = [(ppo_loss, scope, None)]

                # Creating optimizer op
                ppo_op = self.model.create_optimizer_op(cost_and_scope=cost_and_scope,
                                                        ignored_scope=global_ignored_scope,
                                                        max_gradient_norm=hps('max_gradient_norm'))

                # Making sure we are not using the AvgGradOptimizer, but directly the AdamOptimizer
                assert not isinstance(self.model.optimizer, AvgGradOptimizer), 'PPO does not use AvgGradOptimizer'

        # Storing outputs
        self._add_output('rl_policy_loss', surrogate_loss)
        self._add_output('rl_value_loss', baseline_mse_loss)
        self._add_output('rl_draw_loss', draw_gradient_loss)
        self._add_output('rl_entropy_loss', entropy_loss)
        self._add_output('rl_total_loss', ppo_loss)
        self._add_output('optimizer_op', ppo_op)

        # --------------------------------------
        #               Hooks
        # --------------------------------------
        def hook_baseline_pre_condition(dataset):
            """ Pre-Condition: First queue to run (last_queue is either missing or empty) """
            return not hasattr(dataset, 'last_queue') or dataset.last_queue == ''

        def hook_baseline_post_queue(dataset):
            """ Post-Queue: Marks the baseline queue as processed """
            dataset.last_queue = 'ppo_policy_baseline'

        # --------------------------------------
        #               Queues
        # --------------------------------------
        # Queue running the optimizer op and the evaluation tags with the training decoder
        self.queue_dataset.create_queue('ppo_policy_baseline',
                                        placeholders={self.model.placeholders['decoder_type']: [TRAINING_DECODER]},
                                        outputs=[self.model.outputs[output_name]
                                                 for output_name in ['optimizer_op'] + self.get_evaluation_tags()],
                                        pre_condition=hook_baseline_pre_condition,
                                        post_queue=hook_baseline_post_queue)

        # Queue incrementing the version step by 1
        self.queue_dataset.create_queue('ppo_increase_version',
                                        placeholders={self.model.placeholders['decoder_type']: [GREEDY_DECODER]},
                                        outputs=[tf.assign_add(self.version_step, 1)],
                                        with_status=True)
예제 #4
0
    def _build_value_final(self):
        """ Builds the value model (final step)

            Registers the placeholders, reads the first position of the 'rnn_states' output,
            passes it through two dense layers to get a value per power, keeps the value of the
            current power, and computes its mean squared error against the value target.
            The outputs are handed to `self.add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf

        # Registering the placeholders (merged into the existing ones when some are already set)
        if self.placeholders:
            self.placeholders.update(self.get_placeholders())
        else:
            self.placeholders = self.get_placeholders()

        # Hyper-parameters used by this model
        hparams = self.hparams

        # Building the graph in the 'value' scope
        with tf.variable_scope('value', reuse=tf.AUTO_REUSE):

            # Using the worker device of the cluster config (if there is one)
            worker_device = self.cluster_config.worker_device if self.cluster_config else None
            with tf.device(worker_device):

                # This step requires the 'rnn_states' output to be already present
                assert 'rnn_states' in self.outputs

                # Inputs and Features
                rnn_states = self.outputs['rnn_states']
                current_power = self.features['current_power']          # tf.int32   - (b,)
                value_target = self.features['value_target']            # tf.float32 - (b,)

                # Placeholders
                stop_gradient_all = self.placeholders['stop_gradient_all']

                # Optionally blocking the gradient from flowing back into the rnn states
                if hparams['stop_gradient_value']:
                    value_input = tf.stop_gradient(rnn_states)
                else:
                    value_input = rnn_states
                first_position = value_input[:, 0, :]                   # (b, lstm_size)

                # Linear with relu
                hidden = tf.layers.Dense(units=hparams['value_h1_size'],                # (b, 256)
                                         use_bias=True,
                                         activation=tf.nn.relu)(first_position)

                # Then linear without relu
                value_per_power = tf.layers.Dense(units=NB_POWERS,                      # (b, 7)
                                                  use_bias=True,
                                                  activation=None)(hidden)

                # Selecting the value of the current power
                power_mask = tf.one_hot(current_power, NB_POWERS, dtype=tf.float32)
                state_value = tf.reduce_sum(power_mask * value_per_power, axis=-1)      # (b,)

                # Mean squared error between the target and the predicted value
                # The gradient is blocked when stop_gradient_all is set
                with tf.variable_scope('value_loss'):
                    squared_error = tf.square(value_target - state_value)
                    mean_squared_error = tf.reduce_mean(squared_error)
                    value_loss = tf.cond(stop_gradient_all,
                                         lambda: tf.stop_gradient(mean_squared_error),
                                         lambda: mean_squared_error)

        # Output tags
        outputs = {'tag/value/v003_rnn_step_0': True,
                   'state_value': state_value,
                   'value_loss': value_loss}

        # Adding features, placeholders and outputs to graph
        self.add_meta_information(outputs)
예제 #5
0
    def build(self):
        """ Builds the RL model using the correct optimizer

            Computes the A2C loss (policy gradient loss + weighted baseline, draw and entropy losses),
            creates the optimizer op with `self.model.create_optimizer_op`, retrieves the update and init ops
            of the AvgGradOptimizer, stores the losses and ops as outputs, and creates the
            'a2c_policy_baseline', 'a2c_update' and 'optimizer_init' queues.
        """
        from diplomacy_research.utils.tensorflow import tf, tfp, normalize, to_float
        from diplomacy_research.models.layers.avg_grad_optimizer import AvgGradOptimizer

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.model.hparams[hparam_name]

        # Bounds used to keep the draw probability strictly inside (0, 1) before taking its log
        draw_prob_epsilon = 1e-7

        # Training loop
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):
                # Placeholders
                stop_gradient_all = self.model.placeholders['stop_gradient_all']

                # Features
                decoder_lengths = self.model.features['decoder_lengths']            # tf.int32   - (b,)
                draw_action = self.model.features['draw_action']                    # tf.bool    - (b,)
                reward_target = self.model.features['reward_target']                # tf.float32 - (b,)
                value_target = self.model.features['value_target']                  # tf.float32 - (b,)
                # current_power = self.model.features['current_power']              # tf.int32   - (b,)

                # Making sure all RNN lengths are at least 1
                # Trimming to the maximum decoder length in the batch
                # The raw lengths are kept for the mask, so items with a length of 0 are fully masked
                raw_decoder_lengths = decoder_lengths
                decoder_lengths = tf.math.maximum(1, decoder_lengths)

                # Retrieving model outputs
                baseline = values = self.model.outputs['state_value']                               # (b,)
                logits = self.model.outputs['logits']                                               # (b, dec, VOCAB)
                sequence_mask = tf.sequence_mask(raw_decoder_lengths,                               # (b, dec)
                                                 maxlen=tf.reduce_max(decoder_lengths),
                                                 dtype=tf.float32)

                # Computing Baseline Mean Square Error Loss
                # Each squared error is capped at 'clip_value_threshold' before being summed over the batch
                with tf.variable_scope('baseline_scope'):
                    baseline_mse_loss = tf.minimum(tf.square(value_target - values), hps('clip_value_threshold'))
                    baseline_mse_loss = tf.reduce_sum(baseline_mse_loss)                                # ()

                # Calculating policy gradient loss
                with tf.variable_scope('policy_gradient_scope'):
                    log_prob_of_tokens = self.model.outputs['log_probs'] * sequence_mask                # (b, dec_len)

                    # Calculating loss and optimizer op
                    # No gradient flows to the baseline through the advantages
                    advantages = tf.stop_gradient(normalize(reward_target - baseline))                  # (b,)
                    policy_gradient_loss = -tf.reduce_sum(log_prob_of_tokens, axis=-1) * advantages     # (b,)
                    policy_gradient_loss = tf.reduce_mean(policy_gradient_loss)                         # ()

                # Calculating policy gradient for draw action
                with tf.variable_scope('draw_gradient_scope'):
                    draw_action = to_float(draw_action)                                                 # (b,)

                    # Clipping the probability, otherwise a probability of exactly 0. or 1. gives log(0) = -inf
                    # and the term masked by draw_action becomes 0 * -inf = NaN, which corrupts the total loss
                    draw_prob = tf.clip_by_value(self.model.outputs['draw_prob'],                       # (b,)
                                                 draw_prob_epsilon,
                                                 1. - draw_prob_epsilon)
                    log_prob_of_draw = draw_action * tf.log(draw_prob) + (1. - draw_action) * tf.log(1. - draw_prob)
                    draw_gradient_loss = -1. * log_prob_of_draw * advantages                            # (b,)
                    draw_gradient_loss = tf.reduce_mean(draw_gradient_loss)                             # ()

                # Calculating entropy loss
                with tf.variable_scope('entropy_scope'):
                    categorial_dist = tfp.distributions.Categorical(logits=logits)
                    entropy = categorial_dist.entropy()
                    entropy_loss = -tf.reduce_mean(entropy)                                             # ()

                # Scopes
                # 'ignored_scope' is a comma-separated string (or an empty value to ignore nothing)
                scope = ['policy', 'value', 'draw']
                global_ignored_scope = None if not hps('ignored_scope') else hps('ignored_scope').split(',')

                # Creating A2C Loss
                a2c_loss = policy_gradient_loss \
                           + hps('value_coeff') * baseline_mse_loss \
                           + hps('draw_coeff') * draw_gradient_loss \
                           + hps('entropy_coeff') * entropy_loss
                a2c_loss = tf.cond(stop_gradient_all,
                                   lambda: tf.stop_gradient(a2c_loss),          # pylint: disable=cell-var-from-loop
                                   lambda: a2c_loss)                            # pylint: disable=cell-var-from-loop
                cost_and_scope = [(a2c_loss, scope, None)]

                # Creating optimizer op
                a2c_op = self.model.create_optimizer_op(cost_and_scope=cost_and_scope,
                                                        ignored_scope=global_ignored_scope,
                                                        max_gradient_norm=None)     # AvgGradOptimizer will clip

                # Getting AvgGradOptimizer.update(version_step)
                assert isinstance(self.model.optimizer, AvgGradOptimizer), 'A2C requires gradient averaging'
                update_op = self.model.optimizer.update(self.version_step)
                init_op = self.model.optimizer.init()

        # Storing outputs
        self._add_output('rl_policy_loss', policy_gradient_loss)
        self._add_output('rl_value_loss', baseline_mse_loss)
        self._add_output('rl_draw_loss', draw_gradient_loss)
        self._add_output('rl_entropy_loss', entropy_loss)
        self._add_output('rl_total_loss', a2c_loss)
        self._add_output('optimizer_op', a2c_op)
        self._add_output('update_op', update_op)
        self._add_output('init_op', init_op)

        # --------------------------------------
        #               Hooks
        # --------------------------------------
        def hook_baseline_pre_condition(dataset):
            """ Pre-Condition: First queue to run (last_queue is either missing or empty) """
            return not hasattr(dataset, 'last_queue') or dataset.last_queue == ''

        def hook_baseline_post_queue(dataset):
            """ Post-Queue: Marks the baseline queue as processed """
            dataset.last_queue = 'a2c_policy_baseline'

        def hook_update_pre_condition(dataset):
            """ Pre-Condition: last_queue must be baseline """
            return hasattr(dataset, 'last_queue') and dataset.last_queue == 'a2c_policy_baseline'

        def hook_update_pre_queue(dataset):
            """ Pre-Queue: Restricts the queue to 1 dequeue maximum """
            dataset.nb_items_to_pull_from_queue = min(dataset.nb_items_to_pull_from_queue, 1)

        def hook_update_post_queue(dataset):
            """ Post-Queue: Marks the update as processed """
            dataset.last_queue = 'a2c_update'

        # --------------------------------------
        #               Queues
        # --------------------------------------
        # Queue running the optimizer op and the evaluation tags with the training decoder
        self.queue_dataset.create_queue('a2c_policy_baseline',
                                        placeholders={self.model.placeholders['decoder_type']: [TRAINING_DECODER]},
                                        outputs=[self.model.outputs[output_name]
                                                 for output_name in ['optimizer_op'] + self.get_evaluation_tags()],
                                        with_status=True,
                                        pre_condition=hook_baseline_pre_condition,
                                        post_queue=hook_baseline_post_queue)

        # Queue running the update op of the optimizer (only allowed right after the baseline queue)
        self.queue_dataset.create_queue('a2c_update',
                                        placeholders={self.model.placeholders['decoder_type']: [GREEDY_DECODER]},
                                        outputs=[self.model.outputs['update_op']],
                                        with_status=True,
                                        pre_condition=hook_update_pre_condition,
                                        pre_queue=hook_update_pre_queue,
                                        post_queue=hook_update_post_queue)

        # Queue running the init op of the optimizer
        self.queue_dataset.create_queue('optimizer_init',
                                        placeholders={self.model.placeholders['decoder_type']: [GREEDY_DECODER]},
                                        outputs=[self.model.outputs['init_op']],
                                        with_status=True)