Example #1
    def test_weighted_random_sample(self):

        probs = np.array([
            [1, 2, 4],
            [2, 4, 4],
            [0, 1, 2]
        ])

        p = weighted_random_sample(probs)
        n_sample = 5000
        with self.test_session() as sess:
            X = np.array([sess.run(p) for _ in range(n_sample)])

        # normalizing rows
        probs_norm = probs / probs.sum(axis=1, keepdims=True)
        expected_items_by_row = [
            [0, 1, 2],
            [0, 1, 2],
            [1, 2]
        ]
        for i in range(3):
            idx, counts = np.unique(X[:, i], return_counts=True)
            self.assertAllEqual(idx, expected_items_by_row[i])
            self.assertArrayNear(counts / n_sample, probs_norm[i][probs_norm[i] > 0], err=0.02)
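
The weighted_random_sample helper under test is not shown on this page. A minimal TF 1.x sketch of what it might look like, assuming it draws one index per row from unnormalized weights via tf.multinomial (a zero weight becomes a -inf logit and is never drawn, which matches the third row of the test):

import tensorflow as tf

def weighted_random_sample(weights):
    """Sketch only: draw one index per row, proportionally to the unnormalized row weights."""
    logits = tf.log(tf.to_float(weights))            # log turns weights into equivalent logits; log(0) = -inf
    samples = tf.multinomial(logits, num_samples=1)  # [batch, 1] int64 indices
    return tf.squeeze(samples, axis=1)               # [batch]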
Example #2
    def build_model(self):
        self.placeholders = _get_placeholders(self.spatial_dim)

        with tf.variable_scope("theta"):
            theta = self.policy(self, trainable=True).build()

        selected_spatial_action_flat = ravel_index_pairs(
            self.placeholders.selected_spatial_action, self.spatial_dim
        )

        selected_log_probs = self._get_select_action_probs(theta, selected_spatial_action_flat)

        # take the maximum to avoid 0 / 0, since this sum is used as a denominator for several means
        sum_spatial_action_available = tf.maximum(
            1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
        )

        neg_entropy_spatial = tf.reduce_sum(
            theta.spatial_action_probs * theta.spatial_action_log_probs
        ) / sum_spatial_action_available
        neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
            theta.action_id_probs * theta.action_id_log_probs, axis=1
        ))

        if self.mode == ACMode.PPO:
            # could also use stop_gradient instead of relying on the trainable=False flag
            with tf.variable_scope("theta_old"):
                theta_old = self.policy(self, trainable=False).build()

            new_theta_var = tf.global_variables("theta/")
            old_theta_var = tf.global_variables("theta_old/")

            assert len(tf.trainable_variables("theta/")) == len(new_theta_var)
            assert not tf.trainable_variables("theta_old/")
            assert len(old_theta_var) == len(new_theta_var)

            self.update_theta_op = [
                tf.assign(t_old, t_new) for t_new, t_old in zip(new_theta_var, old_theta_var)
            ]

            selected_log_probs_old = self._get_select_action_probs(
                theta_old, selected_spatial_action_flat
            )
            ratio = tf.exp(selected_log_probs.total - selected_log_probs_old.total)
            clipped_ratio = tf.clip_by_value(
                ratio, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon
            )
            l_clip = tf.minimum(
                ratio * self.placeholders.advantage,
                clipped_ratio * self.placeholders.advantage
            )
            self.sampled_action_id = weighted_random_sample(theta_old.action_id_probs)
            self.sampled_spatial_action = weighted_random_sample(theta_old.spatial_action_probs)
            self.value_estimate = theta_old.value_estimate
            self._scalar_summary("action/ratio", tf.reduce_mean(clipped_ratio))
            self._scalar_summary("action/ratio_is_clipped",
                tf.reduce_mean(tf.to_float(tf.equal(ratio, clipped_ratio))))
            policy_loss = -tf.reduce_mean(l_clip)
        else:
            self.sampled_action_id = weighted_random_sample(theta.action_id_probs)
            self.sampled_spatial_action = weighted_random_sample(theta.spatial_action_probs)
            self.value_estimate = theta.value_estimate
            policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)

        value_loss = tf.losses.mean_squared_error(
            self.placeholders.value_target, theta.value_estimate)

        loss = (
            policy_loss
            + value_loss * self.loss_value_weight
            + neg_entropy_spatial * self.entropy_weight_spatial
            + neg_entropy_action_id * self.entropy_weight_action_id
        )

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_op"
        )

        self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
        self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
        self._scalar_summary("action/is_spatial_action_available",
            tf.reduce_mean(self.placeholders.is_spatial_action_available))
        self._scalar_summary("action/selected_id_log_prob",
            tf.reduce_mean(selected_log_probs.action_id))
        self._scalar_summary("loss/policy", policy_loss)
        self._scalar_summary("loss/value", value_loss)
        self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
        self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
        self._scalar_summary("loss/total", loss)
        self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
        self._scalar_summary("action/selected_total_log_prob",
            tf.reduce_mean(selected_log_probs.total))
        self._scalar_summary("action/selected_spatial_log_prob",
            tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)

        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
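
The theta -> theta_old copy used in the PPO branch above is a standard pattern; here is a self-contained sketch of the same mechanism on toy variables (all names below are illustrative, not from the project):

import tensorflow as tf

with tf.variable_scope("theta"):
    w_new = tf.get_variable("w", initializer=tf.ones([2, 2]))
with tf.variable_scope("theta_old"):
    w_old = tf.get_variable("w", initializer=tf.zeros([2, 2]), trainable=False)

# tf.global_variables(scope) filters by a regex on the variable name,
# so "theta/" does not accidentally match "theta_old/".
new_vars = tf.global_variables("theta/")
old_vars = tf.global_variables("theta_old/")
update_theta_op = [tf.assign(t_old, t_new) for t_new, t_old in zip(new_vars, old_vars)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_theta_op)  # theta_old now mirrors theta
    print(sess.run(w_old))     # [[1. 1.], [1. 1.]]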
Example #3
    def build_model(self):
        self.placeholders = _get_placeholders(self.spatial_dim)
        # Within the "theta" scope you build the policy net. Check the graph in TensorBoard and expand theta to see the nets.
        with tf.variable_scope("theta"):
            self.theta = self.policy(self, trainable=True).build()  # build the net from policy.py;
            # theta is the policy itself and holds the action-id and spatial-action distributions

        # selected_spatial_action_flat = ravel_index_pairs(
        #     self.placeholders.selected_spatial_action, self.spatial_dim
        # )

        selected_log_probs = self._get_select_action_probs(self.theta)

        # take the maximum to avoid 0 / 0, since this sum is used as a denominator for several means
        # sum_spatial_action_available = tf.maximum(
        #     1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
        # )

        # neg_entropy_spatial = tf.reduce_sum(
        #     self.theta.spatial_action_probs * self.theta.spatial_action_log_probs
        # ) / sum_spatial_action_available
        neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(self.theta.action_id_probs * self.theta.action_id_log_probs, axis=1))
        # neg_entropy_action_id = tf.reduce_sum(self.theta.action_id_probs * self.theta.action_id_log_probs, axis=1)
        # Sample actions from the corresponding distributions defined by the policy network theta
        if self.mode == ACMode.PPO:
            # could also use stop_gradient instead of relying on the trainable=False flag
            with tf.variable_scope("theta_old"):
                theta_old = self.policy(self, trainable=False).build()

            new_theta_var = tf.global_variables("theta/")
            old_theta_var = tf.global_variables("theta_old/")

            assert len(tf.trainable_variables("theta/")) == len(new_theta_var)
            assert not tf.trainable_variables("theta_old/")
            assert len(old_theta_var) == len(new_theta_var)

            self.update_theta_op = [
                tf.assign(t_old, t_new) for t_new, t_old in zip(new_theta_var, old_theta_var)
            ]

            selected_log_probs_old = self._get_select_action_probs(theta_old)
            ratio = tf.exp(selected_log_probs.total - selected_log_probs_old.total)
            clipped_ratio = tf.clip_by_value(
                ratio, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon
            )
            l_clip = tf.minimum(
                ratio * self.placeholders.advantage,
                clipped_ratio * self.placeholders.advantage
            )
            self.sampled_action_id = weighted_random_sample(theta_old.action_id_probs)
            #self.sampled_spatial_action = weighted_random_sample(theta_old.spatial_action_probs)
            self.value_estimate = theta_old.value_estimate
            self._scalar_summary("action/ratio", tf.reduce_mean(clipped_ratio))
            self._scalar_summary("action/ratio_is_clipped",
                tf.reduce_mean(tf.to_float(tf.equal(ratio, clipped_ratio))))
            policy_loss = -tf.reduce_mean(l_clip)
        else:
            self.sampled_action_id = weighted_random_sample(self.theta.action_id_probs)
            # self.sampled_spatial_action = weighted_random_sample(self.theta.spatial_action_probs)
            self.value_estimate = self.theta.value_estimate
            policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)
            #policy_loss = -tf.reduce_sum(selected_log_probs.total * self.placeholders.advantage)

        value_loss = tf.losses.mean_squared_error(self.placeholders.value_target, self.theta.value_estimate) # Target comes from runner/run_batch when you specify the full input
        # value_loss = tf.reduce_sum(tf.square(tf.reshape(self.placeholders.value_target,[-1]) - tf.reshape(self.value_estimate, [-1])))

        loss = (
            policy_loss
            + value_loss * self.loss_value_weight
            + neg_entropy_action_id * self.entropy_weight_action_id
        )

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,  # clips the gradient norm at self.max_gradient_norm
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_op"
        )

        self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
        self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
        # self._scalar_summary("action/is_spatial_action_available",
        #     tf.reduce_mean(self.placeholders.is_spatial_action_available))
        self._scalar_summary("action/selected_id_log_prob",
            tf.reduce_mean(selected_log_probs.action_id))
        self._scalar_summary("loss/policy", policy_loss)
        self._scalar_summary("loss/value", value_loss)
        #self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
        self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
        self._scalar_summary("loss/total", loss)
        self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
        self._scalar_summary("action/selected_total_log_prob",
            tf.reduce_mean(selected_log_probs.total))
        # self._scalar_summary("action/selected_spatial_log_prob",
        #     tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)

        #tf.summary.image('convs output', tf.reshape(self.theta.map_output,[-1,25,25,64]))

        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
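
_get_select_action_probs is defined elsewhere in the project. Based on how selected_log_probs.total and .action_id are used above (and on the expanded form in Example #5 below), a plausible non-spatial sketch is the following; the placeholder name selected_action_id and treating this as a method on the same class are assumptions:

import collections
import tensorflow as tf

SelectedLogProbs = collections.namedtuple("SelectedLogProbs", ["action_id", "total"])

def _get_select_action_probs(self, pi):
    """Sketch: log-probability of the action id that was actually taken, per batch row."""
    rows = tf.range(tf.shape(pi.action_id_log_probs)[0])
    idx = tf.stack([rows, tf.to_int32(self.placeholders.selected_action_id)], axis=1)
    action_id = tf.gather_nd(pi.action_id_log_probs, idx)
    # With no spatial head in this variant, the total log-prob is just the action-id term.
    return SelectedLogProbs(action_id=action_id, total=action_id)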
Example #4
    def build_model(self):
        self.placeholders = _get_placeholders(self.spatial_dim)
        with tf.variable_scope("theta"):
            units_embedded = layers.embed_sequence(
                self.placeholders.screen_unit_type,
                vocab_size=SCREEN_FEATURES.unit_type.scale,
                embed_dim=self.unit_type_emb_dim,
                scope="unit_type_emb",
                trainable=self.trainable
            )

            # Let's not one-hot zero which is background
            player_relative_screen_one_hot = layers.one_hot_encoding(
                self.placeholders.player_relative_screen,
                num_classes=SCREEN_FEATURES.player_relative.scale
            )[:, :, :, 1:]
            player_relative_minimap_one_hot = layers.one_hot_encoding(
                self.placeholders.player_relative_minimap,
                num_classes=MINIMAP_FEATURES.player_relative.scale
            )[:, :, :, 1:]

            channel_axis = 3
            screen_numeric_all = tf.concat(
                [self.placeholders.screen_numeric, units_embedded, player_relative_screen_one_hot],
                axis=channel_axis
            )
            minimap_numeric_all = tf.concat(
                [self.placeholders.minimap_numeric, player_relative_minimap_one_hot],
                axis=channel_axis
            )

            # BUILD CONVNNs
            screen_output = self._build_convs(screen_numeric_all, "screen_network")
            minimap_output = self._build_convs(minimap_numeric_all, "minimap_network")


            # State representation (last layer before separation as described in the paper)
            self.map_output = tf.concat([screen_output, minimap_output], axis=channel_axis)

            # BUILD CONVLSTM
            self.rnn_in = tf.reshape(self.map_output, [1, -1, 32, 32, 64])
            self.cell = tf.contrib.rnn.Conv2DLSTMCell(input_shape=[32, 32, 1], # input dims
                                                 kernel_shape=[3, 3],  # for a 3 by 3 conv
                                                 output_channels=64)  # number of feature maps
            c_init = np.zeros((1, 32, 32, 64), np.float32)
            h_init = np.zeros((1, 32, 32, 64), np.float32)
            self.state_init = [c_init, h_init]
            step_size = tf.shape(self.map_output)[:1] # Get step_size from input dimensions
            c_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
            h_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
            self.state_in = (c_in, h_in)
            state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
            self.step_size = tf.placeholder(tf.float32, [1])
            (self.outputs, self.state) = tf.nn.dynamic_rnn(self.cell, self.rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False,
                                                          dtype=tf.float32)
            lstm_c, lstm_h = self.state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(self.outputs, [-1, 32, 32, 64])
            
            # 1x1 conv layer to generate our spatial policy
            self.spatial_action_logits = layers.conv2d(
                rnn_out,
                data_format="NHWC",
                num_outputs=1,
                kernel_size=1,
                stride=1,
                activation_fn=None,
                scope='spatial_action',
                trainable=self.trainable
            )

            spatial_action_probs = tf.nn.softmax(layers.flatten(self.spatial_action_logits))


            map_output_flat = tf.reshape(self.outputs, [-1, 65536])  # (32*32*64)
            # fully connected layer for Value predictions and action_id
            self.fc1 = layers.fully_connected(
                map_output_flat,
                num_outputs=256,
                activation_fn=tf.nn.relu,
                scope="fc1",
                trainable=self.trainable
            )
            # fc/action_id
            action_id_probs = layers.fully_connected(
                self.fc1,
                num_outputs=len(actions.FUNCTIONS),
                activation_fn=tf.nn.softmax,
                scope="action_id",
                trainable=self.trainable
            )
            # fc/value
            self.value_estimate = tf.squeeze(layers.fully_connected(
                self.fc1,
                num_outputs=1,
                activation_fn=None,
                scope='value',
                trainable=self.trainable
            ), axis=1)

            # disregard non-allowed actions by setting their prob to zero and re-normalizing to 1 (this is the action mask)
            action_id_probs *= self.placeholders.available_action_ids
            action_id_probs /= tf.reduce_sum(action_id_probs, axis=1, keepdims=True)

            def logclip(x):
                return tf.log(tf.clip_by_value(x, 1e-12, 1.0))

            spatial_action_log_probs = (
                    logclip(spatial_action_probs)
                    * tf.expand_dims(self.placeholders.is_spatial_action_available, axis=1)
            )
            # non-available actions get a log(1e-12) value, but that's ok because it's never used
            action_id_log_probs = logclip(action_id_probs)

            self.action_id_probs = action_id_probs
            self.spatial_action_probs = spatial_action_probs
            self.action_id_log_probs = action_id_log_probs
            self.spatial_action_log_probs = spatial_action_log_probs

        selected_spatial_action_flat = ravel_index_pairs(
            self.placeholders.selected_spatial_action, self.spatial_dim
        )

        selected_log_probs = self._get_select_action_probs(selected_spatial_action_flat)

        # take the maximum to avoid 0 / 0, since this sum is used as a denominator for several means
        sum_spatial_action_available = tf.maximum(
            1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
        )

        neg_entropy_spatial = tf.reduce_sum(
            self.spatial_action_probs * self.spatial_action_log_probs
        ) / sum_spatial_action_available
        neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
            self.action_id_probs * self.action_id_log_probs, axis=1
        ))
        
        # Now sample actions from the corresponding distributions defined by the policy network theta
        self.sampled_action_id = weighted_random_sample(self.action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(self.spatial_action_probs)
        
        policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)

        value_loss = tf.losses.mean_squared_error(
            self.placeholders.value_target, self.value_estimate)

        loss = (
            policy_loss
            + value_loss * self.loss_value_weight
            + neg_entropy_spatial * self.entropy_weight_spatial
            + neg_entropy_action_id * self.entropy_weight_action_id
        )

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_op"
        )

        self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
        self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
        self._scalar_summary("action/is_spatial_action_available",
                             tf.reduce_mean(self.placeholders.is_spatial_action_available))
        self._scalar_summary("action/selected_id_log_prob",
                             tf.reduce_mean(selected_log_probs.action_id))
        self._scalar_summary("loss/policy", policy_loss)
        self._scalar_summary("loss/value", value_loss)
        self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
        self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
        self._scalar_summary("loss/total", loss)
        self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
        self._scalar_summary("action/selected_total_log_prob",
                             tf.reduce_mean(selected_log_probs.total))
        self._scalar_summary("action/selected_spatial_log_prob",
                             tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)

        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
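
ravel_index_pairs is another helper that is not shown here. Assuming selected_spatial_action holds (row, col) integer pairs of shape [batch, 2] and the spatial softmax is flattened row-major, it plausibly reduces to flat_index = row * spatial_dim + col:

import tensorflow as tf

def ravel_index_pairs(index_pairs, dim):
    """Sketch: convert [batch, 2] (row, col) pairs into flat indices of a row-major dim x dim grid."""
    multiplier = tf.constant([dim, 1], dtype=index_pairs.dtype)
    return tf.reduce_sum(index_pairs * multiplier, axis=1)  # row * dim + col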
Example #5
    def build_model(self):
        self._define_input_placeholders()

        spatial_action_probs, action_id_probs, value_estimate = \
            self._build_fullyconv_network()

        selected_spatial_action_flat = ravel_index_pairs(
            self.ph_selected_spatial_action, self.spatial_dim
        )

        def logclip(x):
            return tf.log(tf.clip_by_value(x, 1e-12, 1.0))

        spatial_action_log_probs = (
            logclip(spatial_action_probs)
            * tf.expand_dims(self.ph_is_spatial_action_available, axis=1)
        )

        # non-available actions get a log(1e-12) value, but that's ok because it's never used
        action_id_log_probs = logclip(action_id_probs)

        selected_spatial_action_log_prob = select_from_each_row(
            spatial_action_log_probs, selected_spatial_action_flat
        )
        selected_action_id_log_prob = select_from_each_row(
            action_id_log_probs, self.ph_selected_action_id
        )
        selected_action_total_log_prob = (
            selected_spatial_action_log_prob
            + selected_action_id_log_prob
        )

        # take the maximum to avoid 0 / 0, since this sum is used as a denominator for several means
        sum_spatial_action_available = tf.maximum(
            1e-10, tf.reduce_sum(self.ph_is_spatial_action_available)
        )
        neg_entropy_spatial = tf.reduce_sum(
            spatial_action_probs * spatial_action_log_probs
        ) / sum_spatial_action_available
        neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
            action_id_probs * action_id_log_probs, axis=1
        ))

        advantage = tf.stop_gradient(self.ph_value_target - value_estimate)
        policy_loss = -tf.reduce_mean(selected_action_total_log_prob * advantage)
        value_loss = tf.losses.mean_squared_error(self.ph_value_target, value_estimate)

        loss = (
            policy_loss
            + value_loss * self.loss_value_weight
            + neg_entropy_spatial * self.entropy_weight_spatial
            + neg_entropy_action_id * self.entropy_weight_action_id
        )

        scalar_summary_collection_name = "scalar_summaries"
        s_collections = [scalar_summary_collection_name, tf.GraphKeys.SUMMARIES]
        tf.summary.scalar("loss/policy", policy_loss, collections=s_collections)
        tf.summary.scalar("loss/value", value_loss, s_collections)
        tf.summary.scalar("loss/neg_entropy_spatial", neg_entropy_spatial, s_collections)
        tf.summary.scalar("loss/neg_entropy_action_id", neg_entropy_action_id, s_collections)
        tf.summary.scalar("loss/total", loss, s_collections)
        tf.summary.scalar("value/advantage", tf.reduce_mean(advantage), s_collections)
        tf.summary.scalar("value/estimate", tf.reduce_mean(value_estimate), s_collections)
        tf.summary.scalar("value/target", tf.reduce_mean(self.ph_value_target), s_collections)
        tf.summary.scalar("action/is_spatial_action_available",
            tf.reduce_mean(self.ph_is_spatial_action_available), s_collections)
        tf.summary.scalar("action/is_spatial_action_available",
            tf.reduce_mean(self.ph_is_spatial_action_available), s_collections)
        tf.summary.scalar("action/selected_id_log_prob",
            tf.reduce_mean(selected_action_id_log_prob))
        tf.summary.scalar("action/selected_total_log_prob",
            tf.reduce_mean(selected_action_total_log_prob))
        tf.summary.scalar("action/selected_spatial_log_prob",
            tf.reduce_sum(selected_spatial_action_log_prob) / sum_spatial_action_available
        )

        self.sampled_action_id = weighted_random_sample(action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(spatial_action_probs)
        self.value_estimate = value_estimate

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=framework.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_op"
        )

        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(tf.get_collection(scalar_summary_collection_name))
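
select_from_each_row is also external to these snippets. Judging from how it is called above (a [batch, n] matrix plus one column index per row), a common TF 1.x way to write it is with tf.gather_nd; this is a sketch of that contract, not necessarily the project's implementation:

import tensorflow as tf

def select_from_each_row(matrix, col_indices):
    """Sketch: for each row i of `matrix`, return matrix[i, col_indices[i]]."""
    rows = tf.range(tf.shape(matrix)[0])
    indices = tf.stack([rows, tf.to_int32(col_indices)], axis=1)  # [batch, 2] (row, col) pairs
    return tf.gather_nd(matrix, indices)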
Example #6
    def build_model(self):
        """build_model

        Function that actually builds the model, initialising
        variables and setting up the policy.
        After this, it sets up the loss value, defines a training
        step and sets up logging for all needed values.
        """

        # Initialise the placeholders property with some default values.
        self.placeholders = get_default_values(self.spatial_dim)

        # Provides checks to ensure that variable isn't shared by accident,
        # and starts up the fully convolutional policy.
        with tf.variable_scope("theta"):
            theta = self.policy(self,
                                trainable=True,
                                spatial_dim=self.spatial_dim).build()

        # Get the actions and the probabilities of those actions.
        selected_spatial_action = ravel_index_pairs(
            self.placeholders.selected_spatial_action, self.spatial_dim)

        selected_log_probabilities = self.get_selected_action_probability(
            theta, selected_spatial_action)

        # Take the maximum here to avoid a divide by 0 error next.
        sum_of_available_spatial = tf.maximum(
            1e-10,
            tf.reduce_sum(self.placeholders.is_spatial_action_available))

        # Generate the negative entropy terms, used later as part of the loss
        # function, which the training step then minimises.
        negative_spatial_entropy = tf.reduce_sum(
            theta.spatial_action_probs * theta.spatial_action_log_probs)

        negative_spatial_entropy /= sum_of_available_spatial

        negative_entropy_for_action_id = tf.reduce_mean(
            tf.reduce_sum(theta.action_id_probs * theta.action_id_log_probs,
                          axis=1))

        # Get the values for the possible actions.
        self.sampled_action_id = weighted_random_sample(theta.action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(
            theta.spatial_action_probs)

        self.value_estimate = theta.value_estimate

        # Calculate the policy and value loss, such that the final loss
        # can be calculated and optimised against.
        policy_loss = -tf.reduce_mean(
            selected_log_probabilities.total * self.placeholders.advantage)

        value_loss = tf.losses.mean_squared_error(
            self.placeholders.value_target, theta.value_estimate)

        total_loss = (
            policy_loss + value_loss * self.loss_value_weight +
            negative_spatial_entropy * self.entropy_weight_spatial +
            negative_entropy_for_action_id * self.entropy_weight_action_id)

        # Define a training step to be optimising the loss to be the lowest.
        self.train_operation = layers.optimize_loss(
            loss=total_loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_operation")

        # Finally, log some information about the model in its current state.
        self.get_scalar_summary("Value - Estimate:",
                                tf.reduce_mean(self.value_estimate))

        self.get_scalar_summary("Value - Target:",
                                tf.reduce_mean(self.placeholders.value_target))

        self.get_scalar_summary(
            "Action - Is Spatial Action Available",
            tf.reduce_mean(self.placeholders.is_spatial_action_available))

        self.get_scalar_summary(
            "Action - Selected Action ID Log Probability",
            tf.reduce_mean(selected_log_probabilities.action_id))

        self.get_scalar_summary("Loss - Policy Loss", policy_loss)
        self.get_scalar_summary("Loss - Value Loss", value_loss)
        self.get_scalar_summary("Loss - Negative Spatial Entropy",
                                negative_spatial_entropy)
        self.get_scalar_summary("Loss - Negative Entropy for Action ID",
                                negative_entropy_for_action_id)

        self.get_scalar_summary("Loss - Total", total_loss)
        self.get_scalar_summary("Value - Advantage",
                                tf.reduce_mean(self.placeholders.advantage))

        self.get_scalar_summary(
            "Action - Selected Total Log Probability",
            tf.reduce_mean(selected_log_probabilities.total))

        self.get_scalar_summary(
            "Action - Selected Spatial Action Log Probability",
            tf.reduce_sum(selected_log_probabilities.spatial) /
            sum_of_available_spatial)

        # Clean up and save.
        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(
            tf.get_collection(self._scalar_summary_key))
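
The get_scalar_summary / _scalar_summary helper is not part of these snippets either. Given that the code merges tf.get_collection(self._scalar_summary_key) at the end, it plausibly just registers each scalar in both the default SUMMARIES collection and that custom key; a method sketch under that assumption:

import tensorflow as tf

def _scalar_summary(self, name, tensor):
    """Sketch: register `tensor` under `name` in the default summary collection
    and in the custom collection that scalar_summary_op later merges."""
    tf.summary.scalar(
        name, tensor,
        collections=[tf.GraphKeys.SUMMARIES, self._scalar_summary_key]
    )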
Example #7
    def build_model(self):
        self.placeholders = _get_placeholders(self.spatial_dim, self.nsteps,
                                              self.num_envs, self.policy_type,
                                              self.obs_dims)
        with tf.variable_scope("theta"):
            self.theta = self.policy(self, trainable=True).build(
            )  # build the net from policy.py; theta is the policy and holds the action-id distributions and value estimates

        selected_log_probs = self._get_select_action_probs(self.theta)

        if self.mode == ACMode.PPO:
            # could also use stop_gradient instead of relying on the trainable=False flag
            with tf.variable_scope("theta_old"):
                theta_old = self.policy(self, trainable=False).build(
                )  # theta old is used as a constant here

            new_theta_var = tf.global_variables("theta/")
            old_theta_var = tf.global_variables("theta_old/")

            assert len(tf.trainable_variables("theta/")) == len(new_theta_var)
            assert not tf.trainable_variables("theta_old/")  # Has to be empty
            assert len(old_theta_var) == len(new_theta_var)

            self.update_theta_op = [
                tf.assign(t_old, t_new)
                for t_new, t_old in zip(new_theta_var, old_theta_var)
            ]

            selected_log_probs_old = self._get_select_action_probs(theta_old)
            ratio = tf.exp(selected_log_probs.total -
                           selected_log_probs_old.total)
            clipped_ratio = tf.clip_by_value(ratio, 1.0 - self.clip_epsilon,
                                             1.0 + self.clip_epsilon)
            l_clip = tf.minimum(ratio * self.placeholders.advantage,
                                clipped_ratio * self.placeholders.advantage)
            self.sampled_action_id = weighted_random_sample(
                theta_old.action_id_probs)
            #self.sampled_spatial_action = weighted_random_sample(theta_old.spatial_action_probs)
            self.value_estimate = theta_old.value_estimate
            self._scalar_summary("action/ratio", tf.reduce_mean(clipped_ratio))
            self._scalar_summary(
                "action/ratio_is_clipped",
                tf.reduce_mean(tf.to_float(tf.equal(ratio, clipped_ratio))))
            self.policy_loss = -tf.reduce_mean(l_clip)
        else:
            self.sampled_action_id = weighted_random_sample(
                self.theta.action_id_probs)
            if self.policy_type == 'FactoredPolicy' or self.policy_type == 'FactoredPolicy_PhaseI' or self.policy_type == 'FactoredPolicy_PhaseII':
                self.value_estimate_goal = self.theta.value_estimate_goal
                self.value_estimate_fire = self.theta.value_estimate_fire
                self.value_estimate = self.theta.value_estimate
            else:
                self.value_estimate = self.theta.value_estimate

        if self.policy_type == 'MetaPolicy':
            # Reshape actions, advantages and values; use the mask to compute correct means.
            batch_size = tf.shape(
                self.placeholders.rgb_screen)[0]  # or maybe use -1
            max_steps = tf.shape(self.placeholders.rgb_screen)[1]
            mask = self.theta.mask  # check dims
            self.mask = mask  # for debug

            # Actions (already masked)
            self.action_id_probs = tf.reshape(
                self.theta.action_id_probs,
                [batch_size, max_steps, self.num_actions])
            self.action_id_log_probs = tf.reshape(
                self.theta.action_id_log_probs,
                [batch_size, max_steps, self.num_actions])

            # ------------------------------------------------------
            # Entropy term (sum of p * log p, i.e. negative entropy)
            # ------------------------------------------------------
            entropy_i = tf.multiply(
                self.action_id_probs,
                self.action_id_log_probs)  # [batch,max_steps,num_actions]
            # cross_entropy = -tf.reduce_sum(entropy_i, 2) # result: [batch,max_steps], axis=2 means sum wrt actions
            cross_entropy = tf.reduce_sum(entropy_i, 2)
            # mask = tf.sign(tf.reduce_max(tf.abs(entropy_i), 2)) # # [batch,max_steps] with zeros and ones
            cross_entropy *= mask
            # Average over actual sequence lengths.
            cross_entropy = tf.reduce_sum(
                cross_entropy, 1
            )  # sum all policy values per timestep for each sequence. result: batch x 1
            cross_entropy /= tf.reduce_sum(
                mask, 1
            )  # summing the mask over axis=1 (maxsteps) gives the true length of each sequence in the batch
            self.neg_entropy_action_id = tf.reduce_mean(cross_entropy)
            # self.neg_entropy_action_id = tf.reduce_sum(cross_entropy_m) / tf.reduce_sum(tf.reduce_sum(mask, 1))
            # --------------
            #   Policy
            # --------------
            # Start with policy per timestep i per sequence. Result will be [batch * maxsteps]
            policy_i = selected_log_probs.total * self.placeholders.advantage  # selected log probs for masked timesteps are already zero; the mask (applied inside policy.py) also covers padded steps, so action id 0 (a valid action) is masked out when it falls outside the episode
            # Reshape now to calculate correct means
            policy = tf.reshape(
                policy_i,
                [batch_size, max_steps])  # result: [batch x maxsteps]
            policy = tf.reduce_sum(
                policy, 1
            )  # sum all policy values per timestep for each sequence. result: batch x 1
            policy /= tf.reduce_sum(mask, 1)
            self.policy_loss = -tf.reduce_mean(policy)
            # self.policy_loss = tf.reduce_sum(policy_i) / tf.reduce_sum(tf.reduce_sum(mask, 1))

            # --------------
            #    Value
            # --------------
            vloss_i = tf.squared_difference(self.placeholders.value_target,
                                            self.theta.value_estimate)
            mse = tf.reshape(
                vloss_i, [batch_size, max_steps])  # result: [batch x maxsteps]
            mse = tf.reduce_sum(
                mse, 1
            )  # sum all value losses per timestep for each sequence. result: batch x 1
            mse /= tf.reduce_sum(
                mask, 1
            )  # Denominator is the number of timesteps per sequence [batch x 1] vector
            self.value_loss = tf.reduce_mean(
                mse
            )  # the mean of the per-sequence mean losses (so the denominator is the number of sequences in the batch)
            # self.value_loss = tf.reduce_sum(vloss_i)/tf.reduce_sum(tf.reduce_sum(mask, 1))# alternative: instead of the mean of the mean per sequence we take the mean of all samples

        elif self.policy_type == 'FactoredPolicy' or self.policy_type == 'FactoredPolicy_PhaseI' or self.policy_type == 'FactoredPolicy_PhaseII':
            self.neg_entropy_action_id = tf.reduce_mean(
                tf.reduce_sum(self.theta.action_id_probs *
                              self.theta.action_id_log_probs,
                              axis=1))
            self.value_loss_goal = tf.losses.mean_squared_error(
                self.placeholders.value_target_goal,
                self.theta.value_estimate_goal
            )  # value_target comes from runner/run_batch when you specify the full input
            self.value_loss_fire = tf.losses.mean_squared_error(
                self.placeholders.value_target_fire,
                self.theta.value_estimate_fire)
            self.value_loss = tf.losses.mean_squared_error(
                self.placeholders.value_target, self.theta.value_estimate)
            self.policy_loss = -tf.reduce_mean(
                selected_log_probs.total * self.placeholders.advantage)
        else:
            self.neg_entropy_action_id = tf.reduce_mean(
                tf.reduce_sum(self.theta.action_id_probs *
                              self.theta.action_id_log_probs,
                              axis=1))
            self.value_loss = tf.losses.mean_squared_error(
                self.placeholders.value_target, self.theta.value_estimate
            )  # value_target comes from runner/run_batch when you specify the full input
            self.policy_loss = -tf.reduce_mean(
                selected_log_probs.total * self.placeholders.advantage)
        """ Loss function choices """
        if self.policy_type == 'FactoredPolicy':
            loss = (self.policy_loss +
                    (self.value_loss_goal + self.value_loss_fire +
                     self.value_loss) * self.loss_value_weight +
                    self.neg_entropy_action_id * self.entropy_weight_action_id)
        elif self.policy_type == 'FactoredPolicy_PhaseI':
            loss = (
                self.policy_loss + self.value_loss * self.loss_value_weight +
                self.neg_entropy_action_id * self.entropy_weight_action_id)
            # + (self.value_loss_fire + self.value_loss_goal) * 0.0  # with a 0.0 weight performance was good -- might it matter more now?
            # Try removing these terms from the loss entirely, since automatic differentiation might still produce gradients for them.
        elif self.policy_type == 'FactoredPolicy_PhaseII':
            loss = (self.value_loss_fire + self.value_loss_goal
                    )  #* self.loss_value_weight # Not sure if this is needed
        else:
            loss = (self.policy_loss +
                    self.value_loss * self.loss_value_weight +
                    self.neg_entropy_action_id * self.entropy_weight_action_id)

        if self.policy_type == 'FactoredPolicy_PhaseI' or self.policy_type == 'FactoredPolicy_PhaseII':
            # list of the head variables
            head_train_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "theta/heads")
            # exclude the head variables from the full trainable set
            head_var_names = {v.name for v in head_train_vars}
            tvars = [tv for tv in tf.trainable_variables()
                     if tv.name not in head_var_names]
            # Phase I trains only the shared body; Phase II trains only the heads.
            if self.policy_type == 'FactoredPolicy_PhaseI':
                train_vars = tvars
            elif self.policy_type == 'FactoredPolicy_PhaseII':
                train_vars = head_train_vars
        else:
            train_vars = None  # default: optimise all trainable variables
            tvars = None  # so FullyConv and the other policies won't have a problem with the extra saver

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,  # clips the gradient norm at self.max_gradient_norm
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            variables=train_vars,
            name="train_op")

        if self.policy_type == 'FactoredPolicy' or self.policy_type == 'FactoredPolicy_PhaseI' or self.policy_type == 'FactoredPolicy_PhaseII':
            self._scalar_summary(
                "value_goal/estimate", tf.reduce_mean(self.value_estimate_goal)
            )  # not strictly correct: the mean is over all samples, but masks are used
            self._scalar_summary("value_goal/target",
                                 tf.reduce_mean(
                                     self.placeholders.value_target_goal)
                                 )  # not strictly correct: the mean is over all samples
            self._scalar_summary(
                "value_fire/estimate", tf.reduce_mean(self.value_estimate_fire)
            )  # not strictly correct: the mean is over all samples, but masks are used
            self._scalar_summary(
                "value_fire/target",
                tf.reduce_mean(self.placeholders.value_target_fire))
            self._scalar_summary("loss/value_fire", self.value_loss_fire)
            self._scalar_summary("loss/value_goal", self.value_loss_goal)
            self._scalar_summary("value/estimate",
                                 tf.reduce_mean(self.value_estimate))
            self._scalar_summary("loss/value", self.value_loss)
        else:
            self._scalar_summary(
                "value/estimate", tf.reduce_mean(self.value_estimate)
            )  # not strictly correct: the mean is over all samples, but masks are used
            self._scalar_summary(
                "value/target", tf.reduce_mean(self.placeholders.value_target))
            self._scalar_summary("loss/value", self.value_loss)
        # self._scalar_summary("action/is_spatial_action_available",
        #     tf.reduce_mean(self.placeholders.is_spatial_action_available))
        # self._scalar_summary("action/selected_id_log_prob",
        #     tf.reduce_mean(selected_log_probs.action_id)) # You need the corrected one
        self._scalar_summary("loss/policy", self.policy_loss)

        self._scalar_summary("loss/neg_entropy_action_id",
                             self.neg_entropy_action_id)
        self._scalar_summary("loss/total", loss)
        # self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage)) # You need the corrected one (masked)
        # self._scalar_summary("action/selected_total_log_prob", # You need the corrected one (masked)
        #     tf.reduce_mean(selected_log_probs.total))

        #tf.summary.image('convs output', tf.reshape(self.theta.map_output,[-1,25,25,64]))

        self.init_op = tf.global_variables_initializer()
        # TODO: we need 2 savers. Phase I should save only the headless network; Phase II should save the whole network (previous params plus the head params).
        self.saver_orig = tf.train.Saver(
            max_to_keep=2
        )  # saves everything (tf.global_variables(), not just tf.trainable_variables(), so Adam variables are included); max_to_keep=2 keeps only the last two checkpoints -- increase it to keep more
        # self.saver = tf.train.Saver(max_to_keep=2)
        # This saves and restores only the variables in the var_list
        # self.saver = tf.train.Saver(max_to_keep=2, var_list=tvars)  # 2 phase training
        self.saver = tf.train.Saver(
            max_to_keep=2, var_list=tvars
        )  # 2 phase training. If tvars=None then saves everything
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(
            tf.get_collection(self._scalar_summary_key))
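
The masked-mean pattern used in the MetaPolicy branch (zero out padded timesteps, sum over time, divide by each sequence's true length from the mask, then average over the batch) can be checked in isolation. A small self-contained sketch with made-up numbers:

import tensorflow as tf

# Two sequences padded to max_steps = 3; the second one has only 2 valid steps.
values = tf.constant([[1.0, 2.0, 3.0],
                      [4.0, 5.0, 0.0]])   # the padded entry is already zero
mask = tf.constant([[1.0, 1.0, 1.0],
                    [1.0, 1.0, 0.0]])

per_seq_sum = tf.reduce_sum(values * mask, axis=1)  # [6.0, 9.0]
per_seq_len = tf.reduce_sum(mask, axis=1)           # [3.0, 2.0] -- true sequence lengths
per_seq_mean = per_seq_sum / per_seq_len            # [2.0, 4.5]
batch_mean = tf.reduce_mean(per_seq_mean)           # 3.25

with tf.Session() as sess:
    print(sess.run(batch_mean))  # 3.25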