Example #1
class DETR:
    def __init__(
        self,
        input_shape,
        num_queries,
        num_classes,
        num_heads,
        dim_transformer,
        dim_feedforward,
        num_transformer_layer,
        backbone_name,
        backbone_config,
        train_backbone=False,
    ):
        """Initialize Detection Transformer (DETR) network.

        Parameters
        ----------
        input_shape : tuple
            Specification of model input [H, W, C].
        num_queries : int
            Number of queries used in transformer.
        num_classes : int
            Number of target classes.
        num_heads : int
            Number of heads in multi-head attention layers.
        dim_transformer : int
            Number of neurons in multi-head attention layers.
            Should be a multiple of `num_heads`.
        dim_feedforward : int
            Number of neurons in transformer feed forward layers.
        num_transformer_layer : int
            Number of layers in transformer network.
        backbone_name : str
            Name of backbone used for DETR network.
        backbone_config : dict
            Config of backbone used for DETR network.
        train_backbone : bool, optional
            Whether the backbone weights are updated during training.
        """

        # Save object parameters
        self.input_shape = input_shape
        self.num_queries = num_queries
        self.num_classes = num_classes
        self.num_heads = num_heads
        self.dim_transformer = dim_transformer
        self.dim_feedforward = dim_feedforward
        self.num_transformer_layer = num_transformer_layer
        self.train_backbone = train_backbone

        # Init Backbone
        self.backbone = Backbone(backbone_name, backbone_config).model
        self.backbone.trainable = train_backbone
        self.fm_shape = self.backbone.get_layer(
            "feature_map").output.shape[1::]
        self.positional_encodings_shape = (
            self.fm_shape[0] * self.fm_shape[1],
            dim_transformer,
        )
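
The constructor above only wires up the backbone and records the relevant tensor shapes; nothing else is built yet. A minimal instantiation sketch follows, where the backbone name, its (empty) config and all size values are illustrative assumptions rather than values taken from the examples:

detr = DETR(
    input_shape=(480, 480, 3),   # assumed [H, W, C] input size
    num_queries=100,
    num_classes=20,              # e.g. PascalVOC has 20 classes
    num_heads=8,
    dim_transformer=256,         # must be a multiple of num_heads
    dim_feedforward=2048,
    num_transformer_layer=6,
    backbone_name="ResNet50",    # hypothetical backbone identifier
    backbone_config={},          # hypothetical empty config
    train_backbone=False,
)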
Example #2
class DETR:
    def __init__(
        self,
        input_shape,
        num_queries,
        num_classes,
        num_heads,
        dim_transformer,
        dim_feedforward,
        num_transformer_layer,
        backbone_name,
        backbone_config,
        train_backbone=False,
    ):
        """Initialize Detection Transformer (DETR) network.

        Parameters
        ----------
        input_shape : tuple
            Specification of model input [H, W, C].
        num_queries : int
            Number of queries used in transformer.
        num_classes : int
            Number of target classes.
        num_heads : int
            Number of heads in multi-head attention layers.
        dim_transformer : int
            Number of neurons in multi-head attention layers.
            Should be a multiple of `num_heads`.
        dim_feedforward : int
            Number of neurons in transformer feed forward layers.
        num_transformer_layer : int
            Number of layers in transformer network.
        backbone_name : str
            Name of backbone used for DETR network.
        backbone_config : dict
            Config of backbone used for DETR network.
        train_backbone : bool, optional
            Whether the backbone weights are updated during training.
        """

        # Save object parameters
        self.input_shape = input_shape
        self.num_queries = num_queries
        self.num_classes = num_classes
        self.num_heads = num_heads
        self.dim_transformer = dim_transformer
        self.dim_feedforward = dim_feedforward
        self.num_transformer_layer = num_transformer_layer
        self.train_backbone = train_backbone

        # Init Backbone
        self.backbone = Backbone(backbone_name, backbone_config).model
        self.backbone.trainable = train_backbone
        self.fm_shape = self.backbone.get_layer(
            "feature_map").output.shape[1::]
        self.positional_encodings_shape = (
            self.fm_shape[0] * self.fm_shape[1],
            dim_transformer,
        )

    def build_model(self):
        """Build Detection Transformer (DETR) model.

        Returns
        -------
        tf.keras.Model
            Detection Transformer (DETR) model.
        """
        batch_input = tf.keras.layers.Input(shape=self.input_shape,
                                            name="Batch_Input")
        positional_encodings = tf.keras.layers.Input(
            shape=self.positional_encodings_shape,
            name="Positional_Encodings_Input")
        feature_map = self.backbone(batch_input)

        # Scale the backbone gradients down by a factor of 10 so the backbone
        # effectively trains with a learning rate one order of magnitude
        # smaller; the forward activations are unchanged.
        feature_map = (1 / 10) * feature_map + (
            1 - 1 / 10) * tf.stop_gradient(feature_map)

        transformer_input = tf.keras.layers.Conv2D(self.dim_transformer,
                                                   kernel_size=1)(feature_map)

        batch_size = tf.shape(transformer_input)[0]

        transformer_input = tf.reshape(
            transformer_input,
            shape=(
                batch_size,
                transformer_input.shape[1] * transformer_input.shape[2],
                transformer_input.shape[3],
            ),
        )

        # Create Queries
        # Feed the query indices 0..num_queries-1 into the Embedding layer so
        # that each object query receives its own learned embedding (one row
        # of the Embedding weight matrix). An all-ones input would look up the
        # same row for every query and collapse all queries onto one vector.
        query_pos = tf.range(self.num_queries, dtype=tf.float32)
        query_pos = tf.repeat(tf.expand_dims(query_pos, axis=0),
                              repeats=batch_size,
                              axis=0)
        query_embedding = tf.keras.layers.Embedding(
            input_dim=self.num_queries,
            output_dim=self.dim_transformer)(query_pos)

        transformer = Transformer(
            self.num_transformer_layer,
            self.dim_transformer,
            self.num_heads,
            self.dim_feedforward,
        )

        transformer_output = transformer(
            inp=transformer_input,
            positional_encodings=positional_encodings,
            query_pos=query_embedding,
        )

        cls_pred = tf.keras.layers.Dense(
            units=self.num_classes + 1,
            activation="softmax")(transformer_output)

        bbox_pred = tf.keras.layers.Dense(
            units=4, activation="sigmoid")(transformer_output)

        output_tensor = [cls_pred, bbox_pred]

        return Model([batch_input, positional_encodings],
                     output_tensor,
                     name="DETR")

    def train(self, training_config, optimizer, count_images, data_feeder):
        """Train the DETR Model.

        Parameters
        ----------
        training_config : dict
            Contains the training configuration, e.g. the number of epochs,
            batch size, output directory and an optional path to pre-trained
            weights.
        optimizer : tf.Optimizer
            Any chosen optimizer used for training.
        count_images : int
            Number of total images used for training.
        data_feeder : detr_models.detr.data_feeder
            DataFeeder object used for training. Currently, data feeders for
            input data in PascalVOC and COCO format are supported.

        Returns
        -------
        list
            Training loss per epoch.
        """

        print("-------------------------------------------", flush=True)
        print("-------------------------------------------\n", flush=True)

        if training_config["use_pretrained"]:
            print(
                "Load pre-trained model from: {}\n".format(
                    training_config["use_pretrained"]),
                flush=True,
            )
            model = tf.keras.models.load_model(
                training_config["use_pretrained"])
        else:
            print("Build model from scratch\n", flush=True)
            model = self.build_model()

        print("-------------------------------------------\n", flush=True)
        print(
            "Start Training - Total of {} Epochs:\n".format(
                training_config["epochs"]),
            flush=True,
        )

        detr_loss = []

        positional_encodings = create_positional_encodings(
            fm_shape=self.fm_shape,
            num_pos_feats=self.dim_transformer // 2,
            batch_size=training_config["batch_size"],
        )

        for epoch in range(training_config["epochs"]):
            start = time.time()
            print("-------------------------------------------", flush=True)
            print(f"Epoch: {epoch+1}\n", flush=True)
            epoch_loss = np.array([0.0, 0.0, 0.0])
            batch_iteration = 0

            # Iterate over all batches
            for input_data in data_feeder(training_config["verbose"]):

                batch_loss = _train(
                    detr=model,
                    optimizer=optimizer,
                    batch_inputs=input_data[0],
                    batch_cls=input_data[1],
                    batch_bbox=input_data[2],
                    obj_indices=input_data[3],
                    positional_encodings=positional_encodings,
                )

                batch_loss = np.array([loss.numpy() for loss in batch_loss])

                epoch_loss = epoch_loss + batch_loss

                batch_iteration += 1

            detr_loss.append(epoch_loss)

            print("DETR Loss: %f" % epoch_loss[0], flush=True)
            print(f"Time for epoch {epoch + 1} is {time.time()-start} sec",
                  flush=True)
            print("-------------------------------------------\n", flush=True)

        print("Finalize Training\n", flush=True)

        # Save training loss and model
        model.save("{}/detr_model".format(training_config["output_dir"]))
        save_training_loss(
            detr_loss,
            "{}/detr_loss.txt".format(training_config["output_dir"]))

        return detr_loss
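
As a quick smoke test of `build_model`, the sketch below pushes a dummy batch through the network. It assumes the `detr` instance from the sketch under Example #1 and reuses `create_positional_encodings` with the same arguments the `train` method passes:

import numpy as np

model = detr.build_model()
encodings = create_positional_encodings(
    fm_shape=detr.fm_shape,
    num_pos_feats=detr.dim_transformer // 2,
    batch_size=2,
)
images = np.zeros((2, *detr.input_shape), dtype=np.float32)
cls_pred, bbox_pred = model([images, encodings])
# Expected shapes: cls_pred (2, num_queries, num_classes + 1),
# bbox_pred (2, num_queries, 4), sigmoid-normalized to [0, 1].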
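The `train` method reads everything from a plain config dict whose keys match the lookups in the code above ("use_pretrained", "epochs", "batch_size", "verbose", "output_dir"). A hedged invocation sketch, assuming an Adam optimizer and an already constructed PascalVOC or COCO `data_feeder`:

training_config = {
    "use_pretrained": None,   # falsy, so the model is built from scratch
    "epochs": 50,
    "batch_size": 8,
    "verbose": False,
    "output_dir": "outputs",  # hypothetical output directory
}
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
detr_loss = detr.train(training_config, optimizer,
                       count_images=1000,        # assumed dataset size
                       data_feeder=data_feeder)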
Example #3
class DETR:
    def __init__(
        self,
        storage_path,
        input_shape,
        num_queries,
        num_classes,
        num_heads,
        dim_transformer,
        dim_feedforward,
        num_transformer_layer,
        backbone_name,
        backbone_config,
        train_backbone=False,
    ):
        """Initialize Detection Transformer (DETR) network.

        Parameters
        ----------
        storage_path : str
            Path to images.
        input_shape : tuple
            Specification of model input [H, W, C].
        num_queries : int
            Number of queries used in transformer.
        num_classes : int
            Number of target classes.
        num_heads : int
            Number of heads in multi-head attention layers.
        dim_transformer : int
            Number of neurons in multi-head attention layers.
            Should be a multiple of `num_heads`.
        dim_feedforward : int
            Number of neurons in transformer feed forward layers.
        num_transformer_layer : int
            Number of layers in transformer network.
        backbone_name : str
            Name of backbone used for DETR network.
        backbone_config : dict
            Config of backbone used for DETR network.
        train_backbone : bool, optional
            Whether the backbone weights are updated during training.
        """

        # Save object parameters
        self.storage_path = storage_path
        self.input_shape = input_shape
        self.num_queries = num_queries
        self.num_classes = num_classes
        self.num_heads = num_heads
        self.dim_transformer = dim_transformer
        self.dim_feedforward = dim_feedforward
        self.num_transformer_layer = num_transformer_layer
        self.train_backbone = train_backbone

        # Init Backbone
        self.backbone = Backbone(backbone_name, backbone_config).model
        self.backbone.trainable = train_backbone
        self.fm_shape = self.backbone.get_layer(
            "feature_map").output.shape[1::]
        self.positional_encodings_shape = (
            self.fm_shape[0] * self.fm_shape[1],
            dim_transformer,
        )

        # Init Feeder and Iterator
        self.uuiditerator = UUIDIterator(storage_path)
        self.feeder = DataFeeder(storage_path, num_queries, num_classes,
                                 self.fm_shape, dim_transformer)

    def build_model(self):
        """Build Detection Transformer (DETR) model.

        Returns
        -------
        tf.keras.Model
            Detection Transformer (DETR) model.
        """
        batch_input = tf.keras.layers.Input(shape=self.input_shape,
                                            name="Batch_Input")
        positional_encodings = tf.keras.layers.Input(
            shape=self.positional_encodings_shape,
            name="Positional_Encodings_Input")
        feature_map = self.backbone(batch_input)

        transformer_input = tf.keras.layers.Conv2D(self.dim_transformer,
                                                   kernel_size=1)(feature_map)

        batch_size = tf.shape(transformer_input)[0]

        transformer_input = tf.reshape(
            transformer_input,
            shape=(
                batch_size,
                transformer_input.shape[1] * transformer_input.shape[2],
                transformer_input.shape[3],
            ),
        )

        # Create Queries
        # Feed the query indices 0..num_queries-1 into the Embedding layer so
        # that each object query receives its own learned embedding (one row
        # of the Embedding weight matrix). An all-ones input would look up the
        # same row for every query and collapse all queries onto one vector.
        query_pos = tf.range(self.num_queries, dtype=tf.float32)
        query_pos = tf.repeat(tf.expand_dims(query_pos, axis=0),
                              repeats=batch_size,
                              axis=0)
        query_embedding = tf.keras.layers.Embedding(
            input_dim=self.num_queries,
            output_dim=self.dim_transformer)(query_pos)

        transformer = Transformer(
            self.num_transformer_layer,
            self.dim_transformer,
            self.num_heads,
            self.dim_feedforward,
        )

        transformer_output = transformer(
            inp=transformer_input,
            positional_encodings=positional_encodings,
            query_pos=query_embedding,
        )

        cls_pred = tf.keras.layers.Dense(
            units=self.num_classes + 1,
            activation="softmax")(transformer_output)
        bbox_pred = tf.keras.layers.Dense(
            units=4, activation="sigmoid")(transformer_output)

        output_tensor = [cls_pred, bbox_pred]

        return Model([batch_input, positional_encodings],
                     output_tensor,
                     name="DETR")

    def train(
        self,
        epochs,
        optimizer,
        batch_size,
        count_images,
        output_dir,
        use_pretrained=None,
    ):
        """Train the DETR Model.

        Parameters
        ----------
        epochs : int
            Number of training epochs.
        optimizer : tf.Optimizer
            Any chosen optimizer used for training.
        batch_size : int
            Number of samples per batch.
        count_images : int
            Number of total images used for training.
        output_dir : str
            Path used to save the final model weights and training loss.
        use_pretrained : str, optional
            Path to saved pre-trained model weights. Only used if specified,
            and only valid if the weights match the chosen model config.

        Returns
        -------
        list
            Training loss per epoch.
        """

        print("-------------------------------------------", flush=True)
        print("-------------------------------------------\n", flush=True)
        print("Build Model")

        model = self.build_model()

        if use_pretrained:
            print("Used pre-trained model weights\n", flush=True)
            model.load_weights(use_pretrained)

        print("-------------------------------------------\n", flush=True)
        print(f"Start Training - Total of {epochs} Epochs:\n", flush=True)

        detr_loss = []

        for epoch in range(epochs):
            start = time.time()
            print("-------------------------------------------", flush=True)
            print(f"Beginning of Epoch: {epoch+1}\n", flush=True)

            epoch_loss = np.array([0.0, 0.0, 0.0])
            batch_iteration = 0

            # Iterate over all batches
            for batch_uuids in self.uuiditerator(batch_size):
                print(
                    "Batch: {}/{} ".format(batch_iteration + 1,
                                           count_images // batch_size),
                    flush=True,
                )

                (
                    batch_inputs,
                    batch_cls,
                    batch_bbox,
                    obj_indices,
                    positional_encodings,
                ) = self.feeder(batch_uuids)

                batch_loss = _train(
                    detr=model,
                    optimizer=optimizer,
                    batch_inputs=batch_inputs,
                    batch_cls=batch_cls,
                    batch_bbox=batch_bbox,
                    obj_indices=obj_indices,
                    positional_encodings=positional_encodings,
                )

                batch_loss = [loss.numpy() for loss in batch_loss]
                epoch_loss += (1 / len(batch_uuids)) * np.array(batch_loss)
                batch_iteration += 1

            detr_loss.append(epoch_loss)

            print("DETR Loss: %f" % epoch_loss[0], flush=True)
            print(f"Time for epoch {epoch + 1} is {time.time()-start} sec",
                  flush=True)
            print("-------------------------------------------\n", flush=True)

        print("Finalize Training\n", flush=True)
        print("-------------------------------------------\n", flush=True)
        print("Done", flush=True)

        # Save training loss and model
        model.save_weights("{}/detr_weights".format(output_dir))
        save_training_loss(detr_loss, "{}/detr_loss.txt".format(output_dir))

        return detr_loss
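
Unlike Example #2, this variant takes its training settings as individual arguments and feeds data internally through the `UUIDIterator` and `DataFeeder` created in `__init__`. A minimal call sketch with illustrative values:

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
detr_loss = detr.train(
    epochs=50,
    optimizer=optimizer,
    batch_size=8,
    count_images=1000,        # assumed number of training images
    output_dir="outputs",     # hypothetical output directory
    use_pretrained=None,      # or a path to saved weights
)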