Example #1
def get_sequence_from_user(max_sequence_length: int) -> Tuple[Tensor, Tensor]:
    """
    Ask the user to enter a sequence of token ids and convert it to source
    token tensor and source mask tensor for feeding the model.
    """
    enter_message = (
        "\nEnter the desired source sequence token ids separated by spaces: ")

    # asking for user input and splitting it into a sequence of token ids:
    src_seq = list(map(int, input(enter_message).split()))
    n_tokens = len(src_seq)

    if n_tokens > max_sequence_length:
        # truncating the sequence if its length is higher than allowed:
        n_tokens = max_sequence_length
        src_seq = src_seq[:max_sequence_length]

    # padding the sequence if its length is lower than the maximum one and
    # converting it to the right format:
    src_seq = torch_cat(
        (
            tensor(src_seq, dtype=torch_long),  # noqa: E501 pylint: disable=not-callable
            torch_zeros((max_sequence_length - n_tokens), dtype=torch_long)),
        dim=-1)
    src_seq = torch_unsqueeze(input=src_seq, dim=0)

    # creating the sequence mask based on the padding done:
    src_seq_mask = torch_cat(
        (torch_ones((1, 1, n_tokens), dtype=torch_long),
         torch_zeros(
             (1, 1, max_sequence_length - n_tokens), dtype=torch_long)),
        dim=-1)

    return src_seq, src_seq_mask
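The underscore-prefixed names used throughout these examples (torch_cat, torch_zeros, torch_ones, torch_long, torch_unsqueeze, ...) are never defined in the snippets themselves; a plausible reading, sketched below as an assumption rather than as the original imports, is that they are aliased imports from torch.

# Assumed import aliases (not shown in the original snippet):
from typing import Tuple

from torch import Tensor, tensor
from torch import cat as torch_cat
from torch import long as torch_long
from torch import ones as torch_ones
from torch import zeros as torch_zeros
from torch import unsqueeze as torch_unsqueeze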
Example #2
def xy_to_cxcy(xy):
    """Calculation of center-size coordinates calculation from boundary coordinates as used in SSD rewrite in PyTorch.

    This is implemented as shown in https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection. Some 
    modifications are made. All credits to @sgrvinod.
    """

    return torch_cat([(xy[:, 2:] + xy[:, :2]) / 2, xy[:, 2:] - xy[:, :2]], 1)
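A quick usage sketch for the conversion above (illustrative values only, assuming torch_cat aliases torch.cat): the boundary box (x_min, y_min, x_max, y_max) = (10, 20, 30, 60) becomes the center-size box (c_x, c_y, w, h) = (20, 40, 20, 40).

# Illustrative check, not part of the original snippet.
from torch import tensor

boxes_xy = tensor([[10., 20., 30., 60.]])  # (x_min, y_min, x_max, y_max)
print(xy_to_cxcy(boxes_xy))                # tensor([[20., 40., 20., 40.]])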
Example #3
    def forward(self, conv4_3_features, conv7_features, conv8_2_features,
                conv9_2_features, conv10_2_features, conv11_2_features):
        batch_size = conv4_3_features.size(0)

        locations = self._predict_locations(batch_size, conv4_3_features,
                                            conv7_features, conv8_2_features,
                                            conv9_2_features,
                                            conv10_2_features,
                                            conv11_2_features)
        locations = torch_cat(list(locations), dim=1)

        classes_scores = self._predict_classes(
            batch_size, conv4_3_features, conv7_features, conv8_2_features,
            conv9_2_features, conv10_2_features, conv11_2_features)
        classes_scores = torch_cat(list(classes_scores), dim=1)

        return locations, classes_scores
Example #4
    def core(self, it, fc_feats_ph, att_feats_ph, memory, state, mask):
        if len(state) == 0:
            ys = it.unsqueeze(1)
        else:
            ys = torch_cat([state[0][0], it.unsqueeze(1)], dim=1)
        out = self.model.decode(memory, mask, ys,
                                subsequent_mask(ys.size(1)).to(memory.device))
        return out[:, -1], [ys.unsqueeze(0)]
Example #5
def cxcy_to_xy(cxcy):
    """Calculation of boundary coordinates calculation from center-size coordinates as used in SSD rewrite in PyTorch.

    This is implemented as shown in https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection. Some
    modifications are made. All credits to @sgrvinod.
    """

    return torch_cat(
        [cxcy[:, :2] - (cxcy[:, 2:] / 2), cxcy[:, :2] + (cxcy[:, 2:] / 2)], 1)
Example #6
def gcxgcy_to_cxcy(gcxgcy, priors_cxcy):
    """Decodes bounding boxes from the corresponding prior boxes, both in center-size coordinates form, as used in SSD
    rewrite in PyTorch.

    This is implemented as shown in https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection. Some 
    modifications are made. All credits to @sgrvinod.
    """

    return torch_cat([
        gcxgcy[:, :2] * priors_cxcy[:, 2:] / 10 + priors_cxcy[:, :2],
        torch_exp(gcxgcy[:, 2:] / 5) * priors_cxcy[:, 2:]
    ], 1)
Example #7
def cxcy_to_gcxgcy(cxcy, priors_cxcy):
    """Encodes bounding boxes to the corresponding prior boxes, both in center-size coordinates form, as used in SSD
    rewrite in PyTorch.

    This is implemented as shown in https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection. Some 
    modifications are made. All credits to @sgrvinod.
    """

    # https://github.com/weiliu89/caffe/issues/155
    return torch_cat([(cxcy[:, :2] - priors_cxcy[:, :2]) /
                      (priors_cxcy[:, 2:] / 10),
                      torch_log(cxcy[:, 2:] / priors_cxcy[:, 2:]) * 5], 1)
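gcxgcy_to_cxcy and cxcy_to_gcxgcy above are inverses of each other; a small round-trip sketch (illustrative values, assuming both functions and their torch_exp/torch_log aliases are in scope) shows that encoding a box against a prior and then decoding the offsets recovers the original center-size coordinates.

# Illustrative round-trip, not part of the original snippets.
from torch import allclose, tensor

priors = tensor([[0.5, 0.5, 0.2, 0.2]])  # one prior box, (c_x, c_y, w, h)
boxes = tensor([[0.4, 0.6, 0.1, 0.3]])   # one ground-truth box, (c_x, c_y, w, h)

encoded = cxcy_to_gcxgcy(boxes, priors)  # offsets (g_cx, g_cy, g_w, g_h)
decoded = gcxgcy_to_cxcy(encoded, priors)
assert allclose(decoded, boxes)          # decoding undoes the encoding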
Example #8
    def get_pooled_features(self, x: str) -> Tensor:
        """
        Get concatenation of [mean, max, last] of last hidden state.

        Parameters
        ----------
        x: str
            This is the pre-processed string associated with the issue.
            If you have two separate fields "title" and "body", you will want
            to pre-process these fields with the process_dict method before
            calling this.

        Returns
        -------
        Tensor
            This is an embedding in the form of a Tensor with the shape (1, 2400)
        """
        raw = self.get_raw_features(x)
        # return [mean, max, last] with size of (1, self.learn.emb_sz * 3)
        return torch_cat([raw.mean(dim=1), raw.max(dim=1)[0], raw[:,-1,:]], dim=-1)
Example #9
    def forward(self, input_step, prev_hidden_state, encoder_outputs):
        '''
        NOTE: This forward function happens one timestep at a time! Therefore:
        input_step = (1, batch_size) --> single input word fed to the GRU
        prev_hidden_state = (num_layers*num_directions, batch_size, hidden_size) --> final hidden state of encoder
        encoder_outputs = (max_length, batch_size, hidden_size) --> final output state of encoder
        '''
        if self.use_embedding is True:
            # this will return shape of (1, batch_size, hidden_size) --> since embedding_size = hidden_size
            output = self.embedding(input_step)
        else:
            output = input_step
        # output = (1, batch_size, hidden_size); prev_hidden_state = (num_layers*num_directions, batch_size, hidden_size)
        output, prev_hidden_state = self.gru(output, prev_hidden_state)
        # attention_weights = (batch_size, 1, max_length)
        attention_weights = self.attention(output, encoder_outputs)

        '''
        BMM = batch matrix multiplication
        Here we're multiplying the attention weights with the encoder outputs. Before we can do that however,
        we need to transpose the encoder outputs to a shape of (batch_size, max_length, hidden_size).
        This would mean we're doing (batch_size, 1, max_length) * (batch_size, max_length, hidden_size), which
        can work because number of columns in first matrix is equal to number of rows in second matrix: max_length
        This would return a size of (batch_size, 1, hidden_size)
        '''
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1))
        # we do this because we want shape of (batch_size, hidden_size) for concatenation, so we squeeze along dimension 0
        output = output.squeeze(0)
        # we do this because we want shape of (batch_size, hidden_size) for concatenation, so we squeeze along dimension 1
        context = context.squeeze(1)
        # concatenate output of GRU and context vector along dimension 1. This returns (batch_size, hidden_size*2)
        concatenated_output = torch_cat((output, context), 1)
        # reduce (batch_size, hidden_size*2) to (batch_size, hidden_size)
        concatenated_output = torch_tanh(self.concatenate(concatenated_output))
        # transform from (batch_size, hidden_size) to (batch_size, vocab_size)
        output = self.output(concatenated_output)
        # take softmax across dimension 1 - the columns
        output = F.log_softmax(output, dim=1)
        return output, prev_hidden_state
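A standalone shape check of the batched attention step described in the comments above (illustrative sizes only):

# Shape sketch of the attention-weighted context computation.
import torch

batch_size, max_length, hidden_size = 4, 10, 256
attention_weights = torch.rand(batch_size, 1, max_length)
encoder_outputs = torch.rand(max_length, batch_size, hidden_size)

context = attention_weights.bmm(encoder_outputs.transpose(0, 1))
print(context.shape)  # torch.Size([4, 1, 256]) -> (batch_size, 1, hidden_size)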
Example #10
    def forward(self, v_in):
        """Forward pass.

        :param v_in: The input to the RNN encoder of the Masker.
        :type v_in: torch.Tensor
        :return: The output of the RNN encoder of the Masker.
        :rtype: torch.autograd.variable.Variable
        """
        batch_size = v_in.size()[0]
        seq_length = v_in.size()[1]

        h_t_f = Variable(torch_zeros(batch_size, self._input_dim))
        h_t_b = Variable(torch_zeros(batch_size, self._input_dim))
        h_enc = Variable(
            torch_zeros(batch_size, seq_length - (2 * self._context_length),
                        2 * self._input_dim))
        v_tr = v_in[:, :, :self._input_dim]

        if not self._debug and torch_has_cudnn:
            h_t_f = h_t_f.cuda()
            h_t_b = h_t_b.cuda()
            h_enc = h_enc.cuda()

        for t in range(seq_length):
            h_t_f = self.gru_enc_f((v_tr[:, t, :]), h_t_f)
            h_t_b = self.gru_enc_b((v_tr[:, seq_length - t - 1, :]), h_t_b)

            if self._context_length <= t < seq_length - self._context_length:
                h_t = torch_cat([
                    h_t_f + v_tr[:, t, :],
                    h_t_b + v_tr[:, seq_length - t - 1, :]
                ],
                                dim=1)
                h_enc[:, t - self._context_length, :] = h_t

        return h_enc
Example #11
    def detect_objects(self, image_as_tensor, min_score, max_overlap, top_k):
        predicted_locs, predicted_scores = self.forward(image_as_tensor)
        batch_size = predicted_locs.size(0)
        n_priors = self.priors_cxcy.size(0)
        predicted_scores = F.softmax(predicted_scores, dim=2)

        all_images_boxes = list()
        all_images_labels = list()
        all_images_scores = list()

        assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

        for i in range(batch_size):
            decoded_locs = cxcy_to_xy(
                gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy))

            image_boxes = list()
            image_labels = list()
            image_scores = list()

            for c in range(self.num_classes - 1):
                class_scores = predicted_scores[i][:, c]
                score_above_min_score = class_scores > min_score
                n_above_min_score = score_above_min_score.sum().item()
                if n_above_min_score == 0:
                    continue
                class_scores = class_scores[score_above_min_score]
                class_decoded_locs = decoded_locs[score_above_min_score]

                class_scores, sort_ind = class_scores.sort(dim=0,
                                                           descending=True)
                class_decoded_locs = class_decoded_locs[sort_ind]

                overlap = find_jaccard_overlap(class_decoded_locs,
                                               class_decoded_locs)

                suppress = self._to_cuda(
                    torch_zeros((n_above_min_score), dtype=torch_uint8))
                for box in range(class_decoded_locs.size(0)):
                    if suppress[box] == 1:
                        continue

                    suppress = torch_max(
                        suppress,
                        (overlap[box] > max_overlap).type(torch_uint8))
                    suppress[box] = 0

                kept_indices = self._to_cuda(
                    suppress.type(BoolTensor).logical_not())
                locs = class_decoded_locs[kept_indices].tolist()
                for loc_index, loc in enumerate(locs):
                    locs[loc_index] = [
                        max(loc[0], 0.),
                        max(loc[1], 0.),
                        min(loc[2], 1.),
                        min(loc[3], 1.)
                    ]
                image_boxes.append(self._to_cuda(FloatTensor(locs)))
                image_labels.append(
                    self._to_cuda(LongTensor(kept_indices.sum().item() * [c])))
                image_scores.append(self._to_cuda(class_scores[kept_indices]))

            if len(image_boxes) == 0:
                image_boxes.append(
                    self._to_cuda(FloatTensor([[0., 0., 0., 0.]])))
                image_labels.append(self._to_cuda(LongTensor([120])))
                image_scores.append(self._to_cuda(FloatTensor([0.])))

            image_boxes = self._to_cuda(torch_cat(image_boxes, dim=0))
            image_labels = self._to_cuda(torch_cat(image_labels, dim=0))
            image_scores = self._to_cuda(torch_cat(image_scores, dim=0))
            n_objects = image_scores.size(0)

            if n_objects > top_k:
                image_scores, sort_ind = image_scores.sort(dim=0,
                                                           descending=True)
                image_scores = image_scores[:top_k]
                image_boxes = image_boxes[sort_ind][:top_k]
                image_labels = image_labels[sort_ind][:top_k]

            all_images_boxes.append(image_boxes)
            all_images_labels.append(image_labels)
            all_images_scores.append(image_scores)

        return all_images_boxes, all_images_labels, all_images_scores
Example #12
    def _ssd_discrete_metrics(self, predictions, targets, is_cuda=False, *unused_args, **unused_kwargs):
        def __to_cuda(obj):
            if is_cuda:
                obj = obj.cuda()
            return obj

        predicted_boxes = predictions['boxes']
        predicted_labels = predictions['labels']
        predicted_class_scores = predictions['scores']

        target_boxes = targets['boxes']
        target_labels = targets['labels']

        assert len(predicted_boxes) == len(predicted_labels) == len(predicted_class_scores) == len(
            target_boxes) == len(target_labels)

        target_images = list()
        for i in range(len(target_labels)):
            target_images.extend([i] * target_labels[i].size(0))
        target_images = __to_cuda(LongTensor(target_images))
        target_boxes = torch_cat(target_boxes, dim=0)
        target_labels = torch_cat(target_labels, dim=0)

        assert target_images.size(0) == target_boxes.size(0) == target_labels.size(0)

        predicted_images = list()
        for i in range(len(predicted_labels)):
            predicted_images.extend([i] * predicted_labels[i].size(0))
        predicted_images = __to_cuda(LongTensor(predicted_images))
        predicted_boxes = torch_cat(predicted_boxes, dim=0)
        predicted_labels = torch_cat(predicted_labels, dim=0)
        predicted_class_scores = torch_cat(predicted_class_scores, dim=0)

        assert predicted_images.size(0) == predicted_boxes.size(0) == predicted_labels.size(
            0) == predicted_class_scores.size(0)

        average_precisions = torch_zeros(self.num_classes, dtype=torch_float)
        recalls = torch_zeros(self.num_classes, dtype=torch_float)
        precisions = torch_zeros(self.num_classes, dtype=torch_float)
        for c in range(self.num_classes):
            target_class_images = target_images[target_labels == c]
            target_class_boxes = target_boxes[target_labels == c]

            total_objects = target_class_boxes.size(0)

            target_class_boxes_detected = __to_cuda(torch_zeros(total_objects, dtype=torch_uint8))

            class_c_predicted_images = predicted_images[predicted_labels == c]
            class_c_predicted_boxes = predicted_boxes[predicted_labels == c]
            class_c_predicted_class_scores = predicted_class_scores[predicted_labels == c]
            class_c_num_detections = class_c_predicted_boxes.size(0)
            if class_c_num_detections == 0:
                continue

            class_c_predicted_class_scores, sort_ind = torch_sort(class_c_predicted_class_scores, dim=0,
                                                                  descending=True)
            class_c_predicted_images = class_c_predicted_images[sort_ind]
            class_c_predicted_boxes = class_c_predicted_boxes[sort_ind]

            true_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
            false_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
            for d in range(class_c_num_detections):
                this_detection_box = shapely_box(*class_c_predicted_boxes[d].data)
                this_image = class_c_predicted_images[d]

                object_boxes = target_class_boxes[target_class_images == this_image]
                if object_boxes.size(0) == 0:
                    false_positives[d] = 1
                    continue

                ground_truth_contains_prediction_center = [
                    shapely_box(*box.data).contains(this_detection_box.centroid) for box in object_boxes]
                for ind, prediction_center_in_ground_truth in enumerate(ground_truth_contains_prediction_center):
                    original_ind = LongTensor(range(target_class_boxes.size(0)))[target_class_images == this_image][ind]

                    if prediction_center_in_ground_truth:
                        if target_class_boxes_detected[original_ind] == 0:
                            true_positives[d] = 1
                            target_class_boxes_detected[original_ind] = 1
                        else:
                            false_positives[d] = 1
                    else:
                        false_positives[d] = 1

            cumul_true_positives = torch_cumsum(true_positives, dim=0)
            cumul_false_positives = torch_cumsum(false_positives, dim=0)
            cumul_precision = cumul_true_positives / (cumul_true_positives + cumul_false_positives + 1e-10)
            cumul_recall = cumul_true_positives / total_objects

            recall_thresholds = [x / 10 for x in range(11)]
            interpolated_precisions = __to_cuda(torch_zeros((len(recall_thresholds)), dtype=torch_float))
            for i, threshold in enumerate(recall_thresholds):
                recalls_above_threshold = cumul_recall >= threshold
                if recalls_above_threshold.any():
                    interpolated_precisions[i] = cumul_precision[recalls_above_threshold].max()
                else:
                    interpolated_precisions[i] = 0.
            average_precisions[c] = interpolated_precisions.mean()

            total_true_positives = torch_sum(true_positives)
            recalls[c] = total_true_positives / max(float(total_objects), 1e-10)
            precisions[c] = total_true_positives / max(
                total_true_positives + torch_sum(false_positives), torch_tensor(1e-10))
        return average_precisions.tolist(), recalls.tolist(), precisions.tolist()
Example #13
    def collate(self, tensors: Iterable[Tensor]) -> Tensor:
        return torch_cat(tensors=list(tensors), dim=0)
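A usage sketch for the collate helper above, assuming each per-sample tensor carries a leading batch dimension of size 1 so that concatenation along dim 0 stacks them into a batch:

# Illustrative only; the method body reduces to a single torch.cat call.
import torch

samples = [torch.rand(1, 3, 32, 32) for _ in range(8)]
batch = torch.cat(samples, dim=0)  # what collate(samples) would return
print(batch.shape)                 # torch.Size([8, 3, 32, 32])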
Example #14
    def __getitem__(self, item):
        row_data = self.data.iloc[item]

        temporal_tensor = []
        self.individual_tr.start()

        for i, p in enumerate(row_data.img_paths):
            # get the events tensor with the appropriate number of channels depending on polarity
            img = self.read_image(p)
            img = self.individual_tr(img)
            events_tensor = self.to_tensor_transform(img)

            if self.several_features:
                other_polarity_path = row_data.other_feature[i]
                if op.exists(other_polarity_path):
                    channel_2_img = self.read_image(other_polarity_path)
                    # channel_2_tensor = self.transforms(channel_2_img)
                    channel_2_img = self.individual_tr(channel_2_img)
                    channel_2_tensor = self.to_tensor_transform(channel_2_img)

                    # image channel is on dimension 0
                    events_tensor = torch_cat(
                        (events_tensor, channel_2_tensor), dim=0)
                else:
                    print(
                        "intended several features but path {0} does not exist"
                        .format(other_polarity_path))
            temporal_tensor.append(events_tensor)
        temporal_tensor = torch_cat(temporal_tensor, dim=0)

        with open(row_data.info_dict) as fd:
            info_dict = json.load(fd)
        # with open(row_data.temp2text) as fd:
        #     temp2txt = json.load(fd)
        #
        # # get the texture image associated to the first event image
        # img_raw_p = row_data.img_raw
        # p = row_data.img_paths[0]
        # r_name = temp2txt[op.basename(p)]
        # if not r_name.endswith(".png") or not r_name.endswith(".jpg"):
        #     r_name += ".png"
        # img_raw_p = op.join(img_raw_p, r_name)
        img_raw_p = self.get_img_raw_path(row_data)

        img_raw = self.read_image(img_raw_p, mode="RGB")
        # img_raw_tensor = self.transforms(img_raw)
        img_raw = self.individual_tr(img_raw)
        img_raw_tensor = self.to_tensor_transform(img_raw)

        # get the future image. Take into account that the class image of the last temporal
        # image has to be the first element of the answer, even if technically,
        # it is not part of the future
        p = row_data.class_imgs[-1]
        # get the class image
        class_img_tensor = self.get_class_img(p, info_dict['factor'])
        future_tensor = [class_img_tensor]

        if self.future_process:
            for p in row_data.future_data:
                # get the class image
                class_img_tensor = self.get_class_img(p, info_dict['factor'])
                future_tensor.append(class_img_tensor)
        future_tensor = torch_cat(future_tensor, dim=0)

        if self.transforms:
            # we have to concat the data so the transformation is performed equally at all levels
            input_ = torch_cat(
                [img_raw_tensor, temporal_tensor,
                 future_tensor.float()],
                dim=0)
            input_ = self.transforms(input_)

            img_raw_size = img_raw_tensor.shape[0]
            img_raw_tensor = input_[:img_raw_size, :, :]
            temporal_tensor = input_[img_raw_size:img_raw_size +
                                     temporal_tensor.shape[0], :, :]
            future_tensor = input_[-future_tensor.shape[0]:, :, :].int()

        self.individual_tr.end()

        return (img_raw_tensor, temporal_tensor), future_tensor
Example #15
    def __getitem__(self, item):
        if self.future_process:
            window, future, _, _ = self.data.iloc[item]
        else:
            window, _, _ = self.data.iloc[item]
            future = []
        temporal_tensor = []
        self.individual_tr.start()

        for p in window:
            # get the events tensor with the appropriate number of channels depending on polarity
            img = self.read_image(p)
            img = self.individual_tr(img)
            events_tensor = self.to_tensor_transform(img)

            other_polarity_path = p.replace(self.init_path,
                                            self.add_channel) if len(
                                                self.add_channel) else None
            if other_polarity_path:
                channel_2_img = self.read_image(other_polarity_path)
                # channel_2_tensor = self.transforms(channel_2_img)
                channel_2_img = self.individual_tr(channel_2_img)
                channel_2_tensor = self.to_tensor_transform(channel_2_img)

                # image channel is on dimension 0
                events_tensor = torch_cat((events_tensor, channel_2_tensor),
                                          dim=0)
            temporal_tensor.append(events_tensor)
        temporal_tensor = torch_cat(temporal_tensor, dim=0)

        # get the texture image associated to the first event image
        p = window[0]
        r_name = self.temp2txt[op.basename(p)]
        if not r_name.endswith((".png", ".jpg")):
            r_name += ".png"
        img_raw_p = op.join(self.img_raw_path, r_name)
        img_raw = self.read_image(img_raw_p, mode="RGB")
        # img_raw_tensor = self.transforms(img_raw)
        img_raw = self.individual_tr(img_raw)
        img_raw_tensor = self.to_tensor_transform(img_raw)

        # get the future image. Take into account that the class image of the last temporal
        # image has to be the first element of the answer, even if technically,
        # it is not part of the future
        p = window[-1]
        # get the class image
        class_img_tensor = self.get_class_img(p)
        future_tensor = [class_img_tensor]

        for p in future:
            # get the class image
            class_img_tensor = self.get_class_img(p)
            future_tensor.append(class_img_tensor)
        future_tensor = torch_cat(future_tensor, dim=0)

        if self.transforms:
            # we have to concat the data so the transformation is performed equally at all levels
            input_ = torch_cat(
                [img_raw_tensor, temporal_tensor,
                 future_tensor.float()],
                dim=0)
            input_ = self.transforms(input_)

            img_raw_size = img_raw_tensor.shape[0]
            img_raw_tensor = input_[:img_raw_size, :, :]
            temporal_tensor = input_[img_raw_size:img_raw_size +
                                     temporal_tensor.shape[0], :, :]
            future_tensor = input_[-future_tensor.shape[0]:, :, :].int()

        self.individual_tr.end()

        return (img_raw_tensor, temporal_tensor), future_tensor
Example #16
    def predict(  # pylint: disable=too-many-arguments
            self,
            src_sequences: Tensor,
            src_masks: Tensor,
            tgt_bos_token: int,
            decoding_method: str = 'greedy',
            gpu_if_possible: bool = True) -> Tensor:
        """
        Predict target token sequences from source token sequences.
        """
        # selecting the device handling computations:
        device = select_device(gpu_if_possible=gpu_if_possible)

        # moving model parameters and buffers to such device:
        self.model.to(device)

        # moving inputs to such device:
        src_sequences = src_sequences.to(device)
        src_masks = src_masks.to(device)

        # switching to inference mode:
        self.model.eval()

        if decoding_method == 'greedy':

            # greedy decoding:

            # computing encoder outputs, i.e. encoded representations of
            # source tokens - from dimensionality (samples, tokens) to
            # dimensionality (samples, tokens, features):
            src_encoded_tokens = self.model.encode(src_tokens=src_sequences,
                                                   src_mask=src_masks)

            # initializing predicted output sequences:
            cumulative_tgt_sequences = torch_ones((1, 1), requires_grad=False)\
                .fill_(value=tgt_bos_token).type_as(src_sequences)

            # for each target position, the respective token is sequentially
            # predicted, given the decoder auto-regressive predictive nature -
            # for all sequences at the same time:
            for _ in range(self.max_sequence_length - 1):

                # computing logits - from dimensionality (samples, tokens,
                # features) to dimensionality (samples, tokens, features):
                next_token_logits = self.model.decode(
                    src_encoded_tokens=src_encoded_tokens,
                    src_mask=src_masks,
                    tgt_tokens=cumulative_tgt_sequences,
                    tgt_mask=allowed_positions_to_attend(
                        # positions to attend equal computed target tokens:
                        n_positions=cumulative_tgt_sequences.size(1)).to(
                            device))

                # turning the logits of next (last) tokens in the sequences
                # into log-probabilities - from dimensionality (samples,
                # tokens, features) to dimensionality (samples, features):
                next_token_log_probabilities = self.model.log_softmax_layer(
                    next_token_logits[:, -1]  # next (last) tokens
                )

                # discretizing probabilities to predicted tokens - from
                # dimensionality (samples, features) to dimensionality
                # (samples):
                next_tokens = torch_max(next_token_log_probabilities,
                                        dim=1).indices[0]

                # concatenating the newly predicted tokens to the sequences of
                # already predicted tokens:
                cumulative_tgt_sequences = torch_cat(
                    (cumulative_tgt_sequences, torch_ones(
                        (1, 1)).type_as(src_sequences).fill_(next_tokens)),
                    dim=1)
                # FIXME: shapes not understood

                # TODO: truncate the different predicted sequences in the
                # mini-batch from their respective first padding token on

            return cumulative_tgt_sequences

        raise NotImplementedError("Unavailable decoding method: " +
                                  decoding_method)
Example #17
    def forward(self, input, seq, data_gts):
        """
        Input is either logits or log softmax
        """
        out = {}

        batch_size = input.size(0)  # batch_size = sample_size * seq_per_img
        seq_per_img = batch_size // len(data_gts)

        assert seq_per_img == self.opt.train_sample_n, seq_per_img

        mask = (seq > 0).float()
        mask = torch_cat([mask.new_full((mask.size(0), 1), 1), mask[:, :-1]],
                         1)

        scores = get_scores(data_gts, seq, self.opt)
        scores = from_numpy(scores).type_as(input).view(-1, seq_per_img)
        out["reward"] = scores  # .mean()
        if self.opt.entropy_reward_weight > 0:
            entropy = (-(F.softmax(input, dim=2) *
                         F.log_softmax(input, dim=2)).sum(2).data)
            entropy = (entropy * mask).sum(1) / mask.sum(1)
            print("entropy", entropy.mean().item())
            scores = scores + self.opt.entropy_reward_weight * entropy.view(
                -1, seq_per_img)
        # rescale cost to [0,1]
        costs = -scores
        if self.loss_type == "risk" or self.loss_type == "softmax_margin":
            costs = costs - costs.min(1, keepdim=True)[0]
            costs = costs / costs.max(1, keepdim=True)[0]
        # in principle
        # Only risk need such rescale
        # margin should be alright; Let's try.

        # Gather input: BxTxD -> BxT
        input = input.gather(2, seq.unsqueeze(2)).squeeze(2)

        if self.loss_type == "seqnll":
            # input is logsoftmax
            input = input * mask
            input = input.sum(1) / mask.sum(1)
            input = input.view(-1, seq_per_img)

            target = costs.min(1)[1]
            output = F.cross_entropy(input, target)
        elif self.loss_type == "risk":
            # input is logsoftmax
            input = input * mask
            input = input.sum(1)
            input = input.view(-1, seq_per_img)

            output = (F.softmax(input.exp()) * costs).sum(1).mean()

            # test
            # avg_scores = input
            # probs = F.softmax(avg_scores.exp_())
            # loss = (probs * costs.type_as(probs)).sum() / input.size(0)
            # print(output.item(), loss.item())

        elif self.loss_type == "max_margin":
            # input is logits
            input = input * mask
            input = input.sum(1) / mask.sum(1)
            input = input.view(-1, seq_per_img)
            _, __ = costs.min(1, keepdim=True)
            costs_star = _
            input_star = input.gather(1, __)
            output = F.relu(costs - costs_star - input_star +
                            input).max(1)[0] / 2
            output = output.mean()

            # sanity test
            # avg_scores = input + costs
            # scores_with_high_target = avg_scores.clone()
            # scores_with_high_target.scatter_(1, costs.min(1)[1].view(-1, 1), 1e10)

            # target_and_offender_index = scores_with_high_target.sort(1, True)[1][:, 0:2]
            # avg_scores = avg_scores.gather(1, target_and_offender_index)
            # target_index = avg_scores.new_zeros(avg_scores.size(0), dtype=torch.long)
            # loss = F.multi_margin_loss(avg_scores, target_index, size_average=True, margin=0)
            # print(loss.item() * 2, output.item())

        elif self.loss_type == "multi_margin":
            # input is logits
            input = input * mask
            input = input.sum(1) / mask.sum(1)
            input = input.view(-1, seq_per_img)
            _, __ = costs.min(1, keepdim=True)
            costs_star = _
            input_star = input.gather(1, __)
            output = F.relu(costs - costs_star - input_star + input)
            output = output.mean()

            # sanity test
            # avg_scores = input + costs
            # loss = F.multi_margin_loss(avg_scores, costs.min(1)[1], margin=0)
            # print(output, loss)

        elif self.loss_type == "softmax_margin":
            # input is logsoftmax
            input = input * mask
            input = input.sum(1) / mask.sum(1)
            input = input.view(-1, seq_per_img)

            input = input + costs
            target = costs.min(1)[1]
            output = F.cross_entropy(input, target)

        elif self.loss_type == "real_softmax_margin":
            # input is logits
            # This is what was originally defined in Kevin's paper
            # The result should be equivalent to softmax_margin
            input = input * mask
            input = input.sum(1) / mask.sum(1)
            input = input.view(-1, seq_per_img)

            input = input + costs
            target = costs.min(1)[1]
            output = F.cross_entropy(input, target)

        elif self.loss_type == "new_self_critical":
            """
            A different self critical
            Self critical uses greedy decoding score as baseline;
            This setting uses the average score of the remaining samples as the baseline
            (suppose c1...cn n samples, reward1 = score1 - 1/(n-1)(score2+..+scoren) )
            """
            baseline = (scores.sum(1, keepdim=True) -
                        scores) / (scores.shape[1] - 1)
            scores = scores - baseline
            # self cider used as reward to promote diversity (not working that much in this way)
            if getattr(self.opt, "self_cider_reward_weight", 0) > 0:
                _scores = get_self_cider_scores(data_gts, seq, self.opt)
                _scores = from_numpy(_scores).type_as(scores).view(-1, 1)
                _scores = _scores.expand_as(scores - 1)
                scores += self.opt.self_cider_reward_weight * _scores
            output = -input * mask * scores.view(-1, 1)
            output = torch_sum(output) / torch_sum(mask)

        out["loss"] = output
        return out
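The "new_self_critical" branch above computes a leave-one-out baseline: each sampled caption's reward is its score minus the mean score of the other samples drawn for the same image. A small numeric sketch with illustrative values:

# Numeric sketch of the leave-one-out baseline in "new_self_critical".
import torch

scores = torch.tensor([[1.0, 2.0, 3.0]])  # 1 image, 3 sampled captions
baseline = (scores.sum(1, keepdim=True) - scores) / (scores.shape[1] - 1)
print(baseline)           # tensor([[2.5000, 2.0000, 1.5000]])
print(scores - baseline)  # tensor([[-1.5000, 0.0000, 1.5000]])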
Example #18
    def _ssd_discrete_metrics(self, predictions, targets, iou_threshold=0.5, is_cuda=False):
        def __to_cuda(obj):
            if is_cuda:
                obj = obj.cuda()
            return obj

        predicted_boxes = predictions['boxes']
        predicted_labels = predictions['labels']
        predicted_class_scores = predictions['scores']

        target_boxes = targets['boxes']
        target_labels = targets['labels']

        assert len(predicted_boxes) == len(predicted_labels) == len(predicted_class_scores) == len(
            target_boxes) == len(target_labels)

        target_images = list()
        for i in range(len(target_labels)):
            target_images.extend([i] * target_labels[i].size(0))
        target_images = __to_cuda(LongTensor(target_images))
        target_boxes = torch_cat(target_boxes, dim=0)
        target_labels = torch_cat(target_labels, dim=0)

        assert target_images.size(0) == target_boxes.size(0) == target_labels.size(0)

        predicted_images = list()
        for i in range(len(predicted_labels)):
            predicted_images.extend([i] * predicted_labels[i].size(0))
        predicted_images = __to_cuda(LongTensor(predicted_images))
        predicted_boxes = torch_cat(predicted_boxes, dim=0)
        predicted_labels = torch_cat(predicted_labels, dim=0)
        predicted_class_scores = torch_cat(predicted_class_scores, dim=0)

        assert predicted_images.size(0) == predicted_boxes.size(0) == predicted_labels.size(
            0) == predicted_class_scores.size(0)

        average_precisions = torch_zeros(self.num_classes, dtype=torch_float)
        recalls = torch_zeros(self.num_classes, dtype=torch_float)
        precisions = torch_zeros(self.num_classes, dtype=torch_float)
        for c in range(self.num_classes):
            target_class_images = target_images[target_labels == c]
            target_class_boxes = target_boxes[target_labels == c]

            total_objects = target_class_boxes.size(0)

            target_class_boxes_detected = __to_cuda(torch_zeros(total_objects, dtype=torch_uint8))

            class_c_predicted_images = predicted_images[predicted_labels == c]
            class_c_predicted_boxes = predicted_boxes[predicted_labels == c]
            class_c_predicted_class_scores = predicted_class_scores[predicted_labels == c]
            class_c_num_detections = class_c_predicted_boxes.size(0)
            if class_c_num_detections == 0:
                continue

            class_c_predicted_class_scores, sort_ind = torch_sort(class_c_predicted_class_scores, dim=0,
                                                                  descending=True)
            class_c_predicted_images = class_c_predicted_images[sort_ind]
            class_c_predicted_boxes = class_c_predicted_boxes[sort_ind]

            true_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
            false_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
            for d in range(class_c_num_detections):
                this_detection_box = class_c_predicted_boxes[d].unsqueeze(0)
                this_image = class_c_predicted_images[d]

                object_boxes = target_class_boxes[target_class_images == this_image]
                if object_boxes.size(0) == 0:
                    false_positives[d] = 1
                    continue

                overlaps = find_jaccard_overlap(this_detection_box, object_boxes)
                max_overlap, ind = torch_max(overlaps.squeeze(0), dim=0)

                original_ind = LongTensor(range(target_class_boxes.size(0)))[target_class_images == this_image][ind]

                if max_overlap.item() > iou_threshold:
                    if target_class_boxes_detected[original_ind] == 0:
                        true_positives[d] = 1
                        target_class_boxes_detected[original_ind] = 1
                    else:
                        false_positives[d] = 1
                else:
                    false_positives[d] = 1

            cumul_true_positives = torch_cumsum(true_positives, dim=0)
            cumul_false_positives = torch_cumsum(false_positives, dim=0)
            cumul_precision = cumul_true_positives / (cumul_true_positives + cumul_false_positives + 1e-10)
            cumul_recall = cumul_true_positives / total_objects

            recall_thresholds = [x / 10 for x in range(11)]
            interpolated_precisions = __to_cuda(torch_zeros((len(recall_thresholds)), dtype=torch_float))
            for i, threshold in enumerate(recall_thresholds):
                recalls_above_threshold = cumul_recall >= threshold
                if recalls_above_threshold.any():
                    interpolated_precisions[i] = cumul_precision[recalls_above_threshold].max()
                else:
                    interpolated_precisions[i] = 0.
            average_precisions[c] = interpolated_precisions.mean()

            total_true_positives = torch_sum(true_positives)
            recalls[c] = total_true_positives / max(float(total_objects), 1e-10)
            precisions[c] = total_true_positives / max(
                total_true_positives + torch_sum(false_positives), torch_tensor(1e-10))
        return average_precisions.tolist(), recalls.tolist(), precisions.tolist()