def get_sequence_from_user(max_sequence_length: int) -> Tuple[Tensor, Tensor]:
    """
    Ask the user to enter a sequence of token ids and convert it to a source
    token tensor and a source mask tensor for feeding the model.
    """
    enter_message = (
        "\nEnter the desired source sequence token ids separated by spaces: ")
    # asking for user input and splitting it into a sequence of token ids:
    src_seq = list(map(int, input(enter_message).split()))
    n_tokens = len(src_seq)
    if n_tokens > max_sequence_length:
        # truncating the sequence if it is longer than allowed:
        n_tokens = max_sequence_length
        src_seq = src_seq[:max_sequence_length]
    # padding the sequence if it is shorter than the maximum length and
    # converting it to the right format:
    src_seq = torch_cat(
        (
            tensor(src_seq, dtype=torch_long),  # noqa: E501 pylint: disable=not-callable
            torch_zeros((max_sequence_length - n_tokens), dtype=torch_long)
        ),
        dim=-1)
    src_seq = torch_unsqueeze(input=src_seq, dim=0)
    # creating the sequence mask based on the padding done:
    src_seq_mask = torch_cat(
        (
            torch_ones((1, 1, n_tokens), dtype=torch_long),
            torch_zeros((1, 1, max_sequence_length - n_tokens),
                        dtype=torch_long)
        ),
        dim=-1)
    return src_seq, src_seq_mask
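# A minimal sketch (an assumption, not part of the original code) of what
# get_sequence_from_user produces for a short input, with the input() prompt
# replaced by a hard-coded token list so the padding and mask logic can be
# checked standalone:
from torch import cat as torch_cat, ones as torch_ones, zeros as torch_zeros
from torch import long as torch_long, tensor, unsqueeze as torch_unsqueeze


def demo_pad_and_mask(token_ids, max_sequence_length=8):
    # same truncation / padding logic as above, without the input() call:
    n_tokens = min(len(token_ids), max_sequence_length)
    token_ids = token_ids[:max_sequence_length]
    src_seq = torch_cat(
        (tensor(token_ids, dtype=torch_long),
         torch_zeros((max_sequence_length - n_tokens), dtype=torch_long)),
        dim=-1)
    src_seq = torch_unsqueeze(input=src_seq, dim=0)
    src_seq_mask = torch_cat(
        (torch_ones((1, 1, n_tokens), dtype=torch_long),
         torch_zeros((1, 1, max_sequence_length - n_tokens),
                     dtype=torch_long)),
        dim=-1)
    return src_seq, src_seq_mask


# demo_pad_and_mask([5, 17, 3]) -> sequence tensor([[5, 17, 3, 0, 0, 0, 0, 0]])
# of shape (1, 8) and mask tensor([[[1, 1, 1, 0, 0, 0, 0, 0]]]) of shape (1, 1, 8).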
def xy_to_cxcy(xy):
    """Convert boundary coordinates to center-size coordinates, as used in
    the SSD rewrite in PyTorch.

    This is implemented as shown in
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection.
    Some modifications are made. All credits to @sgrvinod.
    """
    return torch_cat([(xy[:, 2:] + xy[:, :2]) / 2,  # center coordinates
                      xy[:, 2:] - xy[:, :2]], 1)    # width and height
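# A quick numeric check of xy_to_cxcy (the box values here are illustrative;
# rows are fractional (x_min, y_min, x_max, y_max) coordinates):
from torch import cat as torch_cat, tensor

xy = tensor([[0.2, 0.2, 0.6, 0.4]])
cxcy = torch_cat([(xy[:, 2:] + xy[:, :2]) / 2, xy[:, 2:] - xy[:, :2]], 1)
# cxcy == tensor([[0.4, 0.3, 0.4, 0.2]]): center (0.4, 0.3), width 0.4, height 0.2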
def forward(self, conv4_3_features, conv7_features, conv8_2_features,
            conv9_2_features, conv10_2_features, conv11_2_features):
    batch_size = conv4_3_features.size(0)
    locations = self._predict_locations(
        batch_size, conv4_3_features, conv7_features, conv8_2_features,
        conv9_2_features, conv10_2_features, conv11_2_features)
    locations = torch_cat(list(locations), dim=1)
    classes_scores = self._predict_classes(
        batch_size, conv4_3_features, conv7_features, conv8_2_features,
        conv9_2_features, conv10_2_features, conv11_2_features)
    classes_scores = torch_cat(list(classes_scores), dim=1)
    return locations, classes_scores
def core(self, it, fc_feats_ph, att_feats_ph, memory, state, mask):
    if len(state) == 0:
        ys = it.unsqueeze(1)
    else:
        ys = torch_cat([state[0][0], it.unsqueeze(1)], dim=1)
    out = self.model.decode(memory, mask, ys,
                            subsequent_mask(ys.size(1)).to(memory.device))
    return out[:, -1], [ys.unsqueeze(0)]
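# core() above relies on a subsequent_mask helper whose definition is not part
# of this snippet; this is a hedged sketch of the usual
# Annotated-Transformer-style implementation, shown only as an assumption:
from torch import ones as torch_ones, tril as torch_tril, uint8 as torch_uint8


def subsequent_mask(size):
    # lower-triangular (1, size, size) mask: position i may attend to j <= i
    return torch_tril(torch_ones((1, size, size), dtype=torch_uint8)) == 1


# subsequent_mask(3).squeeze(0) ->
# tensor([[ True, False, False],
#         [ True,  True, False],
#         [ True,  True,  True]])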
def cxcy_to_xy(cxcy):
    """Convert center-size coordinates to boundary coordinates, as used in
    the SSD rewrite in PyTorch.

    This is implemented as shown in
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection.
    Some modifications are made. All credits to @sgrvinod.
    """
    return torch_cat(
        [cxcy[:, :2] - (cxcy[:, 2:] / 2),   # x_min, y_min
         cxcy[:, :2] + (cxcy[:, 2:] / 2)],  # x_max, y_max
        1)
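# Quick inverse check: cxcy_to_xy undoes xy_to_cxcy (same illustrative values
# as in the xy_to_cxcy example above):
from torch import cat as torch_cat, tensor

cxcy = tensor([[0.4, 0.3, 0.4, 0.2]])
xy = torch_cat([cxcy[:, :2] - (cxcy[:, 2:] / 2),
                cxcy[:, :2] + (cxcy[:, 2:] / 2)], 1)
# xy == tensor([[0.2, 0.2, 0.6, 0.4]]), the boundary box we started from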
def gcxgcy_to_cxcy(gcxgcy, priors_cxcy):
    """Decodes bounding boxes from the corresponding prior boxes, both in
    center-size coordinates form, as used in the SSD rewrite in PyTorch.

    This is implemented as shown in
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection.
    Some modifications are made. All credits to @sgrvinod.
    """
    return torch_cat([
        gcxgcy[:, :2] * priors_cxcy[:, 2:] / 10 + priors_cxcy[:, :2],
        torch_exp(gcxgcy[:, 2:] / 5) * priors_cxcy[:, 2:]
    ], 1)
def cxcy_to_gcxgcy(cxcy, priors_cxcy):
    """Encodes bounding boxes to the corresponding prior boxes, both in
    center-size coordinates form, as used in the SSD rewrite in PyTorch.

    This is implemented as shown in
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection.
    Some modifications are made. All credits to @sgrvinod.
    """
    # https://github.com/weiliu89/caffe/issues/155
    return torch_cat(
        [(cxcy[:, :2] - priors_cxcy[:, :2]) / (priors_cxcy[:, 2:] / 10),
         torch_log(cxcy[:, 2:] / priors_cxcy[:, 2:]) * 5], 1)
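# Hedged roundtrip check for the two prior-box transforms above: encoding a
# box against a prior and then decoding it should recover the original
# center-size coordinates (tensor values here are illustrative):
from torch import cat as torch_cat, exp as torch_exp, log as torch_log, tensor

priors_cxcy = tensor([[0.5, 0.5, 0.2, 0.2]])
cxcy = tensor([[0.45, 0.55, 0.1, 0.3]])

g = torch_cat([(cxcy[:, :2] - priors_cxcy[:, :2]) / (priors_cxcy[:, 2:] / 10),
               torch_log(cxcy[:, 2:] / priors_cxcy[:, 2:]) * 5], 1)
decoded = torch_cat([g[:, :2] * priors_cxcy[:, 2:] / 10 + priors_cxcy[:, :2],
                     torch_exp(g[:, 2:] / 5) * priors_cxcy[:, 2:]], 1)
# decoded matches cxcy up to floating-point error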
def get_pooled_features(self, x: str) -> Tensor:
    """
    Get the concatenation of [mean, max, last] of the last hidden state.

    Parameters
    ----------
    x: str
        The pre-processed string associated with the issue. If you have two
        separate fields "title" and "body", you will want to pre-process
        these fields with the process_dict method before calling this.

    Returns
    -------
    Tensor
        An embedding in the form of a Tensor with the shape (1, 2400).
    """
    raw = self.get_raw_features(x)
    # return [mean, max, last] with size of (1, self.learn.emb_sz * 3)
    return torch_cat([raw.mean(dim=1), raw.max(dim=1)[0], raw[:, -1, :]],
                     dim=-1)
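# A minimal sketch of the [mean, max, last] pooling used above, on a dummy
# hidden-state tensor (sizes are assumptions: batch 1, 10 timesteps, emb_sz
# 800, giving the (1, 2400) embedding mentioned in the docstring):
from torch import cat as torch_cat, randn as torch_randn

raw = torch_randn(1, 10, 800)                 # (batch, seq_len, emb_sz)
pooled = torch_cat([raw.mean(dim=1),          # (1, 800) mean over time
                    raw.max(dim=1)[0],        # (1, 800) max over time
                    raw[:, -1, :]], dim=-1)   # (1, 800) last hidden state
assert pooled.shape == (1, 2400)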
def forward(self, input_step, prev_hidden_state, encoder_outputs):
    '''
    NOTE: This forward function happens one timestep at a time!
    Therefore:
    input_step = (1, batch_size)
        --> single input word fed to the GRU
    prev_hidden_state = (num_layers*num_directions, batch_size, hidden_size)
        --> final hidden state of the encoder
    encoder_outputs = (max_length, batch_size, hidden_size)
        --> final output state of the encoder
    '''
    if self.use_embedding is True:
        # this will return shape of (1, batch_size, hidden_size)
        # --> since embedding_size = hidden_size
        output = self.embedding(input_step)
    else:
        output = input_step
    # output = (1, batch_size, hidden_size);
    # prev_hidden_state = (num_layers*num_directions, batch_size, hidden_size)
    output, prev_hidden_state = self.gru(output, prev_hidden_state)
    # attention_weights = (batch_size, 1, max_length)
    attention_weights = self.attention(output, encoder_outputs)
    '''
    BMM = batch matrix multiplication. Here we're multiplying the attention
    weights with the encoder outputs. Before we can do that, however, we
    need to transpose the encoder outputs to a shape of
    (batch_size, max_length, hidden_size). This means we're doing
    (batch_size, 1, max_length) * (batch_size, max_length, hidden_size),
    which works because the number of columns of the first matrix equals
    the number of rows of the second matrix: max_length.
    This returns a size of (batch_size, 1, hidden_size).
    '''
    context = attention_weights.bmm(encoder_outputs.transpose(0, 1))
    # we want a shape of (batch_size, hidden_size) for concatenation,
    # so we squeeze the output along dimension 0:
    output = output.squeeze(0)
    # likewise, we squeeze the context along dimension 1:
    context = context.squeeze(1)
    # concatenate the GRU output and context vector along dimension 1;
    # this returns (batch_size, hidden_size*2)
    concatenated_output = torch_cat((output, context), 1)
    # reduce (batch_size, hidden_size*2) to (batch_size, hidden_size)
    concatenated_output = torch_tanh(self.concatenate(concatenated_output))
    # transform from (batch_size, hidden_size) to (batch_size, vocab_size)
    output = self.output(concatenated_output)
    # take softmax across dimension 1 - the columns
    output = F.log_softmax(output, dim=1)
    return output, prev_hidden_state
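# Shape-only sketch of the attention/context concatenation performed above,
# with made-up sizes (batch 4, max_length 12, hidden 256) to make the bmm
# bookkeeping concrete:
from torch import bmm as torch_bmm, cat as torch_cat, randn as torch_randn

attention_weights = torch_randn(4, 1, 12)   # (batch, 1, max_length)
encoder_outputs = torch_randn(12, 4, 256)   # (max_length, batch, hidden)
context = torch_bmm(attention_weights,
                    encoder_outputs.transpose(0, 1))  # (4, 1, 256)
gru_output = torch_randn(1, 4, 256).squeeze(0)        # (batch, hidden)
concatenated = torch_cat((gru_output, context.squeeze(1)), 1)
assert concatenated.shape == (4, 512)       # (batch, hidden * 2)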
def forward(self, v_in):
    """Forward pass.

    :param v_in: The input to the RNN encoder of the Masker.
    :type v_in: numpy.core.multiarray.ndarray
    :return: The output of the RNN encoder of the Masker.
    :rtype: torch.autograd.variable.Variable
    """
    batch_size = v_in.size()[0]
    seq_length = v_in.size()[1]
    h_t_f = Variable(torch_zeros(batch_size, self._input_dim))
    h_t_b = Variable(torch_zeros(batch_size, self._input_dim))
    h_enc = Variable(
        torch_zeros(batch_size, seq_length - (2 * self._context_length),
                    2 * self._input_dim))
    v_tr = v_in[:, :, :self._input_dim]
    if not self._debug and torch_has_cudnn:
        h_t_f = h_t_f.cuda()
        h_t_b = h_t_b.cuda()
        h_enc = h_enc.cuda()
    for t in range(seq_length):
        h_t_f = self.gru_enc_f((v_tr[:, t, :]), h_t_f)
        h_t_b = self.gru_enc_b((v_tr[:, seq_length - t - 1, :]), h_t_b)
        if self._context_length <= t < seq_length - self._context_length:
            h_t = torch_cat([
                h_t_f + v_tr[:, t, :],
                h_t_b + v_tr[:, seq_length - t - 1, :]
            ], dim=1)
            h_enc[:, t - self._context_length, :] = h_t
    return h_enc
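# Shape sketch of the bidirectional concatenation inside the loop above:
# the forward and backward hidden states (each (batch, input_dim)) are
# summed with the corresponding input frames and concatenated along dim=1
# (sizes here are illustrative):
from torch import cat as torch_cat, randn as torch_randn

h_t_f = torch_randn(8, 64)          # forward GRU state
h_t_b = torch_randn(8, 64)          # backward GRU state
v_t_f = torch_randn(8, 64)          # input frame at time t
v_t_b = torch_randn(8, 64)          # input frame at the mirrored time
h_t = torch_cat([h_t_f + v_t_f, h_t_b + v_t_b], dim=1)
assert h_t.shape == (8, 128)        # (batch, 2 * input_dim)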
def detect_objects(self, image_as_tensor, min_score, max_overlap, top_k):
    predicted_locs, predicted_scores = self.forward(image_as_tensor)
    batch_size = predicted_locs.size(0)
    n_priors = self.priors_cxcy.size(0)
    predicted_scores = F.softmax(predicted_scores, dim=2)
    all_images_boxes = list()
    all_images_labels = list()
    all_images_scores = list()
    assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)
    for i in range(batch_size):
        decoded_locs = cxcy_to_xy(
            gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy))
        image_boxes = list()
        image_labels = list()
        image_scores = list()
        for c in range(self.num_classes - 1):
            class_scores = predicted_scores[i][:, c]
            score_above_min_score = class_scores > min_score
            n_above_min_score = score_above_min_score.sum().item()
            if n_above_min_score == 0:
                continue
            class_scores = class_scores[score_above_min_score]
            class_decoded_locs = decoded_locs[score_above_min_score]
            class_scores, sort_ind = class_scores.sort(dim=0,
                                                       descending=True)
            class_decoded_locs = class_decoded_locs[sort_ind]
            overlap = find_jaccard_overlap(class_decoded_locs,
                                           class_decoded_locs)
            # non-maximum suppression over the score-sorted boxes:
            suppress = self._to_cuda(
                torch_zeros((n_above_min_score), dtype=torch_uint8))
            for box in range(class_decoded_locs.size(0)):
                if suppress[box] == 1:
                    continue
                suppress = torch_max(
                    suppress,
                    (overlap[box] > max_overlap).type(torch_uint8))
                suppress[box] = 0
            kept_indices = self._to_cuda(
                suppress.type(BoolTensor).logical_not())
            # clamping the kept boxes to the [0, 1] image bounds:
            locs = class_decoded_locs[kept_indices].tolist()
            for loc_index, loc in enumerate(locs):
                locs[loc_index] = [
                    max(loc[0], 0.), max(loc[1], 0.),
                    min(loc[2], 1.), min(loc[3], 1.)
                ]
            image_boxes.append(self._to_cuda(FloatTensor(locs)))
            image_labels.append(
                self._to_cuda(LongTensor(kept_indices.sum().item() * [c])))
            image_scores.append(self._to_cuda(class_scores[kept_indices]))
        if len(image_boxes) == 0:
            image_boxes.append(
                self._to_cuda(FloatTensor([[0., 0., 0., 0.]])))
            image_labels.append(self._to_cuda(LongTensor([120])))
            image_scores.append(self._to_cuda(FloatTensor([0.])))
        image_boxes = self._to_cuda(torch_cat(image_boxes, dim=0))
        image_labels = self._to_cuda(torch_cat(image_labels, dim=0))
        image_scores = self._to_cuda(torch_cat(image_scores, dim=0))
        n_objects = image_scores.size(0)
        if n_objects > top_k:
            image_scores, sort_ind = image_scores.sort(dim=0,
                                                       descending=True)
            image_scores = image_scores[:top_k]
            image_boxes = image_boxes[sort_ind][:top_k]
            image_labels = image_labels[sort_ind][:top_k]
        all_images_boxes.append(image_boxes)
        all_images_labels.append(image_labels)
        all_images_scores.append(image_scores)
    return all_images_boxes, all_images_labels, all_images_scores
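# Hedged toy walk-through of the suppression loop above: given pairwise
# Jaccard overlaps for three score-sorted boxes (values made up), any box
# overlapping an unsuppressed earlier box by more than max_overlap is
# suppressed:
from torch import max as torch_max, tensor, uint8 as torch_uint8
from torch import zeros as torch_zeros

max_overlap = 0.5
overlap = tensor([[1.0, 0.7, 0.1],
                  [0.7, 1.0, 0.2],
                  [0.1, 0.2, 1.0]])
suppress = torch_zeros(3, dtype=torch_uint8)
for box in range(3):
    if suppress[box] == 1:
        continue
    suppress = torch_max(suppress,
                         (overlap[box] > max_overlap).type(torch_uint8))
    suppress[box] = 0
# suppress == tensor([0, 1, 0]): box 1 is removed by box 0, box 2 survives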
def _ssd_discrete_metrics(self, predictions, targets, is_cuda=False,
                          *unused_args, **unused_kwargs):

    def __to_cuda(obj):
        if is_cuda:
            obj = obj.cuda()
        return obj

    predicted_boxes = predictions['boxes']
    predicted_labels = predictions['labels']
    predicted_class_scores = predictions['scores']
    target_boxes = targets['boxes']
    target_labels = targets['labels']
    assert len(predicted_boxes) == len(predicted_labels) == \
        len(predicted_class_scores) == len(target_boxes) == \
        len(target_labels)

    target_images = list()
    for i in range(len(target_labels)):
        target_images.extend([i] * target_labels[i].size(0))
    target_images = __to_cuda(LongTensor(target_images))
    target_boxes = torch_cat(target_boxes, dim=0)
    target_labels = torch_cat(target_labels, dim=0)
    assert target_images.size(0) == target_boxes.size(0) == \
        target_labels.size(0)

    predicted_images = list()
    for i in range(len(predicted_labels)):
        predicted_images.extend([i] * predicted_labels[i].size(0))
    predicted_images = __to_cuda(LongTensor(predicted_images))
    predicted_boxes = torch_cat(predicted_boxes, dim=0)
    predicted_labels = torch_cat(predicted_labels, dim=0)
    predicted_class_scores = torch_cat(predicted_class_scores, dim=0)
    assert predicted_images.size(0) == predicted_boxes.size(0) == \
        predicted_labels.size(0) == predicted_class_scores.size(0)

    average_precisions = torch_zeros(self.num_classes, dtype=torch_float)
    recalls = torch_zeros(self.num_classes, dtype=torch_float)
    precisions = torch_zeros(self.num_classes, dtype=torch_float)
    for c in range(self.num_classes):
        target_class_images = target_images[target_labels == c]
        target_class_boxes = target_boxes[target_labels == c]
        total_objects = target_class_boxes.size(0)
        target_class_boxes_detected = __to_cuda(
            torch_zeros(total_objects, dtype=torch_uint8))
        class_c_predicted_images = predicted_images[predicted_labels == c]
        class_c_predicted_boxes = predicted_boxes[predicted_labels == c]
        class_c_predicted_class_scores = \
            predicted_class_scores[predicted_labels == c]
        class_c_num_detections = class_c_predicted_boxes.size(0)
        if class_c_num_detections == 0:
            continue
        class_c_predicted_class_scores, sort_ind = torch_sort(
            class_c_predicted_class_scores, dim=0, descending=True)
        class_c_predicted_images = class_c_predicted_images[sort_ind]
        class_c_predicted_boxes = class_c_predicted_boxes[sort_ind]
        true_positives = __to_cuda(
            torch_zeros(class_c_num_detections, dtype=torch_float))
        false_positives = __to_cuda(
            torch_zeros(class_c_num_detections, dtype=torch_float))
        for d in range(class_c_num_detections):
            this_detection_box = shapely_box(
                *class_c_predicted_boxes[d].data)
            this_image = class_c_predicted_images[d]
            object_boxes = target_class_boxes[
                target_class_images == this_image]
            if object_boxes.size(0) == 0:
                false_positives[d] = 1
                continue
            ground_truth_contains_prediction_center = [
                shapely_box(*box.data).contains(this_detection_box.centroid)
                for box in object_boxes
            ]
            for ind, prediction_center_in_ground_truth in enumerate(
                    ground_truth_contains_prediction_center):
                original_ind = LongTensor(
                    range(target_class_boxes.size(0)))[
                        target_class_images == this_image][ind]
                if prediction_center_in_ground_truth:
                    if target_class_boxes_detected[original_ind] == 0:
                        true_positives[d] = 1
                        target_class_boxes_detected[original_ind] = 1
                    else:
                        false_positives[d] = 1
                else:
                    false_positives[d] = 1
        cumul_true_positives = torch_cumsum(true_positives, dim=0)
        cumul_false_positives = torch_cumsum(false_positives, dim=0)
        cumul_precision = cumul_true_positives / (
            cumul_true_positives + cumul_false_positives + 1e-10)
        cumul_recall = cumul_true_positives / total_objects
        recall_thresholds = [x / 10 for x in range(11)]
        interpolated_precisions = __to_cuda(
            torch_zeros((len(recall_thresholds)), dtype=torch_float))
        for i, threshold in enumerate(recall_thresholds):
            recalls_above_threshold = cumul_recall >= threshold
            if recalls_above_threshold.any():
                interpolated_precisions[i] = \
                    cumul_precision[recalls_above_threshold].max()
            else:
                interpolated_precisions[i] = 0.
        average_precisions[c] = interpolated_precisions.mean()
        total_true_positives = torch_sum(true_positives)
        recalls[c] = total_true_positives / max(float(total_objects), 1e-10)
        precisions[c] = total_true_positives / max(
            total_true_positives + torch_sum(false_positives),
            torch_tensor(1e-10))
    return (average_precisions.tolist(), recalls.tolist(),
            precisions.tolist())
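# A small hedged numeric example of the 11-point interpolated average
# precision computed at the end of _ssd_discrete_metrics above, using
# made-up cumulative precision/recall curves:
from torch import float as torch_float, tensor, zeros as torch_zeros

cumul_precision = tensor([1.0, 0.5, 0.667, 0.75])
cumul_recall = tensor([0.25, 0.25, 0.5, 0.75])
recall_thresholds = [x / 10 for x in range(11)]
interpolated = torch_zeros(len(recall_thresholds), dtype=torch_float)
for i, threshold in enumerate(recall_thresholds):
    above = cumul_recall >= threshold
    interpolated[i] = cumul_precision[above].max() if above.any() else 0.
average_precision = interpolated.mean()  # mean of the 11 interpolated values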
def collate(self, tensors: Iterable[Tensor]) -> Tensor:
    return torch_cat(tensors=list(tensors), dim=0)
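# Trivial usage sketch of collate above: stacking per-sample tensors that
# already carry a leading batch dimension (sizes here are illustrative):
from torch import cat as torch_cat, randn as torch_randn

batch = [torch_randn(1, 3, 32, 32) for _ in range(4)]
collated = torch_cat(tensors=batch, dim=0)  # shape (4, 3, 32, 32)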
def __getitem__(self, item):
    row_data = self.data.iloc[item]
    temporal_tensor = []
    self.individual_tr.start()
    for i, p in enumerate(row_data.img_paths):
        # get the events tensor with the appropriate number of channels
        # depending on polarity
        img = self.read_image(p)
        img = self.individual_tr(img)
        events_tensor = self.to_tensor_transform(img)
        if self.several_features:
            other_polarity_path = row_data.other_feature[i]
            if op.exists(other_polarity_path):
                channel_2_img = self.read_image(other_polarity_path)
                channel_2_img = self.individual_tr(channel_2_img)
                channel_2_tensor = self.to_tensor_transform(channel_2_img)
                # image channel is on dimension 0
                events_tensor = torch_cat(
                    (events_tensor, channel_2_tensor), dim=0)
            else:
                print("intended several features but path {0} does not exist"
                      .format(other_polarity_path))
        temporal_tensor.append(events_tensor)
    temporal_tensor = torch_cat(temporal_tensor, dim=0)
    with open(row_data.info_dict) as fd:
        info_dict = json.load(fd)
    # get the texture image associated with the first event image
    # (the former inline temp2text lookup now lives in get_img_raw_path)
    img_raw_p = self.get_img_raw_path(row_data)
    img_raw = self.read_image(img_raw_p, mode="RGB")
    img_raw = self.individual_tr(img_raw)
    img_raw_tensor = self.to_tensor_transform(img_raw)
    # get the future image. Take into account that the class image of the
    # last temporal image has to be the first element of the answer, even
    # if, technically, it is not part of the future
    p = row_data.class_imgs[-1]
    # get the class image
    class_img_tensor = self.get_class_img(p, info_dict['factor'])
    future_tensor = [class_img_tensor]
    if self.future_process:
        for p in row_data.future_data:
            # get the class image
            class_img_tensor = self.get_class_img(p, info_dict['factor'])
            future_tensor.append(class_img_tensor)
    future_tensor = torch_cat(future_tensor, dim=0)
    if self.transforms:
        # we have to concatenate the data so the transformation is
        # performed equally at all levels
        input_ = torch_cat(
            [img_raw_tensor, temporal_tensor, future_tensor.float()],
            dim=0)
        input_ = self.transforms(input_)
        img_raw_size = img_raw_tensor.shape[0]
        img_raw_tensor = input_[:img_raw_size, :, :]
        temporal_tensor = input_[img_raw_size:img_raw_size +
                                 temporal_tensor.shape[0], :, :]
        future_tensor = input_[-future_tensor.shape[0]:, :, :].int()
    self.individual_tr.end()
    return (img_raw_tensor, temporal_tensor), future_tensor
def __getitem__(self, item):
    if self.future_process:
        window, future, _, _ = self.data.iloc[item]
    else:
        window, _, _ = self.data.iloc[item]
        future = []
    temporal_tensor = []
    self.individual_tr.start()
    for p in window:
        # get the events tensor with the appropriate number of channels
        # depending on polarity
        img = self.read_image(p)
        img = self.individual_tr(img)
        events_tensor = self.to_tensor_transform(img)
        other_polarity_path = p.replace(
            self.init_path, self.add_channel) if len(
                self.add_channel) else None
        if other_polarity_path:
            channel_2_img = self.read_image(other_polarity_path)
            channel_2_img = self.individual_tr(channel_2_img)
            channel_2_tensor = self.to_tensor_transform(channel_2_img)
            # image channel is on dimension 0
            events_tensor = torch_cat((events_tensor, channel_2_tensor),
                                      dim=0)
        temporal_tensor.append(events_tensor)
    temporal_tensor = torch_cat(temporal_tensor, dim=0)
    # get the texture image associated with the first event image
    p = window[0]
    r_name = self.temp2txt[op.basename(p)]
    # appending a default extension only when neither one is present
    # (the original "or" condition was always true):
    if not (r_name.endswith(".png") or r_name.endswith(".jpg")):
        r_name += ".png"
    img_raw_p = op.join(self.img_raw_path, r_name)
    img_raw = self.read_image(img_raw_p, mode="RGB")
    img_raw = self.individual_tr(img_raw)
    img_raw_tensor = self.to_tensor_transform(img_raw)
    # get the future image. Take into account that the class image of the
    # last temporal image has to be the first element of the answer, even
    # if, technically, it is not part of the future
    p = window[-1]
    # get the class image
    class_img_tensor = self.get_class_img(p)
    future_tensor = [class_img_tensor]
    for p in future:
        # get the class image
        class_img_tensor = self.get_class_img(p)
        future_tensor.append(class_img_tensor)
    future_tensor = torch_cat(future_tensor, dim=0)
    if self.transforms:
        # we have to concatenate the data so the transformation is
        # performed equally at all levels
        input_ = torch_cat(
            [img_raw_tensor, temporal_tensor, future_tensor.float()],
            dim=0)
        input_ = self.transforms(input_)
        img_raw_size = img_raw_tensor.shape[0]
        img_raw_tensor = input_[:img_raw_size, :, :]
        temporal_tensor = input_[img_raw_size:img_raw_size +
                                 temporal_tensor.shape[0], :, :]
        future_tensor = input_[-future_tensor.shape[0]:, :, :].int()
    self.individual_tr.end()
    return (img_raw_tensor, temporal_tensor), future_tensor
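# Minimal sketch of the concat-transform-split trick used in both
# __getitem__ implementations above: stack all image tensors along the
# channel dimension so one spatial transform is applied identically to every
# level, then split back by channel counts (the horizontal flip here is a
# stand-in assumption for self.transforms):
from torch import cat as torch_cat, flip as torch_flip, randn as torch_randn

img_raw_tensor = torch_randn(3, 64, 64)     # RGB texture image
temporal_tensor = torch_randn(5, 64, 64)    # stacked event frames
future_tensor = torch_randn(2, 64, 64)      # class images

input_ = torch_cat([img_raw_tensor, temporal_tensor, future_tensor], dim=0)
input_ = torch_flip(input_, dims=[-1])      # same flip for every channel
img_raw_size = img_raw_tensor.shape[0]
img_raw_tensor = input_[:img_raw_size]
temporal_tensor = input_[img_raw_size:img_raw_size + temporal_tensor.shape[0]]
future_tensor = input_[-future_tensor.shape[0]:]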
def predict(  # pylint: disable=too-many-arguments
        self,
        src_sequences: Tensor,
        src_masks: Tensor,
        tgt_bos_token: int,
        decoding_method: str = 'greedy',
        gpu_if_possible: bool = True) -> Tensor:
    """
    Predict target token sequences from source token sequences.
    """
    # selecting the device handling computations:
    device = select_device(gpu_if_possible=gpu_if_possible)
    # moving model parameters and buffers to such device:
    self.model.to(device)
    # moving inputs to such device:
    src_sequences = src_sequences.to(device)
    src_masks = src_masks.to(device)
    # switching to inference mode:
    self.model.eval()
    if decoding_method == 'greedy':
        # greedy decoding:
        # computing encoder outputs, i.e. encoded representations of
        # source tokens - from dimensionality (samples, tokens) to
        # dimensionality (samples, tokens, features):
        src_encoded_tokens = self.model.encode(src_tokens=src_sequences,
                                               src_mask=src_masks)
        # initializing predicted output sequences:
        cumulative_tgt_sequences = torch_ones((1, 1), requires_grad=False)\
            .fill_(value=tgt_bos_token).type_as(src_sequences)
        # for each target position, the respective token is sequentially
        # predicted, given the decoder auto-regressive predictive nature -
        # for all sequences at the same time:
        for _ in range(self.max_sequence_length - 1):
            # computing logits - from dimensionality (samples, tokens,
            # features) to dimensionality (samples, tokens, features):
            next_token_logits = self.model.decode(
                src_encoded_tokens=src_encoded_tokens,
                src_mask=src_masks,
                tgt_tokens=cumulative_tgt_sequences,
                tgt_mask=allowed_positions_to_attend(
                    # positions to attend equal computed target tokens:
                    n_positions=cumulative_tgt_sequences.size(1)).to(device))
            # turning the logits of the next (last) tokens in the sequences
            # into log-probabilities - from dimensionality (samples,
            # tokens, features) to dimensionality (samples, features):
            next_token_log_probabilities = self.model.log_softmax_layer(
                next_token_logits[:, -1]  # next (last) tokens
            )
            # discretizing probabilities to predicted tokens - from
            # dimensionality (samples, features) to dimensionality
            # (samples):
            next_tokens = torch_max(next_token_log_probabilities,
                                    dim=1).indices[0]
            # concatenating the newly predicted tokens to the sequences of
            # already predicted tokens:
            cumulative_tgt_sequences = torch_cat(
                (cumulative_tgt_sequences,
                 torch_ones((1, 1)).type_as(src_sequences)
                 .fill_(next_tokens)),
                dim=1)
            # FIXME: shapes not understood
        # TODO: truncate the different predicted sequences in the
        # mini-batch from their respective first padding token on
        return cumulative_tgt_sequences
    raise NotImplementedError("Unavailable decoding method: " +
                              decoding_method)
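# Isolated sketch of the greedy step used in predict() above: at each
# position the arg-max token of the last time step's log-probabilities is
# appended to the running target sequence (the random logits here are a
# stand-in assumption for self.model.decode output):
from torch import cat as torch_cat, long as torch_long, max as torch_max
from torch import ones as torch_ones, randn as torch_randn

vocab_size = 100
tgt = torch_ones((1, 1), dtype=torch_long)  # running sequence; BOS id = 1
for _ in range(4):
    logits = torch_randn(1, tgt.size(1), vocab_size)  # fake decoder output
    next_token = torch_max(logits[:, -1], dim=1).indices[0]
    tgt = torch_cat(
        (tgt, torch_ones((1, 1), dtype=torch_long).fill_(next_token)), dim=1)
# tgt now has shape (1, 5): BOS plus four greedily chosen tokens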
def forward(self, input, seq, data_gts):
    """
    Input is either logits or log softmax
    """
    out = {}
    batch_size = input.size(0)  # batch_size = sample_size * seq_per_img
    seq_per_img = batch_size // len(data_gts)
    assert seq_per_img == self.opt.train_sample_n, seq_per_img
    mask = (seq > 0).float()
    mask = torch_cat([mask.new_full((mask.size(0), 1), 1), mask[:, :-1]], 1)
    scores = get_scores(data_gts, seq, self.opt)
    scores = from_numpy(scores).type_as(input).view(-1, seq_per_img)
    out["reward"] = scores  # .mean()
    if self.opt.entropy_reward_weight > 0:
        entropy = (-(F.softmax(input, dim=2) *
                     F.log_softmax(input, dim=2)).sum(2).data)
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        print("entropy", entropy.mean().item())
        scores = scores + self.opt.entropy_reward_weight * entropy.view(
            -1, seq_per_img)
    # rescale cost to [0,1]
    costs = -scores
    if self.loss_type == "risk" or self.loss_type == "softmax_margin":
        costs = costs - costs.min(1, keepdim=True)[0]
        costs = costs / costs.max(1, keepdim=True)[0]
        # in principle, only risk needs such a rescale;
        # margin should be alright - let's try.
    # Gather input: BxTxD -> BxT
    input = input.gather(2, seq.unsqueeze(2)).squeeze(2)
    if self.loss_type == "seqnll":
        # input is logsoftmax
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        target = costs.min(1)[1]
        output = F.cross_entropy(input, target)
    elif self.loss_type == "risk":
        # input is logsoftmax
        input = input * mask
        input = input.sum(1)
        input = input.view(-1, seq_per_img)
        output = (F.softmax(input.exp(), dim=1) * costs).sum(1).mean()
        # test
        # avg_scores = input
        # probs = F.softmax(avg_scores.exp_())
        # loss = (probs * costs.type_as(probs)).sum() / input.size(0)
        # print(output.item(), loss.item())
    elif self.loss_type == "max_margin":
        # input is logits
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        costs_star, costs_star_ind = costs.min(1, keepdim=True)
        input_star = input.gather(1, costs_star_ind)
        output = F.relu(costs - costs_star - input_star +
                        input).max(1)[0] / 2
        output = output.mean()
        # sanity test
        # avg_scores = input + costs
        # scores_with_high_target = avg_scores.clone()
        # scores_with_high_target.scatter_(1, costs.min(1)[1].view(-1, 1), 1e10)
        # target_and_offender_index = scores_with_high_target.sort(1, True)[1][:, 0:2]
        # avg_scores = avg_scores.gather(1, target_and_offender_index)
        # target_index = avg_scores.new_zeros(avg_scores.size(0), dtype=torch.long)
        # loss = F.multi_margin_loss(avg_scores, target_index, size_average=True, margin=0)
        # print(loss.item() * 2, output.item())
    elif self.loss_type == "multi_margin":
        # input is logits
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        costs_star, costs_star_ind = costs.min(1, keepdim=True)
        input_star = input.gather(1, costs_star_ind)
        output = F.relu(costs - costs_star - input_star + input)
        output = output.mean()
        # sanity test
        # avg_scores = input + costs
        # loss = F.multi_margin_loss(avg_scores, costs.min(1)[1], margin=0)
        # print(output, loss)
    elif self.loss_type == "softmax_margin":
        # input is logsoftmax
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        input = input + costs
        target = costs.min(1)[1]
        output = F.cross_entropy(input, target)
    elif self.loss_type == "real_softmax_margin":
        # input is logits
        # This is what was originally defined in Kevin's paper; the result
        # should be equivalent to softmax_margin
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        input = input + costs
        target = costs.min(1)[1]
        output = F.cross_entropy(input, target)
    elif self.loss_type == "new_self_critical":
        """
        A different self-critical:
        self-critical uses the greedy decoding score as the baseline;
        this setting uses the average score of the other samples as the
        baseline (suppose c1...cn are n samples, then
        reward1 = score1 - 1/(n-1) * (score2 + ... + scoren)).
        """
        baseline = (scores.sum(1, keepdim=True) - scores) / (
            scores.shape[1] - 1)
        scores = scores - baseline
        # self-CIDEr used as a reward to promote diversity (not working
        # that much in this way)
        if getattr(self.opt, "self_cider_reward_weight", 0) > 0:
            _scores = get_self_cider_scores(data_gts, seq, self.opt)
            _scores = from_numpy(_scores).type_as(scores).view(-1, 1)
            _scores = _scores.expand_as(scores - 1)
            scores += self.opt.self_cider_reward_weight * _scores
        output = -input * mask * scores.view(-1, 1)
        output = torch_sum(output) / torch_sum(mask)
    out["loss"] = output
    return out
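# Numeric sketch of the leave-one-out baseline used by "new_self_critical"
# above: each sample's reward is its score minus the mean score of the other
# samples drawn for the same image (scores here are made up):
from torch import tensor

scores = tensor([[3.0, 1.0, 2.0]])  # (images, samples per image)
baseline = (scores.sum(1, keepdim=True) - scores) / (scores.shape[1] - 1)
advantages = scores - baseline
# baseline == tensor([[1.5, 2.5, 2.0]]); advantages == tensor([[1.5, -1.5, 0.0]])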
def _ssd_discrete_metrics(self, predictions, targets, iou_threshold=0.5,
                          is_cuda=False):

    def __to_cuda(obj):
        if is_cuda:
            obj = obj.cuda()
        return obj

    predicted_boxes = predictions['boxes']
    predicted_labels = predictions['labels']
    predicted_class_scores = predictions['scores']
    target_boxes = targets['boxes']
    target_labels = targets['labels']
    assert len(predicted_boxes) == len(predicted_labels) == \
        len(predicted_class_scores) == len(target_boxes) == \
        len(target_labels)

    target_images = list()
    for i in range(len(target_labels)):
        target_images.extend([i] * target_labels[i].size(0))
    target_images = __to_cuda(LongTensor(target_images))
    target_boxes = torch_cat(target_boxes, dim=0)
    target_labels = torch_cat(target_labels, dim=0)
    assert target_images.size(0) == target_boxes.size(0) == \
        target_labels.size(0)

    predicted_images = list()
    for i in range(len(predicted_labels)):
        predicted_images.extend([i] * predicted_labels[i].size(0))
    predicted_images = __to_cuda(LongTensor(predicted_images))
    predicted_boxes = torch_cat(predicted_boxes, dim=0)
    predicted_labels = torch_cat(predicted_labels, dim=0)
    predicted_class_scores = torch_cat(predicted_class_scores, dim=0)
    assert predicted_images.size(0) == predicted_boxes.size(0) == \
        predicted_labels.size(0) == predicted_class_scores.size(0)

    average_precisions = torch_zeros(self.num_classes, dtype=torch_float)
    recalls = torch_zeros(self.num_classes, dtype=torch_float)
    precisions = torch_zeros(self.num_classes, dtype=torch_float)
    for c in range(self.num_classes):
        target_class_images = target_images[target_labels == c]
        target_class_boxes = target_boxes[target_labels == c]
        total_objects = target_class_boxes.size(0)
        target_class_boxes_detected = __to_cuda(
            torch_zeros(total_objects, dtype=torch_uint8))
        class_c_predicted_images = predicted_images[predicted_labels == c]
        class_c_predicted_boxes = predicted_boxes[predicted_labels == c]
        class_c_predicted_class_scores = \
            predicted_class_scores[predicted_labels == c]
        class_c_num_detections = class_c_predicted_boxes.size(0)
        if class_c_num_detections == 0:
            continue
        class_c_predicted_class_scores, sort_ind = torch_sort(
            class_c_predicted_class_scores, dim=0, descending=True)
        class_c_predicted_images = class_c_predicted_images[sort_ind]
        class_c_predicted_boxes = class_c_predicted_boxes[sort_ind]
        true_positives = __to_cuda(
            torch_zeros(class_c_num_detections, dtype=torch_float))
        false_positives = __to_cuda(
            torch_zeros(class_c_num_detections, dtype=torch_float))
        for d in range(class_c_num_detections):
            this_detection_box = class_c_predicted_boxes[d].unsqueeze(0)
            this_image = class_c_predicted_images[d]
            object_boxes = target_class_boxes[
                target_class_images == this_image]
            if object_boxes.size(0) == 0:
                false_positives[d] = 1
                continue
            overlaps = find_jaccard_overlap(this_detection_box,
                                            object_boxes)
            max_overlap, ind = torch_max(overlaps.squeeze(0), dim=0)
            original_ind = LongTensor(
                range(target_class_boxes.size(0)))[
                    target_class_images == this_image][ind]
            if max_overlap.item() > iou_threshold:
                if target_class_boxes_detected[original_ind] == 0:
                    true_positives[d] = 1
                    target_class_boxes_detected[original_ind] = 1
                else:
                    false_positives[d] = 1
            else:
                false_positives[d] = 1
        cumul_true_positives = torch_cumsum(true_positives, dim=0)
        cumul_false_positives = torch_cumsum(false_positives, dim=0)
        cumul_precision = cumul_true_positives / (
            cumul_true_positives + cumul_false_positives + 1e-10)
        cumul_recall = cumul_true_positives / total_objects
        recall_thresholds = [x / 10 for x in range(11)]
        interpolated_precisions = __to_cuda(
            torch_zeros((len(recall_thresholds)), dtype=torch_float))
        for i, threshold in enumerate(recall_thresholds):
            recalls_above_threshold = cumul_recall >= threshold
            if recalls_above_threshold.any():
                interpolated_precisions[i] = \
                    cumul_precision[recalls_above_threshold].max()
            else:
                interpolated_precisions[i] = 0.
        average_precisions[c] = interpolated_precisions.mean()
        total_true_positives = torch_sum(true_positives)
        recalls[c] = total_true_positives / max(float(total_objects), 1e-10)
        precisions[c] = total_true_positives / max(
            total_true_positives + torch_sum(false_positives),
            torch_tensor(1e-10))
    return (average_precisions.tolist(), recalls.tolist(),
            precisions.tolist())