def gradient_default(self, X, Y):
    """Closed-form MSE gradient of the linear forward model's parameters.

    :param X: input batch; samples are columns (N = X.size()[1]) —
        presumably shape (in_features, N), TODO confirm against callers.
    :param Y: target batch with the same column-per-sample convention.
    :return: tuple ``(dL/dW, dL/dw0)`` for the mean-squared-error loss.
    """
    N = X.size()[1]
    # Broadcast W and w0 across the batch dimension so per-sample outer
    # products can be formed with batched matrix multiplies (bmm).
    W_ext = unsqueeze(self.forward_model.W, 0).expand(N, -1, -1)
    w0_ext = unsqueeze(self.forward_model.w0, 0).expand(N, -1, -1)
    # Reshape inputs/targets to (N, features, 1) column vectors.
    X_ext = transpose(unsqueeze(X, 0), 0, 2)
    Y_ext = transpose(unsqueeze(Y, 0), 0, 2)
    # Bug fix: the original called cuda.synchronize() unconditionally,
    # which raises on CPU-only builds or hosts without a GPU. Only
    # synchronize when CUDA is actually usable.
    if cuda.is_available():
        cuda.synchronize()
    return (
        # dL/dW = (2/N) * sum_n (W x_n + w0 - y_n) x_n^T
        torch_sum(bmm(
            bmm(W_ext, X_ext) + w0_ext - Y_ext,
            transpose(X_ext, 1, 2)), dim=0) * 2 / N,  # W gradient
        # dL/dw0 = (2/N) * sum_n (model(x_n) - y_n)
        unsqueeze(torch_sum(self.forward_model(X) - Y, dim=1) * 2 / N, 1)  # w0 gradient
    )
def lossMAE(self, v, t):
    """Sum-of-absolute-errors (MAE-style, unnormalized) loss.

    :param v: predicted values
    :param t: target values
    :return: scalar tensor, sum of |v - t|
    """
    residual = v - t
    return torch_sum(torch_abs(residual))
def forward(self, input, target, mask):
    """Masked negative log-likelihood, averaged over valid tokens.

    :param input: per-token log-probabilities, shape (B, T, V)
    :param target: token indices, (B, T) or (B*, S, T) — 3-D is flattened
    :param mask: 1/0 validity mask matching target's layout
    :return: scalar loss (sum of masked NLL divided by number of valid tokens)
    """
    # Flatten a 3-D target/mask (e.g. multiple sequences per image) to 2-D.
    if target.ndim == 3:
        target = target.reshape(-1, target.shape[2])
        mask = mask.reshape(-1, mask.shape[2])
    # Clip target/mask to the model's actual sequence length.
    seq_len = input.size(1)
    target = target[:, :seq_len]
    mask = mask[:, :seq_len]
    # Pick the log-prob of each target token, negate, and mask out padding.
    picked = input.gather(2, target.unsqueeze(2)).squeeze(2)
    masked_nll = -picked * mask
    # Average over each valid token.
    return torch_sum(masked_nll) / torch_sum(mask)
def lossMSE(self, v, t):
    """Sum-of-squared-errors (MSE-style, unnormalized) loss.

    :param v: predicted values
    :param t: target values
    :return: scalar tensor, sum of (v - t)^2
    """
    diff = v - t
    return torch_sum(diff * diff)
def update(self, state: np.array, reward_baseline: Tensor, action: np.array):
    """One REINFORCE (policy-gradient with baseline) optimization step.

    :param state: observed states (numpy)
    :param reward_baseline: advantage/reward-minus-baseline tensor
    :param action: actions taken (numpy)
    :return: scalar loss as a numpy value
    """
    state_tensor = FloatTensor(state).to(device=self.device)
    action_tensor = FloatTensor(
        np.array(action, dtype=np.float32)).to(device=self.device)
    # Policy Gradient theorem: minimize -sum(log pi(a|s) * advantage).
    probabilities = self.model(state_tensor)
    distribution = Categorical(probabilities)
    log_probs = distribution.log_prob(action_tensor)
    loss = torch_sum(-log_probs * reward_baseline)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    # Move the scalar back to host memory before converting to numpy.
    if self.device == "cuda":
        return loss.detach().cpu().numpy()
    return loss.detach().numpy()
def torch_sum2(x, y):
    """Reduce tensor ``x`` over two axes: first ``y[1]``, then ``y[0]``."""
    inner = torch_sum(x, y[1])
    return torch_sum(inner, y[0])
def torch_sum1(x, y):
    """Reduce tensor ``x`` over the single axis ``y[0]``."""
    axis = y[0]
    return torch_sum(x, axis)
def forward(self, input, seq, data_gts):
    """Input is either logits or log softmax"""
    # Sequence-level structured loss: scores sampled sequences `seq`
    # against ground truths `data_gts` and computes one of several losses
    # selected by self.loss_type. Returns {"reward": ..., "loss": ...}.
    out = {}
    batch_size = input.size(0)  # batch_size = sample_size * seq_per_img
    seq_per_img = batch_size // len(data_gts)
    assert seq_per_img == self.opt.train_sample_n, seq_per_img
    # Token validity mask, shifted right by one so the first token is
    # always counted and the position after the last token is dropped.
    mask = (seq > 0).float()
    mask = torch_cat([mask.new_full((mask.size(0), 1), 1), mask[:, :-1]], 1)
    # External scorer — one reward per sampled sequence, reshaped to
    # (num_images, seq_per_img).
    scores = get_scores(data_gts, seq, self.opt)
    scores = from_numpy(scores).type_as(input).view(-1, seq_per_img)
    out["reward"] = scores  # .mean()
    if self.opt.entropy_reward_weight > 0:
        # Per-token entropy of the model distribution, averaged over
        # valid positions, added to the reward (entropy bonus).
        entropy = (-(F.softmax(input, dim=2) *
                     F.log_softmax(input, dim=2)).sum(2).data)
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        print("entropy", entropy.mean().item())
        scores = scores + self.opt.entropy_reward_weight * entropy.view(
            -1, seq_per_img)
    # rescale cost to [0,1]
    costs = -scores
    if self.loss_type == "risk" or self.loss_type == "softmax_margin":
        costs = costs - costs.min(1, keepdim=True)[0]
        costs = costs / costs.max(1, keepdim=True)[0]
        # in principle
        # Only risk need such rescale
        # margin should be alright; Let's try.
    # Gather input: BxTxD -> BxT
    input = input.gather(2, seq.unsqueeze(2)).squeeze(2)
    if self.loss_type == "seqnll":
        # input is logsoftmax
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        # Cross-entropy toward the lowest-cost (best) sample per image.
        target = costs.min(1)[1]
        output = F.cross_entropy(input, target)
    elif self.loss_type == "risk":
        # input is logsoftmax
        input = input * mask
        input = input.sum(1)
        input = input.view(-1, seq_per_img)
        # Expected cost under the (renormalized) sample distribution.
        output = (F.softmax(input.exp()) * costs).sum(1).mean()
        # test
        # avg_scores = input
        # probs = F.softmax(avg_scores.exp_())
        # loss = (probs * costs.type_as(probs)).sum() / input.size(0)
        # print(output.item(), loss.item())
    elif self.loss_type == "max_margin":
        # input is logits
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        # `_` is the per-image minimum cost, `__` its index.
        _, __ = costs.min(1, keepdim=True)
        costs_star = _
        input_star = input.gather(1, __)
        # Hinge on the single worst margin violator per image.
        output = F.relu(costs - costs_star - input_star + input).max(1)[0] / 2
        output = output.mean()
        # sanity test
        # avg_scores = input + costs
        # scores_with_high_target = avg_scores.clone()
        # scores_with_high_target.scatter_(1, costs.min(1)[1].view(-1, 1), 1e10)
        # target_and_offender_index = scores_with_high_target.sort(1, True)[1][:, 0:2]
        # avg_scores = avg_scores.gather(1, target_and_offender_index)
        # target_index = avg_scores.new_zeros(avg_scores.size(0), dtype=torch.long)
        # loss = F.multi_margin_loss(avg_scores, target_index, size_average=True, margin=0)
        # print(loss.item() * 2, output.item())
    elif self.loss_type == "multi_margin":
        # input is logits
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        _, __ = costs.min(1, keepdim=True)
        costs_star = _
        input_star = input.gather(1, __)
        # Hinge over ALL samples (not just the worst), then mean.
        output = F.relu(costs - costs_star - input_star + input)
        output = output.mean()
        # sanity test
        # avg_scores = input + costs
        # loss = F.multi_margin_loss(avg_scores, costs.min(1)[1], margin=0)
        # print(output, loss)
    elif self.loss_type == "softmax_margin":
        # input is logsoftmax
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        # Cost-augmented cross-entropy (softmax-margin).
        input = input + costs
        target = costs.min(1)[1]
        output = F.cross_entropy(input, target)
    elif self.loss_type == "real_softmax_margin":
        # input is logits
        # This is what originally defined in Kevin's paper
        # The result should be equivalent to softmax_margin
        input = input * mask
        input = input.sum(1) / mask.sum(1)
        input = input.view(-1, seq_per_img)
        input = input + costs
        target = costs.min(1)[1]
        output = F.cross_entropy(input, target)
    elif self.loss_type == "new_self_critical":
        """
        A different self critical
        Self critical uses greedy decoding score as baseline;
        This setting uses the average score of the rest samples as baseline
        (suppose c1...cn n samples, reward1 = score1 - 1/(n-1)(score2+..+scoren) )
        """
        baseline = (scores.sum(1, keepdim=True) - scores) / (scores.shape[1] - 1)
        scores = scores - baseline
        # self cider used as reward to promote diversity (not working that much in this way)
        if getattr(self.opt, "self_cider_reward_weight", 0) > 0:
            _scores = get_self_cider_scores(data_gts, seq, self.opt)
            _scores = from_numpy(_scores).type_as(scores).view(-1, 1)
            # NOTE(review): expand_as(scores - 1) has the same shape as
            # expand_as(scores); the `- 1` looks redundant — confirm intent.
            _scores = _scores.expand_as(scores - 1)
            scores += self.opt.self_cider_reward_weight * _scores
        # Token-level REINFORCE-style loss, normalized by token count.
        output = -input * mask * scores.view(-1, 1)
        output = torch_sum(output) / torch_sum(mask)
    out["loss"] = output
    return out
def test_gradients_and_parameter_updates(self):
    """
    Test that all parameters undergo loss gradient computation with
    respect to them and are subsequently updated.
    """
    # Training mode so every parameter participates in backpropagation:
    self.layer.train()
    # The learning rate is deliberately exaggerated so that even
    # parameters with very weak gradients change visibly after one step:
    optimizer = SGD(self.layer.parameters(), lr=1e12)
    # Reset gradients to None so stale accumulation cannot mask failures:
    optimizer.zero_grad(set_to_none=True)
    # Snapshot all parameters before any backward pass:
    initial_parameters = {
        name: deepcopy(tensor)
        for name, tensor in self.layer.named_parameters()
    }
    # Forward pass, then a hypothetical scalar loss (mean of outputs):
    outputs = self.layer(**self.forward_propagation_kwargs)
    loss = outputs.mean()
    # Backpropagate to populate gradients on all trainable parameters:
    loss.backward()
    # Every parameter that requires gradients must have received a
    # gradient with at least one non-zero entry:
    for name, tensor in self.layer.named_parameters():
        with self.subTest("gradients" + ' - ' + name):
            if tensor.requires_grad:
                gradients = tensor.grad
                self.assertIsNotNone(gradients)
                self.assertNotEqual(0., torch_sum(torch_abs(gradients)))
    # Apply the update:
    optimizer.step()
    # Every parameter that requires gradients must have changed value:
    for name, tensor in self.layer.named_parameters():
        with self.subTest("parameter updates" + ' - ' + name):
            if tensor.requires_grad:
                self.assertFalse(
                    torch_equal(
                        initial_parameters[name],  # initial values
                        tensor  # updated values
                    ))
def inner_product(xs, ys):
    """Sum of elementwise dot products over paired tensor sequences.

    Returns 0 when the sequences are empty (matching ``sum`` on an
    empty list).
    """
    total = 0
    for a, b in zip(xs, ys):
        total = total + torch_sum(a * b)
    return total
def _ssd_discrete_metrics(self, predictions, targets, is_cuda=False, *unused_args, **unused_kwargs):
    # Per-class AP / recall / precision for SSD-style detections, using a
    # centroid-containment match criterion: a detection counts as a hit
    # when a ground-truth box contains the detection box's centroid.
    # :param predictions: dict with 'boxes', 'labels', 'scores' — one
    #     tensor per image.
    # :param targets: dict with 'boxes', 'labels' — one tensor per image.
    # :param is_cuda: move working tensors to the GPU when True.
    # :return: (average_precisions, recalls, precisions) as per-class lists.
    def __to_cuda(obj):
        # Optionally move a tensor to the GPU.
        if is_cuda:
            obj = obj.cuda()
        return obj
    predicted_boxes = predictions['boxes']
    predicted_labels = predictions['labels']
    predicted_class_scores = predictions['scores']
    target_boxes = targets['boxes']
    target_labels = targets['labels']
    assert len(predicted_boxes) == len(predicted_labels) == len(predicted_class_scores) == len(
        target_boxes) == len(target_labels)
    # Flatten per-image ground truths into one tensor each, remembering
    # which image every box came from.
    target_images = list()
    for i in range(len(target_labels)):
        target_images.extend([i] * target_labels[i].size(0))
    target_images = __to_cuda(LongTensor(target_images))
    target_boxes = torch_cat(target_boxes, dim=0)
    target_labels = torch_cat(target_labels, dim=0)
    assert target_images.size(0) == target_boxes.size(0) == target_labels.size(0)
    # Same flattening for the predictions.
    predicted_images = list()
    for i in range(len(predicted_labels)):
        predicted_images.extend([i] * predicted_labels[i].size(0))
    predicted_images = __to_cuda(LongTensor(predicted_images))
    predicted_boxes = torch_cat(predicted_boxes, dim=0)
    predicted_labels = torch_cat(predicted_labels, dim=0)
    predicted_class_scores = torch_cat(predicted_class_scores, dim=0)
    assert predicted_images.size(0) == predicted_boxes.size(0) == predicted_labels.size(
        0) == predicted_class_scores.size(0)
    average_precisions = torch_zeros(self.num_classes, dtype=torch_float)
    recalls = torch_zeros(self.num_classes, dtype=torch_float)
    precisions = torch_zeros(self.num_classes, dtype=torch_float)
    for c in range(self.num_classes):
        # Ground truths of class c; the detected flags stop the same
        # ground-truth box from matching more than one detection.
        target_class_images = target_images[target_labels == c]
        target_class_boxes = target_boxes[target_labels == c]
        total_objects = target_class_boxes.size(0)
        target_class_boxes_detected = __to_cuda(torch_zeros(total_objects, dtype=torch_uint8))
        class_c_predicted_images = predicted_images[predicted_labels == c]
        class_c_predicted_boxes = predicted_boxes[predicted_labels == c]
        class_c_predicted_class_scores = predicted_class_scores[predicted_labels == c]
        class_c_num_detections = class_c_predicted_boxes.size(0)
        if class_c_num_detections == 0:
            continue
        # Process detections in decreasing confidence order (standard for AP).
        class_c_predicted_class_scores, sort_ind = torch_sort(class_c_predicted_class_scores,
                                                              dim=0, descending=True)
        class_c_predicted_images = class_c_predicted_images[sort_ind]
        class_c_predicted_boxes = class_c_predicted_boxes[sort_ind]
        true_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
        false_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
        for d in range(class_c_num_detections):
            this_detection_box = shapely_box(*class_c_predicted_boxes[d].data)
            this_image = class_c_predicted_images[d]
            # Ground truths of class c in the same image as this detection.
            object_boxes = target_class_boxes[target_class_images == this_image]
            if object_boxes.size(0) == 0:
                false_positives[d] = 1
                continue
            # For each ground-truth box: does it contain this detection's centroid?
            ground_truth_contains_prediction_center = [
                shapely_box(*box.data).contains(this_detection_box.centroid)
                for box in object_boxes]
            # NOTE(review): every non-containing ground truth also sets
            # false_positives[d] = 1, so a detection can be flagged both TP
            # and FP when an image has several class-c boxes — confirm
            # this is the intended accounting.
            for ind, prediction_center_in_ground_truth in enumerate(ground_truth_contains_prediction_center):
                # Map the per-image index back to the flattened class-c index.
                original_ind = LongTensor(range(target_class_boxes.size(0)))[target_class_images == this_image][ind]
                if prediction_center_in_ground_truth:
                    if target_class_boxes_detected[original_ind] == 0:
                        true_positives[d] = 1
                        target_class_boxes_detected[original_ind] = 1
                    else:
                        # Ground truth already claimed by a higher-scoring detection.
                        false_positives[d] = 1
                else:
                    false_positives[d] = 1
        # Cumulative precision/recall along the confidence-ranked detections.
        cumul_true_positives = torch_cumsum(true_positives, dim=0)
        cumul_false_positives = torch_cumsum(false_positives, dim=0)
        cumul_precision = cumul_true_positives / (cumul_true_positives + cumul_false_positives + 1e-10)
        cumul_recall = cumul_true_positives / total_objects
        # 11-point interpolated average precision (PASCAL VOC style).
        recall_thresholds = [x / 10 for x in range(11)]
        interpolated_precisions = __to_cuda(torch_zeros((len(recall_thresholds)), dtype=torch_float))
        for i, threshold in enumerate(recall_thresholds):
            recalls_above_threshold = cumul_recall >= threshold
            if recalls_above_threshold.any():
                interpolated_precisions[i] = cumul_precision[recalls_above_threshold].max()
            else:
                interpolated_precisions[i] = 0.
        average_precisions[c] = interpolated_precisions.mean()
        total_true_positives = torch_sum(true_positives)
        # Epsilon guards avoid division by zero for empty classes.
        recalls[c] = total_true_positives / max(float(total_objects), 1e-10)
        precisions[c] = total_true_positives / max(
            total_true_positives + torch_sum(false_positives), torch_tensor(1e-10))
    return average_precisions.tolist(), recalls.tolist(), precisions.tolist()
def _ssd_discrete_metrics(self, predictions, targets, iou_threshold=0.5, is_cuda=False):
    # Per-class AP / recall / precision for SSD-style detections, using the
    # standard IoU (Jaccard) match criterion: a detection matches the
    # best-overlapping ground truth when IoU > iou_threshold.
    # :param predictions: dict with 'boxes', 'labels', 'scores' — one
    #     tensor per image.
    # :param targets: dict with 'boxes', 'labels' — one tensor per image.
    # :param iou_threshold: minimum IoU for a detection to count as a match.
    # :param is_cuda: move working tensors to the GPU when True.
    # :return: (average_precisions, recalls, precisions) as per-class lists.
    def __to_cuda(obj):
        # Optionally move a tensor to the GPU.
        if is_cuda:
            obj = obj.cuda()
        return obj
    predicted_boxes = predictions['boxes']
    predicted_labels = predictions['labels']
    predicted_class_scores = predictions['scores']
    target_boxes = targets['boxes']
    target_labels = targets['labels']
    assert len(predicted_boxes) == len(predicted_labels) == len(predicted_class_scores) == len(
        target_boxes) == len(target_labels)
    # Flatten per-image ground truths into one tensor each, remembering
    # which image every box came from.
    target_images = list()
    for i in range(len(target_labels)):
        target_images.extend([i] * target_labels[i].size(0))
    target_images = __to_cuda(LongTensor(target_images))
    target_boxes = torch_cat(target_boxes, dim=0)
    target_labels = torch_cat(target_labels, dim=0)
    assert target_images.size(0) == target_boxes.size(0) == target_labels.size(0)
    # Same flattening for the predictions.
    predicted_images = list()
    for i in range(len(predicted_labels)):
        predicted_images.extend([i] * predicted_labels[i].size(0))
    predicted_images = __to_cuda(LongTensor(predicted_images))
    predicted_boxes = torch_cat(predicted_boxes, dim=0)
    predicted_labels = torch_cat(predicted_labels, dim=0)
    predicted_class_scores = torch_cat(predicted_class_scores, dim=0)
    assert predicted_images.size(0) == predicted_boxes.size(0) == predicted_labels.size(
        0) == predicted_class_scores.size(0)
    average_precisions = torch_zeros(self.num_classes, dtype=torch_float)
    recalls = torch_zeros(self.num_classes, dtype=torch_float)
    precisions = torch_zeros(self.num_classes, dtype=torch_float)
    for c in range(self.num_classes):
        # Ground truths of class c; the detected flags stop the same
        # ground-truth box from matching more than one detection.
        target_class_images = target_images[target_labels == c]
        target_class_boxes = target_boxes[target_labels == c]
        total_objects = target_class_boxes.size(0)
        target_class_boxes_detected = __to_cuda(torch_zeros(total_objects, dtype=torch_uint8))
        class_c_predicted_images = predicted_images[predicted_labels == c]
        class_c_predicted_boxes = predicted_boxes[predicted_labels == c]
        class_c_predicted_class_scores = predicted_class_scores[predicted_labels == c]
        class_c_num_detections = class_c_predicted_boxes.size(0)
        if class_c_num_detections == 0:
            continue
        # Process detections in decreasing confidence order (standard for AP).
        class_c_predicted_class_scores, sort_ind = torch_sort(class_c_predicted_class_scores,
                                                              dim=0, descending=True)
        class_c_predicted_images = class_c_predicted_images[sort_ind]
        class_c_predicted_boxes = class_c_predicted_boxes[sort_ind]
        true_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
        false_positives = __to_cuda(torch_zeros(class_c_num_detections, dtype=torch_float))
        for d in range(class_c_num_detections):
            this_detection_box = class_c_predicted_boxes[d].unsqueeze(0)
            this_image = class_c_predicted_images[d]
            # Ground truths of class c in the same image as this detection.
            object_boxes = target_class_boxes[target_class_images == this_image]
            if object_boxes.size(0) == 0:
                false_positives[d] = 1
                continue
            # Best-overlapping ground truth for this detection.
            overlaps = find_jaccard_overlap(this_detection_box, object_boxes)
            max_overlap, ind = torch_max(overlaps.squeeze(0), dim=0)
            # Map the per-image index back to the flattened class-c index.
            original_ind = LongTensor(range(target_class_boxes.size(0)))[target_class_images == this_image][ind]
            if max_overlap.item() > iou_threshold:
                if target_class_boxes_detected[original_ind] == 0:
                    true_positives[d] = 1
                    target_class_boxes_detected[original_ind] = 1
                else:
                    # Ground truth already claimed by a higher-scoring detection.
                    false_positives[d] = 1
            else:
                false_positives[d] = 1
        # Cumulative precision/recall along the confidence-ranked detections.
        cumul_true_positives = torch_cumsum(true_positives, dim=0)
        cumul_false_positives = torch_cumsum(false_positives, dim=0)
        cumul_precision = cumul_true_positives / (cumul_true_positives + cumul_false_positives + 1e-10)
        cumul_recall = cumul_true_positives / total_objects
        # 11-point interpolated average precision (PASCAL VOC style).
        recall_thresholds = [x / 10 for x in range(11)]
        interpolated_precisions = __to_cuda(torch_zeros((len(recall_thresholds)), dtype=torch_float))
        for i, threshold in enumerate(recall_thresholds):
            recalls_above_threshold = cumul_recall >= threshold
            if recalls_above_threshold.any():
                interpolated_precisions[i] = cumul_precision[recalls_above_threshold].max()
            else:
                interpolated_precisions[i] = 0.
        average_precisions[c] = interpolated_precisions.mean()
        total_true_positives = torch_sum(true_positives)
        # Epsilon guards avoid division by zero for empty classes.
        recalls[c] = total_true_positives / max(float(total_objects), 1e-10)
        precisions[c] = total_true_positives / max(
            total_true_positives + torch_sum(false_positives), torch_tensor(1e-10))
    return average_precisions.tolist(), recalls.tolist(), precisions.tolist()