def forward(self, x):
    # Expectation over Transformation (EOT): replicate each input `eot_samples`
    # times, push the noisy copies through the model, and average the logits.
    bs = x.shape[0]
    x = torch.repeat_interleave(x, repeats=self.eot_samples, dim=0)
    y = self.model(self.noise(x))
    y = y.view(bs, self.eot_samples, self.num_classes)
    return torch.mean(y, dim=1)
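# A minimal, self-contained sketch (not from the original module) of why
# repeat_interleave + view recovers per-input groups in the EOT forward above:
# repeat_interleave places all copies of one row consecutively
# ([x0, x0, x0, x1, x1, x1]), so reshaping to (bs, eot_samples, ...) groups all
# noisy copies of the same input in one row before averaging. Sizes are illustrative.
import torch

bs, eot_samples, num_classes = 2, 3, 4
x = torch.arange(bs, dtype=torch.float32).unsqueeze(1)            # (2, 1)
x_rep = torch.repeat_interleave(x, repeats=eot_samples, dim=0)    # (6, 1): [0,0,0,1,1,1]
logits = x_rep.expand(-1, num_classes)                            # stand-in for model(noise(x))
avg = logits.reshape(bs, eot_samples, num_classes).mean(dim=1)    # (2, 4) per-input average
assert torch.allclose(avg[0], torch.zeros(num_classes))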
path_save_synthesis = (f'{path_parent}_grow={GROW_ON_K_ITER}_bg={BACKGROUND_INTENSITY:.02f}'
                       f'_step={STEP_SIZE}_scale_mask={SCALE_MASK}_seed_value={SEED_VALUE}/')
Path(path_save_synthesis).mkdir(parents=True, exist_ok=True)

#%%
for idx_lesion, (target, coord, mask, this_seed) in enumerate(
        zip(targets, coords, masks, seeds)):
    # prepare seed
    seed, seed_tensor, seed_pool = prepare_seed(target, this_seed, device,
                                                num_channels=num_channels,
                                                pool_size=1024)
    # initialize model
    model = CeA_00(device=device,
                   grow_on_k_iter=GROW_ON_K_ITER,
                   background_intensity=BACKGROUND_INTENSITY,
                   step_size=STEP_SIZE,
                   scale_mask=SCALE_MASK)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[1500, 2500],
                                                     gamma=0.1)  # keep 1e-4 longer
    model_str = correct_label_in_plot(model)
    target = torch.tensor(target.transpose(-1, 0, 1)).unsqueeze(0).to(device)
    target_batch = torch.repeat_interleave(target, repeats=sample_size, dim=0)

    losses = []
    alive_masks = []
    others = []

    # train automata
    start = time()
    inner_iter_aux = 0
    inner_iter = 100
    inner_iters = []
    for i in range(epochs):
        inner_iter, inner_iter_aux = epochs_in_inner_loop(i, inner_iter_aux, inner_iter)
        inner_iters.append(inner_iter)
def recognize_batch_beam_autoreg_LM_multi_hyp(self, encoder_outputs, beam, Am_weight,
                                              gamma, LM_model, len_pen, args):
    """Beam search; decodes one utterance at a time.

    Args:
        encoder_outputs: B x T x H encoder states.
        beam: beam width (args.beam).
        Am_weight: acoustic-model weight; 1 disables the LM.
        len_pen: length-penalty factor bounding the maximum decoding length.

    Returns:
        ys, score_1: padded n-best hypotheses (best first) and their scores.
    """
    enc_out_len = encoder_outputs.size(1)
    # maxlen can be increased further, but at the cost of memory
    maxlen = int(enc_out_len * len_pen)
    hyps = beam
    print("beam,hyps,len_pen,maxlen,enc_out_len,Am_weight",
          beam, hyps, len_pen, maxlen, enc_out_len, Am_weight)

    batch_size = encoder_outputs.size(0)
    ys = torch.ones(batch_size * hyps, 1).fill_(self.sos_id).type_as(encoder_outputs).long()
    score_1 = torch.zeros_like(ys).float()
    rep_encoder_outputs = torch.repeat_interleave(encoder_outputs, hyps, 0)
    store_ended_hyps = []
    store_ended_LLR = []
    scores_list = []

    for i in range(maxlen):
        # Skip the LM entirely (Am_weight == 1) or just for the first step.
        if Am_weight == 1 or i < 1:
            COMB_AM_MT_local_scores, scores_list, present_label, dec_output_Bneck = \
                self.prediction_from_trained_model(ys, rep_encoder_outputs, scores_list)
        else:
            AM_local_scores, scores_list, present_label, dec_output_Bneck = \
                self.prediction_from_trained_model(ys, rep_encoder_outputs, scores_list)
            LM_local_scores, scores_list, present_label, scores = \
                LM_model.decoder.prediction_from_trained_model(ys, encoder_outputs,
                                                               scores_list)
            # Shallow fusion; Am_weight is typically in (0.5, 1.0]
            COMB_AM_MT_local_scores = Am_weight * AM_local_scores + \
                (1 - Am_weight) * LM_local_scores

        ys, score_1 = self.prediction_from_trained_model_beam_Search(
            i, ys, score_1, COMB_AM_MT_local_scores, beam, hyps, gamma, batch_size)

        score_1, store_ended_hyps, store_ended_LLR = self.get_multiple_hypothesis(
            store_ended_hyps, store_ended_LLR, ys, score_1, i, maxlen)

        # Remove blank predictions, i.e. hypotheses that predict <eos> right
        # after <sos>. list.pop(index) mutates in place, so indices are removed
        # back-to-front to avoid shifting.
        remove_blank_predictions_index = [
            index for index, element in enumerate(store_ended_hyps)
            if len(element) == 2 and element[0] == self.sos_id and element[1] == self.eos_id
        ]
        for element in reversed(remove_blank_predictions_index):
            store_ended_hyps.pop(element)
            store_ended_LLR.pop(element)

        if len(store_ended_hyps) >= hyps:
            break

    ys = nn.utils.rnn.pad_sequence(store_ended_hyps, batch_first=True,
                                   padding_value=self.eos_id)
    score_1 = nn.utils.rnn.pad_sequence(store_ended_LLR, batch_first=True,
                                        padding_value=0)

    # Produce the correct order: sort hypotheses by total score, best first.
    XS = [torch.sum(i) for i in store_ended_LLR]
    XS1 = sorted(((e, i) for i, e in enumerate(XS)), reverse=True)
    correct_sorted_order = [i[1] for i in XS1]
    ys = ys[correct_sorted_order]
    score_1 = score_1[correct_sorted_order]

    print(ys, torch.sum(score_1, dim=1))
    return ys, score_1
def expand_mask(mask, args):
    # Expand a block-sparse mask: every entry of `mask` becomes a
    # (mask_block_rows x mask_block_cols) block in the dense mask.
    mask_block_rows = args.mask_block_rows
    mask_block_cols = args.mask_block_cols
    mask = torch.repeat_interleave(mask, mask_block_rows, dim=0)
    mask = torch.repeat_interleave(mask, mask_block_cols, dim=1)
    return mask
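# Hedged usage sketch for expand_mask above: a 2x2 block pattern expanded so
# every entry becomes a 2x3 block. `args` is faked with a SimpleNamespace;
# the field names follow the function's own assumptions.
import torch
from types import SimpleNamespace

args = SimpleNamespace(mask_block_rows=2, mask_block_cols=3)
mask = torch.tensor([[1, 0], [0, 1]])
dense = expand_mask(mask, args)
assert dense.shape == (2 * 2, 2 * 3)
assert dense[:2, :3].all() and not dense[:2, 3:].any()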
def _evaluate_image(self, idx: int, class_id: int, area_range: Tuple[int, int],
                    max_det: int, ious: dict) -> Optional[dict]:
    """Perform evaluation for single class and image.

    Args:
        idx: Image Id, equivalent to the index of supplied samples.
        class_id: Class Id of the supplied ground truth and detection labels.
        area_range: List of lower and upper bounding box area threshold.
        max_det: Maximum number of evaluated detection bounding boxes.
        ious: IoU results for image and class.
    """
    gt = self.groundtruth_boxes[idx]
    det = self.detection_boxes[idx]
    gt_label_mask = self.groundtruth_labels[idx] == class_id
    det_label_mask = self.detection_labels[idx] == class_id

    # No GT and no predictions --> ignore image
    if len(gt_label_mask) == 0 and len(det_label_mask) == 0:
        return None
    nb_iou_thrs = len(self.iou_thresholds)

    # Some GT but no predictions
    if len(gt_label_mask) > 0 and len(det_label_mask) == 0:
        return self.__evaluate_image_gt_no_preds(gt, gt_label_mask, area_range, nb_iou_thrs)

    # Some predictions but no GT
    if len(gt_label_mask) == 0 and len(det_label_mask) >= 0:
        return self.__evaluate_image_preds_no_gt(det, idx, det_label_mask, max_det,
                                                 area_range, nb_iou_thrs)

    gt = gt[gt_label_mask]
    det = det[det_label_mask]
    if gt.numel() == 0 and det.numel() == 0:
        return None

    areas = box_area(gt)
    ignore_area = (areas < area_range[0]) | (areas > area_range[1])

    # Sort detections highest score first, sort ignored GT last.
    # Convert to uint8 temporarily and back to bool, because
    # "Sort currently does not support bool dtype on CUDA".
    ignore_area_sorted, gtind = torch.sort(ignore_area.to(torch.uint8))
    ignore_area_sorted = ignore_area_sorted.to(torch.bool)

    gt = gt[gtind]
    scores = self.detection_scores[idx]
    scores_filtered = scores[det_label_mask]
    scores_sorted, dtind = torch.sort(scores_filtered, descending=True)
    det = det[dtind]
    if len(det) > max_det:
        det = det[:max_det]

    # load computed ious
    ious = ious[idx, class_id][:, gtind] if len(ious[idx, class_id]) > 0 \
        else ious[idx, class_id]

    nb_gt = len(gt)
    nb_det = len(det)
    gt_matches = torch.zeros((nb_iou_thrs, nb_gt), dtype=torch.bool, device=gt.device)
    det_matches = torch.zeros((nb_iou_thrs, nb_det), dtype=torch.bool, device=gt.device)
    gt_ignore = ignore_area_sorted
    det_ignore = torch.zeros((nb_iou_thrs, nb_det), dtype=torch.bool, device=gt.device)

    if torch.numel(ious) > 0:
        for idx_iou, t in enumerate(self.iou_thresholds):
            for idx_det, _ in enumerate(det):
                m = MeanAveragePrecision._find_best_gt_match(t, gt_matches, idx_iou,
                                                             gt_ignore, ious, idx_det)
                if m == -1:
                    continue
                det_ignore[idx_iou, idx_det] = gt_ignore[m]
                det_matches[idx_iou, idx_det] = 1
                gt_matches[idx_iou, m] = 1

    # set unmatched detections outside of area range to ignore
    det_areas = box_area(det)
    det_ignore_area = (det_areas < area_range[0]) | (det_areas > area_range[1])
    ar = det_ignore_area.reshape((1, nb_det))
    det_ignore = torch.logical_or(
        det_ignore,
        torch.logical_and(det_matches == 0,
                          torch.repeat_interleave(ar, nb_iou_thrs, 0)))

    return {
        "dtMatches": det_matches.to(self.device),
        "gtMatches": gt_matches.to(self.device),
        "dtScores": scores_sorted.to(self.device),
        "gtIgnore": gt_ignore.to(self.device),
        "dtIgnore": det_ignore.to(self.device),
    }
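# Shape sketch for the ignore propagation at the end of _evaluate_image above:
# the per-detection area mask of shape (1, nb_det) is tiled to
# (nb_iou_thrs, nb_det) so it can be OR-ed into det_ignore for every IoU
# threshold at once. Sizes are illustrative.
import torch

nb_iou_thrs, nb_det = 10, 3
ar = torch.tensor([[True, False, True]])
tiled = torch.repeat_interleave(ar, nb_iou_thrs, 0)
assert tiled.shape == (nb_iou_thrs, nb_det)
assert torch.equal(tiled[0], tiled[-1])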
def losses(self, predictions, proposals, current_step=0):
    """
    Args:
        predictions: return values of :meth:`forward()`.
        proposals (list[Instances]): proposals that match the features that
            were used to compute predictions.
        current_step: current optimizer step. Used for losses with an
            annealing component.
    """
    global device

    pred_class_logits, pred_proposal_deltas, pred_class_logits_var, pred_proposal_covs = predictions

    if len(proposals):
        box_type = type(proposals[0].proposal_boxes)
        # cat(..., dim=0) concatenates over all images in the batch
        proposals_boxes = box_type.cat([p.proposal_boxes for p in proposals])
        assert not proposals_boxes.tensor.requires_grad, \
            "Proposals should not require gradients!"

        # The following fields should exist only when training.
        if proposals[0].has("gt_boxes"):
            gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
            assert proposals[0].has("gt_classes")
            gt_classes = cat([p.gt_classes for p in proposals], dim=0)
    else:
        proposals_boxes = Boxes(torch.zeros(0, 4, device=pred_proposal_deltas.device))

    no_instances = len(proposals) == 0  # no instances found

    # Compute Classification Loss
    if no_instances:
        # TODO 0.0 * pred.sum() is enough since PT1.6
        loss_cls = 0.0 * F.cross_entropy(
            pred_class_logits,
            torch.zeros(0, dtype=torch.long, device=pred_class_logits.device),
            reduction="sum",
        )
    else:
        if self.compute_cls_var:
            # Compute classification variance according to:
            # "What Uncertainties Do We Need in Bayesian Deep Learning for
            # Computer Vision?", NIPS 2017
            if self.cls_var_loss == 'loss_attenuation':
                num_samples = self.cls_var_num_samples

                # Compute standard deviation
                pred_class_logits_var = torch.sqrt(torch.exp(pred_class_logits_var))

                # Produce normal samples using logits as the mean and the
                # standard deviation computed above. Scales with GPU memory:
                # 12 GB ---> 3 samples per anchor for the COCO dataset.
                univariate_normal_dists = distributions.normal.Normal(
                    pred_class_logits, scale=pred_class_logits_var)

                pred_class_stochastic_logits = univariate_normal_dists.rsample(
                    (num_samples,))
                pred_class_stochastic_logits = pred_class_stochastic_logits.view(
                    (pred_class_stochastic_logits.shape[1] * num_samples,
                     pred_class_stochastic_logits.shape[2], -1))
                pred_class_logits = pred_class_stochastic_logits.squeeze(2)

                # Produce copies of the target classes to match the number of
                # stochastic samples.
                gt_classes_target = torch.unsqueeze(gt_classes, 0)
                gt_classes_target = torch.repeat_interleave(
                    gt_classes_target, num_samples, dim=0).view(
                        (gt_classes_target.shape[1] * num_samples, -1))
                gt_classes_target = gt_classes_target.squeeze(1)

                loss_cls = F.cross_entropy(pred_class_logits, gt_classes_target,
                                           reduction="mean")
            elif self.cls_var_loss == 'evidential':
                # ToDo: currently does not provide any reasonable mAP results
                # (15% mAP)

                # Assume dirichlet parameters are output.
                alphas = get_dir_alphas(pred_class_logits)

                # Get sum of all alphas
                dirichlet_s = alphas.sum(1).unsqueeze(1)

                # Generate one hot vectors for ground truth
                one_hot_vectors = torch.nn.functional.one_hot(gt_classes, alphas.shape[1])

                # Compute loss. This loss attempts to put all evidence on the
                # correct location.
                per_instance_loss = (
                    one_hot_vectors *
                    (torch.digamma(dirichlet_s) - torch.digamma(alphas)))

                # Compute KL divergence regularizer loss
                estimated_dirichlet = torch.distributions.dirichlet.Dirichlet(
                    (alphas - 1.0) * (1.0 - one_hot_vectors) + 1.0)
                uniform_dirichlet = torch.distributions.dirichlet.Dirichlet(
                    torch.ones_like(one_hot_vectors).type(torch.FloatTensor).to(device))
                kl_regularization_loss = torch.distributions.kl.kl_divergence(
                    estimated_dirichlet, uniform_dirichlet)

                # Compute final loss
                annealing_multiplier = torch.min(
                    torch.as_tensor(current_step / self.annealing_step).to(device),
                    torch.as_tensor(1.0).to(device))

                per_proposal_loss = per_instance_loss.sum(1) + \
                    annealing_multiplier * kl_regularization_loss

                # Compute evidence auxiliary loss
                evidence_maximization_loss = smooth_l1_loss(
                    dirichlet_s,
                    100.0 * torch.ones_like(dirichlet_s).to(device),
                    beta=self.smooth_l1_beta,
                    reduction='mean')
                evidence_maximization_loss *= annealing_multiplier

                # Compute final loss
                foreground_loss = per_proposal_loss[
                    (gt_classes >= 0) & (gt_classes < pred_class_logits.shape[1] - 1)]
                background_loss = per_proposal_loss[
                    gt_classes == pred_class_logits.shape[1] - 1]

                loss_cls = (torch.mean(foreground_loss) +
                            torch.mean(background_loss)) / 2 + \
                    0.01 * evidence_maximization_loss
        else:
            loss_cls = F.cross_entropy(pred_class_logits, gt_classes, reduction="mean")

    # Compute regression loss:
    if no_instances:
        # TODO 0.0 * pred.sum() is enough since PT1.6
        loss_box_reg = 0.0 * smooth_l1_loss(
            pred_proposal_deltas,
            torch.zeros_like(pred_proposal_deltas),
            0.0,
            reduction="sum",
        )
    else:
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            proposals_boxes.tensor, gt_boxes.tensor)
        box_dim = gt_proposal_deltas.size(1)  # 4 or 5
        cls_agnostic_bbox_reg = pred_proposal_deltas.size(1) == box_dim
        device = pred_proposal_deltas.device

        bg_class_ind = pred_class_logits.shape[1] - 1

        # Box delta loss is only computed between the prediction for the gt class k
        # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on
        # predictions for non-gt classes and background.
        # Empty fg_inds produces a valid loss of zero as long as the size_average
        # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally
        # and would produce a nan loss).
        fg_inds = torch.nonzero((gt_classes >= 0) & (gt_classes < bg_class_ind),
                                as_tuple=True)[0]
        if cls_agnostic_bbox_reg:
            # pred_proposal_deltas only corresponds to foreground class for agnostic
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            fg_gt_classes = gt_classes[fg_inds]
            # pred_proposal_deltas for class k are located in columns
            # [b * k : b * k + b], where b is the dimension of box representation
            # (4 or 5). Note that compared to Detectron1, we do not perform
            # bounding box regression for background classes.
            gt_class_cols = box_dim * fg_gt_classes[:, None] + \
                torch.arange(box_dim, device=device)
            gt_covar_class_cols = self.bbox_cov_dims * fg_gt_classes[:, None] + \
                torch.arange(self.bbox_cov_dims, device=device)

        loss_reg_normalizer = gt_classes.numel()

        pred_proposal_deltas = pred_proposal_deltas[fg_inds[:, None], gt_class_cols]
        gt_proposals_delta = gt_proposal_deltas[fg_inds]

        if self.compute_bbox_cov:
            pred_proposal_covs = pred_proposal_covs[fg_inds[:, None], gt_covar_class_cols]
            pred_proposal_covs = clamp_log_variance(pred_proposal_covs)

            if self.bbox_cov_loss == 'negative_log_likelihood':
                if self.bbox_cov_type == 'diagonal':
                    # Get foreground proposals.
                    _proposals_boxes = proposals_boxes.tensor[fg_inds]

                    # Compute regression negative log likelihood loss according to:
                    # "What Uncertainties Do We Need in Bayesian Deep Learning
                    # for Computer Vision?", NIPS 2017
                    loss_box_reg = 0.5 * torch.exp(-pred_proposal_covs) * smooth_l1_loss(
                        pred_proposal_deltas, gt_proposals_delta,
                        beta=self.smooth_l1_beta)
                    loss_covariance_regularize = 0.5 * pred_proposal_covs
                    loss_box_reg += loss_covariance_regularize

                    loss_box_reg = torch.sum(loss_box_reg) / loss_reg_normalizer
                else:
                    # Multivariate Gaussian negative log likelihood loss using
                    # pytorch distributions.multivariate_normal.log_prob()
                    forecaster_cholesky = covariance_output_to_cholesky(
                        pred_proposal_covs)
                    multivariate_normal_dists = distributions.multivariate_normal.MultivariateNormal(
                        pred_proposal_deltas, scale_tril=forecaster_cholesky)
                    loss_box_reg = -multivariate_normal_dists.log_prob(gt_proposals_delta)
                    loss_box_reg = torch.sum(loss_box_reg) / loss_reg_normalizer

            elif self.bbox_cov_loss == 'second_moment_matching':
                # Compute regression covariance using second moment matching.
                loss_box_reg = smooth_l1_loss(pred_proposal_deltas,
                                              gt_proposals_delta,
                                              self.smooth_l1_beta)
                errors = (pred_proposal_deltas - gt_proposals_delta)
                if self.bbox_cov_type == 'diagonal':
                    # Handle diagonal case
                    second_moment_matching_term = smooth_l1_loss(
                        torch.exp(pred_proposal_covs), errors ** 2,
                        beta=self.smooth_l1_beta)
                    loss_box_reg += second_moment_matching_term
                    loss_box_reg = torch.sum(loss_box_reg) / loss_reg_normalizer
                else:
                    # Handle full covariance case
                    errors = torch.unsqueeze(errors, 2)
                    gt_error_covar = torch.matmul(errors,
                                                  torch.transpose(errors, 2, 1))

                    # This is the cholesky decomposition of the covariance matrix.
                    # We reconstruct it from 10 estimated parameters as a lower
                    # triangular matrix.
                    forecaster_cholesky = covariance_output_to_cholesky(
                        pred_proposal_covs)
                    predicted_covar = torch.matmul(
                        forecaster_cholesky,
                        torch.transpose(forecaster_cholesky, 2, 1))

                    second_moment_matching_term = smooth_l1_loss(
                        predicted_covar, gt_error_covar,
                        beta=self.smooth_l1_beta, reduction='sum')
                    loss_box_reg = (torch.sum(loss_box_reg) +
                                    second_moment_matching_term) / loss_reg_normalizer

            elif self.bbox_cov_loss == 'energy_loss':
                forecaster_cholesky = covariance_output_to_cholesky(pred_proposal_covs)

                # Define per-anchor distributions
                multivariate_normal_dists = distributions.multivariate_normal.MultivariateNormal(
                    pred_proposal_deltas, scale_tril=forecaster_cholesky)
                # Define Monte-Carlo samples
                distributions_samples = multivariate_normal_dists.rsample(
                    (self.bbox_cov_num_samples + 1,))
                distributions_samples_1 = distributions_samples[
                    0:self.bbox_cov_num_samples, :, :]
                distributions_samples_2 = distributions_samples[
                    1:self.bbox_cov_num_samples + 1, :, :]

                # Compute energy score
                loss_covariance_regularize = -smooth_l1_loss(
                    distributions_samples_1,
                    distributions_samples_2,
                    beta=self.smooth_l1_beta,
                    reduction="sum") / self.bbox_cov_num_samples  # Second term

                gt_proposals_delta_samples = torch.repeat_interleave(
                    gt_proposals_delta.unsqueeze(0),
                    self.bbox_cov_num_samples,
                    dim=0)

                loss_first_moment_match = 2.0 * smooth_l1_loss(
                    distributions_samples_1,
                    gt_proposals_delta_samples,
                    beta=self.smooth_l1_beta,
                    reduction="sum") / self.bbox_cov_num_samples  # First term

                # Final Loss
                loss_box_reg = (loss_first_moment_match +
                                loss_covariance_regularize) / loss_reg_normalizer
            else:
                raise ValueError(
                    'Invalid regression loss name {}.'.format(self.bbox_cov_loss))

            # Perform loss annealing. Not really essential in Generalized-RCNN
            # case, but good practice for more elaborate regression variance
            # losses.
            standard_regression_loss = smooth_l1_loss(pred_proposal_deltas,
                                                      gt_proposals_delta,
                                                      self.smooth_l1_beta,
                                                      reduction="sum")
            standard_regression_loss = standard_regression_loss / loss_reg_normalizer

            probabilistic_loss_weight = get_probabilistic_loss_weight(
                current_step, self.annealing_step)
            loss_box_reg = (1.0 - probabilistic_loss_weight) * standard_regression_loss + \
                probabilistic_loss_weight * loss_box_reg
        else:
            loss_box_reg = smooth_l1_loss(pred_proposal_deltas,
                                          gt_proposals_delta,
                                          self.smooth_l1_beta,
                                          reduction="sum")
            loss_box_reg = loss_box_reg / loss_reg_normalizer

    return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
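# Shape sketch (illustrative, not the original classes) for the energy-score
# branch in losses() above: ground-truth deltas are tiled across Monte-Carlo
# samples with repeat_interleave so they align with rsample()'d predictions
# of shape (num_samples, num_boxes, box_dim).
import torch

num_samples, num_boxes, box_dim = 3, 5, 4
gt_deltas = torch.randn(num_boxes, box_dim)
gt_samples = torch.repeat_interleave(gt_deltas.unsqueeze(0), num_samples, dim=0)
assert gt_samples.shape == (num_samples, num_boxes, box_dim)
assert torch.equal(gt_samples[0], gt_samples[-1])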
def forward(self, part, part_seg):
    # part_seg -> one-hot coding
    part_seg = part_seg[:, :, 0]
    part_seg = torch.nn.functional.one_hot(part_seg.to(torch.int64), 12).transpose(1, 2)
    sp_feat, sp_cabins, sp_idx, trans = self.spcoder(
        torch.cat((part_seg.float(), part), 1))
    loss_trans = feature_transform_regularizer(trans)
    pn_feat = self.pncoder(torch.cat((part_seg.float(), part), 1).float())
    pn_feat = pn_feat.unsqueeze(2).expand(
        part.size(0), self.dim_pn, self.num_points).contiguous()
    part_regions = []
    sp_feat_conv = self.ptmapper(sp_feat)
    out_sp_local = []
    out_seg = []
    out_sp_global = []
    out_pcn = []
    for i in range(0, self.n_primitives):
        """
        part_regions.append(
            torch.gather(part, dim=2, index=sp_idx[:, :, i, :].long()))
        """
        # stn3d
        part_regions.append(sp_feat[:, -3:, i, :])
        rand_grid = Variable(
            torch.cuda.FloatTensor(part.size(0), 2, self.num_points // 16))
        rand_grid.data.uniform_(0, 1)
        # here self.num_points // self.n_primitives = 8 * 4
        mesh_grid = torch.meshgrid([
            torch.linspace(0.0, 1.0, 64),
            torch.linspace(0.0, 1.0, self.num_points // 64)
        ])
        mesh_grid = torch.cat(
            (torch.reshape(mesh_grid[0],
                           (self.num_points // self.n_primitives * self.n_primitives, 1)),
             torch.reshape(mesh_grid[1],
                           (self.num_points // self.n_primitives * self.n_primitives, 1))),
            dim=1)
        mesh_grid = torch.transpose(mesh_grid, 0, 1).unsqueeze(0).repeat(
            sp_feat_conv.shape[0], 1, 1)
        mesh_grid = torch.cat(
            (mesh_grid, torch.zeros(part.size(0), 1, mesh_grid.shape[2])), dim=1)
        # y = SoftPool(sp_feat_conv[:, :, i, :])[0][:, :, i, :]
        y = sp_feat_conv[:, :, i, :]
        out_seg.append(y)
        # y = torch.cat((y, pn_feat), 1).contiguous()
        out_sp_local.append(self.decoder1[i](y))
        # pn_feat = torch.max(sp_feat[:, :, :, 0], dim=1)[0].unsqueeze(2).expand(
        #     part.size(0), sp_feat_conv.size(1), mesh_grid.size(2)).contiguous()
        y = torch.cat(
            (rand_grid.repeat(1, 1, 16),
             torch.repeat_interleave(sp_cabins[:, :, i, :],
                                     repeats=self.num_points // 16,
                                     dim=2),
             pn_feat), 1).contiguous()
        out_sp_global.append(self.decoder2[i](y))
        y = torch.cat((mesh_grid.cuda(), pn_feat), 1).contiguous()
        out_pcn = self.decoder3(y)

    # part_regions = torch.cat(part_regions, 2).contiguous()
    out1 = []
    out3 = []
    for i in range(len(part_regions)):
        part_regions[i] = part_regions[i].transpose(1, 2).contiguous()
        out1.append(out_sp_local[i].transpose(1, 2).contiguous())
        out_seg[i] = out_seg[i].transpose(1, 2).contiguous()
        sm = nn.Softmax(dim=2)
        out_seg[i] = sm(out_seg[i])
        out3.append(out_sp_global[i].transpose(1, 2).contiguous())
    out4 = out_pcn.transpose(1, 2).contiguous()
    # out_sp_local = torch.cat(out_sp_local, 2).contiguous()
    # out_sp_global = torch.cat(out_sp_global, 2).contiguous()
    # out_pcn = torch.cat(out_pcn, 2).contiguous()
    # out_seg = torch.cat(out_seg, 2).contiguous()

    dist, _, mean_mst_dis = self.expansion(
        out1[0], self.num_points // self.n_primitives, 1.5)
    loss_mst = torch.mean(dist)
    id0 = torch.zeros(out_sp_local[0].shape[0], 1,
                      out_sp_local[0].shape[2]).cuda().contiguous()
    out_sp_local[0] = torch.cat((out_sp_local[0], id0), 1)
    id1 = torch.ones(part.shape[0], 1, part.shape[2]).cuda().contiguous()
    part = torch.cat((part, id1), 1)
    """
    id2 = torch.zeros(out_sp_global.shape[0], 1,
                      out_sp_global.shape[2]).cuda().contiguous()
    out_sp_global = torch.cat((out_sp_global, id2), 1)
    id3 = torch.zeros(out_pcn.shape[0], 1,
                      out_pcn.shape[2]).cuda().contiguous()
    out_pcn = torch.cat((out_pcn, id3), 1)
    """
    fusion = torch.cat((out_sp_local[0], part), 2)
    # fusion = torch.cat((out_sp_global, out_pcn, part), 2)
    resampled_idx = MDS_module.minimum_density_sample(
        fusion[:, 0:3, :].transpose(1, 2).contiguous(), out1[0].shape[1], mean_mst_dis)
    fusion = MDS_module.gather_operation(fusion, resampled_idx)
    delta = self.res(fusion)
    fusion = fusion[:, 0:3, :]
    out2 = (fusion + delta).transpose(2, 1).contiguous()
    return out1, out2, out3, out4, loss_mst, out_seg, part_regions, loss_trans
def forward(self, S0, V0, rate, BS_vol, indices, z, z1, MC_samples):
    S_old = torch.repeat_interleave(S0, MC_samples, dim=0)
    # Uncomment when using BS Control Variate:
    # BS_old = torch.repeat_interleave(S0, MC_samples, dim=0)
    V_old = torch.repeat_interleave(V0, MC_samples, dim=0)
    K_call = self.strikes_call
    # K_put = self.strikes_put
    zeros = torch.repeat_interleave(torch.zeros(1, 1), MC_samples, dim=0)
    average_SS = torch.Tensor()
    average_SS1 = torch.Tensor()
    average_SS_OTM = torch.Tensor()
    average_SS1_ITM = torch.Tensor()

    # use fixed step size
    h = self.timegrid[1] - self.timegrid[0]
    n_steps = len(self.timegrid) - 1

    # set maturity counter
    countmat = -1

    # Control Variate
    cv = 0

    # Solve for S_t, V_t (Euler scheme); the diffusion networks receive
    # gradients only on 300 randomly chosen timesteps.
    irand = [randrange(0, n_steps + 1, 1) for k in range(300)]
    for i in range(1, len(self.timegrid)):
        dW = (torch.sqrt(h) * z[:, i - 1]).reshape(MC_samples, 1)
        dW1 = (torch.sqrt(h) * z1[:, i - 1]).reshape(MC_samples, 1)
        current_time = torch.ones(1, 1) * self.timegrid[i - 1]
        input_time = torch.repeat_interleave(current_time, MC_samples, dim=0)
        inputNN = torch.cat([input_time.reshape(MC_samples, 1), S_old, V_old], 1)
        inputNNvol = torch.cat([input_time.reshape(MC_samples, 1), V_old], 1)
        input_CV = torch.cat([input_time.reshape(MC_samples, 1), S_old], 1)
        input_CV = S_old  # overrides the time-augmented input above
        cv += self.control_variate(input_CV.detach()) * dW

        if int(i) in irand:
            S_new = S_old + S_old * rate * h + self.diffusion(inputNN) * dW
            V_new = V_old + self.driftV(inputNNvol) * h + \
                self.diffusionV(inputNNvol) * dW + \
                self.diffusionV1(inputNNvol) * dW1
        else:
            S_new = S_old + S_old * rate * h + \
                self.diffusion(inputNN).detach() * dW
            V_new = V_old + self.driftV(inputNNvol).detach() * h + \
                self.diffusionV(inputNNvol).detach() * dW + \
                self.diffusionV1(inputNNvol).detach() * dW1

        # Absorb negative prices at zero.
        S_new = torch.cat([S_new, zeros], 1)
        S_new = torch.max(S_new, 1, keepdim=True)[0]
        S_old = S_new
        V_old = V_new

        # If this particular timestep is a maturity for a vanilla option
        if int(i) in indices:
            countmat += 1
            Z_new = torch.Tensor()
            Z_newP_ITM = torch.Tensor()
            Z_newP_OTM = torch.Tensor()
            Z_new2 = torch.Tensor()
            # countstrikecall = -1

            # Evaluate put (OTM) and call (OTM) option prices
            for strike in K_call:
                # countstrikecall += 1
                # strike_put = torch.ones(1, 1) * K_put[countstrikecall]
                # K_extended_put = torch.repeat_interleave(strike_put, MC_samples, dim=0).float()
                # Since we use the same number of maturities for vanilla calls and puts:
                price = torch.clamp(S_old - strike, 0) - cv
                var_price_no_cv = torch.var(torch.clamp(S_old - strike, 0))
                # price_OTM = torch.cat([K_extended_put - S_old, zeros], 1)  # put OTM
                # Discounting assumes we use 2-year time horizon
                var_price = torch.var(price)
                # price_OTM = torch.max(price_OTM, 1, keepdim=True)[0] * torch.exp(-rate * 1 * i / n_steps)
                Z_new = torch.cat([Z_new, price], 1)

            avg_S = Z_new.mean(dim=0, keepdim=True).T
            average_SS = torch.cat([average_SS, avg_S.T], 0)  # call OTM

    return average_SS, var_price, var_price_no_cv
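# A minimal illustration (not the original model) of the control-variate idea
# in the forward pass above: subtracting a zero-mean variate that is correlated
# with the payoff reduces the Monte-Carlo variance of the price estimate
# without changing its expectation.
import torch

torch.manual_seed(0)
z = torch.randn(100_000)
payoff = torch.clamp(z + 1.0, min=0.0)  # toy call-style payoff
cv = 0.8 * z                            # zero-mean, correlated control variate
assert torch.var(payoff - cv) < torch.var(payoff)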
def train(args, train_loader, val_loader, generator, encoder, g_ema, g_optim,
          d_optim, device):
    max_cat = len(train_loader.dataset.classes)
    train_loader = sample_data(train_loader)
    max_vps = args.max_vps
    # num_nbrs = reproj_consist.num_nbrs

    if args.bins > 1:
        criterion = kldiv_loss
    elif args.soft_l1:
        criterion = nn.SmoothL1Loss()
    else:
        criterion = nn.L1Loss()

    colors = [torch.tensor(c, device=device) for c in gen_colors(max_vps)]
    to_discrete = ToDiscrete(args.bins, smoothing=args.smoothing)
    to_continuous = ToContinuous()
    wc2cc, cc2wc = read_camera_positions('camPosListDodecAzEl.txt', device,
                                         args.loader_type)

    pbar = range(args.iter)
    if get_rank() == 0:
        pbar = tqdm(pbar, initial=args.start_iter, dynamic_ncols=True, smoothing=0.01)

    loss_dict = {}
    r1_loss = torch.tensor(0.0, device=device)
    path_loss = torch.tensor(0.0, device=device)
    path_lengths = torch.tensor(0.0, device=device)
    mean_path_length_avg = 0

    if args.distributed:
        g_module = generator.module
        e_module = encoder.module
    else:
        g_module = generator
        e_module = encoder

    accum = args.decay  # 0.5 ** (32 / (10 * 1000))

    n_fixed_samples = min(8 * max_cat,
                          args.batch if args.val_batch is None else args.val_batch)
    fixed_object, fixed_cat = next(iter(val_loader))
    fixed_object, fixed_cat = fixed_object[:n_fixed_samples], fixed_cat[:n_fixed_samples]
    fixed_vp_in = torch.tensor(np.random.choice(max_vps, n_fixed_samples))
    fixed_cat_in = fixed_cat
    fixed_cat_out = torch.repeat_interleave(fixed_cat, max_vps)
    fixed_input = fixed_object[np.arange(n_fixed_samples), fixed_vp_in]

    if args.bins > 1 and args.loader_type != 'merged':
        fixed_dm_in = to_discrete(fixed_input)
    elif args.bins > 1 and args.loader_type == 'merged' and args.input_quant:
        fixed_dm_in = fixed_input[:, [0]]
        fixed_sil = (fixed_dm_in > -1).float()
        fixed_dm_in = to_discrete(torch.cat((fixed_dm_in, fixed_sil), dim=1))
    elif args.loader_type == 'merged':
        fixed_dm_in = fixed_input[:, [0]]
    else:
        fixed_dm_in = fixed_input

    fixed_dm_in, fixed_vp_in, fixed_cat = \
        fixed_dm_in.to(device), fixed_vp_in.to(device), fixed_cat.to(device)

    if args.loader_type == 'merged':
        # Since first 20 are the inputs, we can directly index them
        fixed_object = fixed_object[:, :, [1, 2]]

    if args.wandb:
        # Render fixed output point clouds
        fixed_pc = [
            depth2cloud(fixed_object[b, v, 0, :, :], cc2wc[v].cpu(), args.size,
                        fixed_object[b, v, 1, :, :], data_type='ortho')
            for b in range(n_fixed_samples) for v in range(max_vps)
        ] if args.load_sil else [
            depth2cloud(fixed_object[b, v, 0, :, :], cc2wc[v].cpu(), args.size,
                        data_type='ortho')
            for b in range(n_fixed_samples) for v in range(max_vps)
        ]
        # Add colors
        fixed_pc = [
            torch.cat([fixed_pc[ii],
                       torch.zeros_like(fixed_pc[ii]) + colors[ii % max_vps].cpu()],
                      dim=1)
            for ii in range(len(fixed_pc))
        ]
        fixed_pc = [
            torch.cat(fixed_pc[b * max_vps:(b + 1) * max_vps])
            for b in range(n_fixed_samples)
        ]
        fixed_pc = [
            wandb.Object3D(fixed_pc[b][:, [2, 0, 1, 3, 4, 5]].data.numpy(),
                           caption="Object_%02d" % b)
            for b in range(n_fixed_samples)
        ]
        # Render inputs
        fixed_im = [
            wandb.Image(fixed_input[ii, 0].data.numpy(),
                        caption="Object_%02d_%02d" % (ii, vp.item()))
            for ii, vp in enumerate(fixed_vp_in)
        ]
        wandb.log({
            "Fixed Input": fixed_im,
            "Fixed PC": fixed_pc,
        }, step=0)

    requires_grad(generator, True)
    requires_grad(encoder, True)

    for idx in pbar:
        i = idx + args.start_iter
        if i > args.iter:
            print('Done!')
            break

        ## Training
        objects, cats = next(train_loader)
        # objects, cats = objects.to(device), cats.to(device)
        B = objects.shape[0]
        vp_in_out = torch.tensor(
            np.random.choice(max_vps, B * args.num_vps * 2).reshape(-1, 2))
        vp_in = vp_in_out[:, 0]
        vp_out = vp_in_out[:, 1]
        batch_ids = np.repeat(np.arange(B), args.num_vps)

        if args.loader_type == 'merged':
            objects_in = objects[:, :, [0]]
            objects_out = objects[:, :, [1, 2]]
        else:
            objects_in = objects_out = objects

        dm_in = objects_in[batch_ids, vp_in]
        dm_out = objects_out[batch_ids, vp_out]
        cats = torch.repeat_interleave(cats, repeats=args.num_vps)
        dm_in, dm_out, vp_in, vp_out, cats = \
            dm_in.to(device), dm_out.to(device), vp_in.to(device), \
            vp_out.to(device), cats.to(device)

        if args.bins > 1 and args.loader_type != 'merged':
            dm_in = to_discrete(dm_in)
            dm_out = to_discrete(dm_out)
        elif args.bins > 1 and args.input_quant:
            # dm_in = to_discrete(dm_in)
            dm_sil = (dm_in > -1).float()
            dm_in = to_discrete(torch.cat((dm_in, dm_sil), dim=1))
            dm_out = to_discrete(dm_out)
        elif args.bins > 1:
            dm_out = to_discrete(dm_out)

        styles = encoder(dm_in, viewpoints=vp_in, categories=cats)
        # Take average of latents over viewpoints to retain only style info
        styles = styles.reshape(B, args.num_vps, -1)
        styles = torch.mean(styles, dim=1).unsqueeze(1)
        styles = styles.expand(-1, args.num_vps, -1).reshape(B * args.num_vps, -1)

        # Reproject this style in new viewpoints
        reprojections, _ = generator([styles], viewpoints=vp_out, categories=cats)
        loss = criterion(reprojections, dm_out)
        loss_dict['reproj_loss_train'] = loss

        d_regularize = args.d_reg_every > 0 and i % args.d_reg_every == 0
        if d_regularize:
            dm_in.requires_grad = True
            r1_loss = d_r1_loss(styles, dm_in)
            loss += args.r1 / 2 * r1_loss * args.d_reg_every + 0 * styles
        loss_dict['r1'] = r1_loss

        g_regularize = args.g_reg_every > 0 and i % args.g_reg_every == 0
        if g_regularize:
            path_batch_size = max(1, B // args.path_batch_shrink)
            path_loss, mean_path_length, path_lengths = g_path_regularize(
                reprojections[:path_batch_size], styles[:path_batch_size],
                mean_path_length)
            weighted_path_loss = args.path_regularize * args.g_reg_every * path_loss
            if args.path_batch_shrink:
                weighted_path_loss += 0 * reprojections[0, 0, 0, 0]
            loss += weighted_path_loss
            mean_path_length_avg = (reduce_sum(mean_path_length).item() /
                                    get_world_size())
        loss_dict['path'] = path_loss
        loss_dict['path_length'] = path_lengths.mean()

        accumulate(g_ema, g_module, accum)

        loss_reduced = reduce_loss_dict(loss_dict)
        reproj_loss_train = loss_reduced['reproj_loss_train'].mean().item()
        r1_val = loss_reduced['r1'].mean().item()
        path_loss_val = loss_reduced['path'].mean().item()
        path_length_val = loss_reduced['path_length'].mean().item()

        encoder.zero_grad()
        generator.zero_grad()
        loss.backward()
        d_optim.step()
        g_optim.step()

        if get_rank() == 0:
            pbar.set_description((
                f'reproj_loss_train: {reproj_loss_train:.4f}; '
                # f'r1: {r1_val:.4f}; '
                # f'path: {path_loss_val:.4f}; '
                # f'mean path: {mean_path_length_avg:.4f}, '
            ))

            if wandb and args.wandb:
                wandb.log({
                    'Train reprojection': reproj_loss_train,
                }, step=i)

            if i % 1000 == 0:
                with torch.no_grad():
                    g_ema.eval()
                    generator.eval()
                    encoder.eval()

                    reproj_loss_val = torch.tensor(0.0)
                    val_count = 0
                    for objects, cats in val_loader:
                        # Reshape appropriately
                        # B, V, C, H, W = objects.shape
                        B = objects.shape[0]
                        objects, cats = objects.to(device), cats.to(device)
                        if args.loader_type == 'merged':
                            objects_in = objects[:, :, [0]]
                            objects_out = objects[:, :, [1, 2]]
                        else:
                            objects_in = objects_out = objects

                        batch_ids = np.repeat(np.arange(B), max_vps)
                        vp_in_out = torch.tensor(B * list(range(max_vps))).to(device)
                        dm_in = objects_in[batch_ids, vp_in_out]
                        dm_out = objects_out[batch_ids, vp_in_out]
                        dm_in, dm_out, vp_in, vp_out, cats = \
                            dm_in.to(device), dm_out.to(device), vp_in.to(device), \
                            vp_out.to(device), cats.to(device)

                        # if args.bins > 1:
                        #     dm_out = to_discrete(dm_out)
                        if args.bins > 1 and args.loader_type != 'merged':
                            dm_in = to_discrete(dm_in)
                            dm_out = to_discrete(dm_out)
                        elif args.bins > 1 and args.input_quant:
                            dm_sil = (dm_in > -1).float()
                            dm_in = to_discrete(torch.cat((dm_in, dm_sil), dim=1))
                            # dm_in = to_discrete(dm_in)
                            dm_out = to_discrete(dm_out)
                        elif args.bins > 1:
                            dm_out = to_discrete(dm_out)

                        cats = torch.repeat_interleave(cats, repeats=max_vps)
                        styles = encoder(dm_in, viewpoints=vp_in_out, categories=cats)
                        # Take average of latents over viewpoints to retain only
                        # style info
                        styles = styles.reshape(B, max_vps, -1)
                        styles = torch.mean(styles, dim=1).unsqueeze(1)
                        styles = styles.expand(-1, max_vps, -1).reshape(B * max_vps, -1)

                        reprojections, _ = g_ema([styles], viewpoints=vp_in_out,
                                                 categories=cats)
                        reproj_loss_val += criterion(reprojections, dm_out)
                        val_count += 1

                        # Do full validation every 5000 steps, else do partial
                        if i % 5000 == 0 and i > 1:
                            continue
                        if val_count > 5:
                            break

                    reproj_loss_val /= val_count
                    reproj_loss_val = reproj_loss_val.item()
                    # loss_dict['reproj_loss_val'] = reproj_loss_val
                    print(f'reproj_loss_valid: {reproj_loss_val:.4f}; ')
                    if args.wandb:
                        wandb.log({
                            "Valid reprojection": reproj_loss_val,
                        }, step=i)

                    styles = encoder(fixed_dm_in, viewpoints=fixed_vp_in,
                                     categories=fixed_cat_in)
                    styles = styles.unsqueeze(1)
                    styles = styles.expand(-1, max_vps, -1).reshape(
                        n_fixed_samples * max_vps, -1)
                    fixed_vp_out = torch.tensor(
                        n_fixed_samples * list(range(max_vps))).to(device)
                    reprojections, _ = g_ema([styles], viewpoints=fixed_vp_out,
                                             categories=fixed_cat_out)
                    if args.bins > 1:
                        reprojections = to_continuous(reprojections)

                    # PC reconstructions for val data
                    recon_pc = [
                        depth2cloud(reprojections[ii, 0, :, :], cc2wc[vp], args.size,
                                    reprojections[ii, 1, :, :],
                                    data_type=args.loader_type)
                        for ii, vp in enumerate(fixed_vp_out)
                    ] if args.load_sil else [
                        depth2cloud(reprojections[ii, 0, :, :], cc2wc[vp], args.size,
                                    data_type=args.loader_type)
                        for ii, vp in enumerate(fixed_vp_out)
                    ]
                    # Add colors
                    recon_pc = [
                        torch.cat([recon_pc[ii],
                                   torch.zeros_like(recon_pc[ii]) + colors[vp]],
                                  dim=1)
                        for ii, vp in enumerate(fixed_vp_out)
                    ]
                    # print(len(recon_pc), B, max_vps, num_nbrs)
                    recon_pc = [
                        torch.cat(recon_pc[b * max_vps:(b + 1) * max_vps])
                        for b in range(n_fixed_samples)
                    ]
                    recon_pc = [
                        wandb.Object3D(
                            recon_pc[b][:, [2, 0, 1, 3, 4, 5]].data.cpu().numpy(),
                            caption="Object_%02d" % b)
                        for b in range(n_fixed_samples)
                    ]
                    wandb.log({
                        "Recon PC": recon_pc,
                    }, step=i)

                torch.save(
                    {
                        'g': g_module.state_dict(),
                        'd': e_module.state_dict(),
                        'g_ema': g_ema.state_dict(),
                        'g_optim': g_optim.state_dict(),
                        'd_optim': d_optim.state_dict(),
                    },
                    args.ckpt_save_directory + f'/{str(i).zfill(6)}.pt',
                )

                requires_grad(generator, True)
                requires_grad(encoder, True)
                encoder.train()
                generator.train()
def run_one_epoch(self, model, optimizer_info={}, cur_epoch=1, mode='', freeze=False):
    model = model.to(self.device)
    if mode == 'train':
        model.train()
        optimizer = torch.optim.AdamW(
            # [
            #     {'params': model.encoder.parameters(), 'lr': optimizer_info['lr']},
            #     {'params': model.LSTM.parameters(), 'lr': optimizer_info['lr']},
            #     {'params': model.decoder.parameters(), 'lr': optimizer_info['lr']},
            #     {'params': model.MF.parameters(), 'lr': optimizer_info['lr']},
            # ],
            model.parameters(),
            lr=optimizer_info['lr'],
            weight_decay=optimizer_info['weight_decay'])
        if freeze:
            model.embedding_layer.eval()
            for param in model.embedding_layer.parameters():
                param.requires_grad = False
    else:
        model.eval()

    total_loss = 0.
    start_time = time.time()
    batch_id = 0
    log_interval = 256
    total_prediction = []
    total_label = []
    total_output = []
    for data, masked_data in self.data_loader[mode]:
        input_name = model.embedding_layer.input_name
        mlm_loss = 0.
        if mode == 'train':
            masked_inputs = {}
            for name in input_name:
                masked_inputs[name] = masked_data[name].to(self.device)
            _, _, mlm_outputs, _, _ = model(
                src=masked_inputs,
                src_mask=data['src_mask'].to(self.device),
                segment_info=data['segment_info'].to(self.device))
            for name in input_name:
                mask_label = torch.masked_select(input=data[name],
                                                 mask=masked_data['mask'])
                mask_label = mask_label - 1  # ids start from 0
                mlm_output = mlm_outputs[name].cpu()
                # mlm_output: (batch_size, seq_len, output_dim)
                #          --> (batch_size, mask_num, output_dim)
                mlm_output = torch.masked_select(
                    input=mlm_output,
                    mask=torch.repeat_interleave(masked_data['mask'].unsqueeze(-1),
                                                 repeats=mlm_output.size(-1),
                                                 dim=-1)).view(-1, mlm_output.size(-1))
                mlm_loss += torch.nn.CrossEntropyLoss()(input=mlm_output,
                                                        target=mask_label)

        inputs = {}
        for name in input_name:
            inputs[name] = data[name].to(self.device)
        task_label = data['label'].view(-1)
        query = data['query']
        cur_batch_size = task_label.size()[0]
        _, _, _, task_output, attention_weights = model(
            src=inputs,
            segment_info=data['segment_info'].to(self.device),
            src_mask=data['src_mask'].to(self.device))
        task_output = task_output.cpu()
        task_output = torch.gather(task_output, dim=1, index=query - 1).view(-1)
        task_loss = torch.nn.BCELoss()(input=task_output, target=task_label)
        loss = mlm_loss / len(input_name) + task_loss
        total_loss += loss.item() * cur_batch_size

        prediction = torch.where(task_output > 0.5, 1, 0)
        total_prediction.extend(prediction.view(-1).detach().numpy())
        total_label.extend(task_label.view(-1).detach().numpy())
        total_output.extend(task_output.view(-1).detach().numpy())

        if mode == 'train':
            # Gradient clipping against exploding gradients; norms above 0.5
            # would be truncated:
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if batch_id % log_interval == 0 and batch_id > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | s {:5.2f} | '
                  'loss {:5.3f} | ppl {:8.3f}'.format(
                      cur_epoch, batch_id,
                      len(self.data_loader[mode].dataset) // self.batch_size,
                      elapsed, loss.item(), math.exp(loss.item())))
            start_time = time.time()
        batch_id += 1

    if mode == 'test':
        attention_weight = attention_weights[0][0].unsqueeze(
            0).cpu().detach().numpy()  # (BHCW)
        attention_weight = (1 - attention_weight) * 255
        self.writer.add_image(tag='attention_weights',
                              img_tensor=attention_weight,
                              global_step=cur_epoch)
    auc = sklearn.metrics.roc_auc_score(total_label, total_output)
    acc = sklearn.metrics.accuracy_score(total_label, total_prediction)
    return model, total_loss / (len(self.data_loader[mode].dataset) - 1), auc, acc
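# Sketch of the masked-token gather in run_one_epoch above: a (B, L) boolean
# MLM mask is broadcast along the output dim via repeat_interleave so that
# masked_select keeps whole logit rows, which view(-1, D) then restacks.
# Sizes are illustrative.
import torch

B, L, D = 2, 4, 6
logits = torch.randn(B, L, D)
mask = torch.zeros(B, L, dtype=torch.bool)
mask[0, 1] = mask[1, 3] = True
rows = torch.masked_select(
    logits,
    torch.repeat_interleave(mask.unsqueeze(-1), repeats=D, dim=-1)).view(-1, D)
assert rows.shape == (2, D)
assert torch.equal(rows[0], logits[0, 1])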
def train(self, replay_buffer, iterations, batch_size=64):
    for it in range(iterations):
        # Sample replay buffer / batch
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        # Variational Auto-Encoder Training
        recon, mean, std = self.vae(state, action)
        recon_loss = F.mse_loss(recon, action)
        # KL divergence between a (component-wise independent) Gaussian and
        # the standard normal
        KL_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) - std.pow(2)).mean()
        vae_loss = recon_loss + 0.5 * KL_loss
        self.vae_loss.append(vae_loss)

        # Clear the gradients
        self.vae_optimizer.zero_grad()
        vae_loss.backward()
        self.vae_optimizer.step()

        # Critic Training
        with torch.no_grad():
            # Duplicate next state 10 times
            next_state = torch.repeat_interleave(next_state, 10, 0)

            # Compute value of perturbed actions sampled from the VAE
            target_Q1, target_Q2 = self.critic_target(
                next_state,
                self.actor_target(next_state, self.vae.decode(next_state)))

            # Soft Clipped Double Q-learning
            target_Q = self.lmbda * torch.min(target_Q1, target_Q2) + \
                (1. - self.lmbda) * torch.max(target_Q1, target_Q2)
            # Take max over each action sampled from the VAE
            target_Q = target_Q.reshape(batch_size, -1).max(1)[0].reshape(-1, 1)
            target_Q = reward + not_done * self.discount * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + \
            F.mse_loss(current_Q2, target_Q)
        self.critic_loss.append(critic_loss)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Perturbation Model / Action Training
        sampled_actions = self.vae.decode(state)
        perturbed_actions = self.actor(state, sampled_actions)

        # Update through DPG
        actor_loss = -self.critic.q1(state, perturbed_actions).mean()
        self.actor_loss.append(actor_loss)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks (soft update)
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
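# A minimal sketch (assumed shapes, not the original classes) of the BCQ trick
# used in the critic update above: each next state is duplicated 10 times so
# that 10 VAE-sampled actions can be scored per state; reshape(batch_size, -1)
# then groups the duplicates so the max over candidates is taken per state.
import torch

batch_size, state_dim, n_dup = 4, 3, 10
next_state = torch.randn(batch_size, state_dim)
dup = torch.repeat_interleave(next_state, n_dup, 0)            # (40, 3), copies adjacent
q = torch.randn(batch_size * n_dup, 1)                         # stand-in critic output
target_q = q.reshape(batch_size, -1).max(1)[0].reshape(-1, 1)  # (4, 1), max per state
assert target_q.shape == (batch_size, 1)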
def repeat_frames(input, n_frames, dim_t=1):
    """
    Repeat an input of shape [b x ... x H x W] n_frames times to generate
    a tensor of shape [b x n_frames x ... x H x W].
    """
    input = input.unsqueeze(dim_t)
    input = torch.repeat_interleave(input, n_frames, dim=dim_t)
    return input
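# Quick check of repeat_frames above (illustrative sizes): a [b, C, H, W]
# batch becomes [b, n_frames, C, H, W] with identical copies along the new
# time axis.
import torch

frames = repeat_frames(torch.randn(2, 3, 8, 8), n_frames=5)
assert frames.shape == (2, 5, 3, 8, 8)
assert torch.equal(frames[:, 0], frames[:, 4])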
def forward(
    self,
    *,
    observations: types.ObservationsTorch,
    controls: types.ControlsTorch,
) -> types.StatesTorch:
    """Particle filter forward pass, single timestep.

    Args:
        observations (dict or torch.Tensor): observation inputs. should be
            either a dict of tensors or tensor of shape `(N, ...)`.
        controls (dict or torch.Tensor): control inputs. should be either a
            dict of tensors or tensor of shape `(N, ...)`.

    Returns:
        torch.Tensor: Predicted state for each batch element. Shape should be
        `(N, state_dim)`.
    """
    # Make sure our particle filter's been initialized
    assert self._initialized, "Particle filter not initialized!"

    # Get our batch size (N), current particle count (M), & state dimension
    N, M, state_dim = self.particle_states.shape
    assert state_dim == self.state_dim
    assert len(fannypack.utils.SliceWrapper(controls)) == N

    # Decide whether or not we're resampling
    resample = self.resample
    if resample is None:
        # If not explicitly set, we disable resampling in train mode (to allow
        # gradients to propagate through time) and enable in eval mode (to
        # prevent particle deprivation)
        resample = not self.training

    # If we're not resampling and our current particle count doesn't match
    # our desired particle count, we need to either expand or contract our
    # particle set
    if not resample and self.num_particles != M:
        indices = self.particle_states.new_zeros(
            (N, self.num_particles), dtype=torch.long
        )

        # If output particles > our input particles, for the beginning part we
        # copy particles directly to reduce variance
        copy_count = (self.num_particles // M) * M
        if copy_count > 0:
            indices[:, :copy_count] = torch.arange(M).repeat(copy_count // M)[
                None, :
            ]

        # For remaining particles, we sample w/o replacement (also lowers variance)
        remaining_count = self.num_particles - copy_count
        assert remaining_count >= 0
        if remaining_count > 0:
            indices[:, copy_count:] = torch.randperm(M, device=indices.device)[
                None, :remaining_count
            ]

        # Gather new particles, weights
        M = self.num_particles
        self.particle_states = self.particle_states.gather(
            1, indices[:, :, None].expand((N, M, state_dim))
        )
        self.particle_log_weights = self.particle_log_weights.gather(1, indices)
        assert self.particle_states.shape == (N, self.num_particles, state_dim)
        assert self.particle_log_weights.shape == (N, self.num_particles)

        # Normalize particle weights to sum to 1.0
        self.particle_log_weights = self.particle_log_weights - torch.logsumexp(
            self.particle_log_weights, dim=1, keepdim=True
        )

    # Propagate particles through our dynamics model
    # A bit of extra effort is required for the extra particle dimension
    # > For our states, we flatten along the N/M axes
    # > For our controls, we repeat each one `M` times, if M=3:
    #       [u0 u1 u2] should become [u0 u0 u0 u1 u1 u1 u2 u2 u2]
    #
    # Currently each of the M particles within a "sample" get the same action,
    # but we could also add noise in the action space (a la Jonschkowski et
    # al. 2018)
    reshaped_states = self.particle_states.reshape(-1, self.state_dim)
    reshaped_controls = fannypack.utils.SliceWrapper(controls).map(
        lambda tensor: torch.repeat_interleave(tensor, repeats=M, dim=0)
    )
    predicted_states, scale_trils = self.dynamics_model(
        initial_states=reshaped_states, controls=reshaped_controls
    )
    self.particle_states = (
        torch.distributions.MultivariateNormal(
            loc=predicted_states, scale_tril=scale_trils
        )
        .rsample()  # Note that we use `rsample` to make sampling differentiable
        .view(N, M, self.state_dim)
    )
    assert self.particle_states.shape == (N, M, self.state_dim)

    # Re-weight particles using observations
    self.particle_log_weights = self.particle_log_weights + self.measurement_model(
        states=self.particle_states,
        observations=observations,
    )

    # Normalize particle weights to sum to 1.0
    self.particle_log_weights = self.particle_log_weights - torch.logsumexp(
        self.particle_log_weights, dim=1, keepdim=True
    )

    # Compute output
    state_estimates: types.StatesTorch
    if self.estimation_method == "weighted_average":
        state_estimates = torch.sum(
            torch.exp(self.particle_log_weights[:, :, np.newaxis])
            * self.particle_states,
            dim=1,
        )
    elif self.estimation_method == "argmax":
        best_indices = torch.argmax(self.particle_log_weights, dim=1)
        state_estimates = torch.gather(
            self.particle_states, dim=1, index=best_indices
        )
    else:
        assert False, "Unsupported estimation method!"

    # Resampling
    if resample:
        self._resample()

    # Post-condition :)
    assert state_estimates.shape == (N, state_dim)
    assert self.particle_states.shape == (N, self.num_particles, state_dim)
    assert self.particle_log_weights.shape == (N, self.num_particles)

    return state_estimates
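# Sketch of the control-broadcast step in the particle filter above: states
# are flattened over (N, M) while each control is repeated M times, so row k
# of the flattened states lines up with its own copy of the right control
# ([u0 u0 u0 u1 u1 u1] for M=3). Shapes are illustrative.
import torch

N, M, state_dim, control_dim = 2, 3, 4, 2
states = torch.randn(N, M, state_dim).reshape(-1, state_dim)        # (6, 4)
controls = torch.randn(N, control_dim)
controls_rep = torch.repeat_interleave(controls, repeats=M, dim=0)  # (6, 2)
assert torch.equal(controls_rep[0], controls_rep[M - 1])  # same control within a sample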
def other_ops(self):
    a = torch.randn(4)
    b = torch.randn(4)
    c = torch.randint(0, 8, (5,), dtype=torch.int64)
    e = torch.randn(4, 3)
    f = torch.randn(4, 4, 4)
    size = [0, 1]
    dims = [0, 1]
    return (
        torch.atleast_1d(a),
        torch.atleast_2d(a),
        torch.atleast_3d(a),
        torch.bincount(c),
        torch.block_diag(a),
        torch.broadcast_tensors(a),
        torch.broadcast_to(a, (4)),
        # torch.broadcast_shapes(a),
        torch.bucketize(a, b),
        torch.cartesian_prod(a),
        torch.cdist(e, e),
        torch.clone(a),
        torch.combinations(a),
        torch.corrcoef(a),
        # torch.cov(a),
        torch.cross(e, e),
        torch.cummax(a, 0),
        torch.cummin(a, 0),
        torch.cumprod(a, 0),
        torch.cumsum(a, 0),
        torch.diag(a),
        torch.diag_embed(a),
        torch.diagflat(a),
        torch.diagonal(e),
        torch.diff(a),
        torch.einsum("iii", f),
        torch.flatten(a),
        torch.flip(e, dims),
        torch.fliplr(e),
        torch.flipud(e),
        torch.kron(a, b),
        torch.rot90(e),
        torch.gcd(c, c),
        torch.histc(a),
        torch.histogram(a),
        torch.meshgrid(a),
        torch.meshgrid(a, indexing="xy"),
        torch.lcm(c, c),
        torch.logcumsumexp(a, 0),
        torch.ravel(a),
        torch.renorm(e, 1, 0, 5),
        torch.repeat_interleave(c),
        torch.roll(a, 1, 0),
        torch.searchsorted(a, b),
        torch.tensordot(e, e),
        torch.trace(e),
        torch.tril(e),
        torch.tril_indices(3, 3),
        torch.triu(e),
        torch.triu_indices(3, 3),
        torch.vander(a),
        torch.view_as_real(torch.randn(4, dtype=torch.cfloat)),
        torch.view_as_complex(torch.randn(4, 2)),
        torch.resolve_conj(a),
        torch.resolve_neg(a),
    )
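# Note on the single-argument form torch.repeat_interleave(c) exercised in
# other_ops above: given a 1-D integer tensor, each index i is repeated c[i]
# times:
import torch

c = torch.tensor([2, 0, 3])
assert torch.equal(torch.repeat_interleave(c), torch.tensor([0, 0, 2, 2, 2]))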
def attribute(
    self,
    inputs,
    baselines=None,
    target=None,
    n_steps=500,
    method="riemann_trapezoid",
):
    r"""
    Computes conductance using gradients along the path, applying riemann's
    method or gauss-legendre. The details of the approach can be found here:
    https://arxiv.org/abs/1805.12233

    Args:
        inputs:    A single high dimensional input tensor, in which dimension 0
                   corresponds to the number of examples.
        baselines: A single high dimensional baseline tensor, which has the
                   same shape as the input.
        target:    Predicted class index. This is necessary only for
                   classification use cases.
        n_steps:   The number of steps used by the approximation method.
        method:    Method for integral approximation, one of `riemann_right`,
                   `riemann_middle`, `riemann_trapezoid` or `gausslegendre`.

    Returns:
        attributions: Total conductance with respect to each neuron in the
                      output of the given layer.
    """
    if baselines is None:
        baselines = 0
    gradient_mask = apply_gradient_requirements((inputs,))
    # retrieve step size and scaling factor for specified approximation method
    step_sizes_func, alphas_func = approximation_parameters(method)
    step_sizes, alphas = step_sizes_func(n_steps), alphas_func(n_steps)

    # compute scaled inputs from baseline to final input.
    scaled_features = torch.cat(
        [baselines + alpha * (inputs - baselines) for alpha in alphas], dim=0
    )

    # Conductance Gradients - Returns gradient of output with respect to
    # hidden layer, gradient of hidden layer with respect to input,
    # and number of hidden units.
    input_gradients, mid_layer_gradients, hidden_units = self._conductance_grads(
        self.forward_func, scaled_features, target
    )
    # Multiply gradient of hidden layer with respect to input by input - baseline
    scaled_input_gradients = torch.repeat_interleave(
        inputs - baselines, hidden_units, dim=0
    )
    scaled_input_gradients = input_gradients * scaled_input_gradients.repeat(
        *([len(alphas)] + [1] * (len(scaled_input_gradients.shape) - 1))
    )

    # Sum gradients for each input neuron in order to have total
    # for each hidden unit and reshape to match hidden layer shape
    summed_input_grads = torch.sum(
        scaled_input_gradients, tuple(range(1, len(scaled_input_gradients.shape)))
    ).view_as(mid_layer_gradients)

    # Rescale gradients of hidden layer by step size.
    scaled_grads = mid_layer_gradients.contiguous().view(
        n_steps, -1
    ) * torch.tensor(step_sizes).view(n_steps, 1).to(mid_layer_gradients.device)

    undo_gradient_requirements((inputs,), gradient_mask)

    # Element-wise multiply gradient of output with respect to hidden layer
    # and summed gradients with respect to input (chain rule) and sum across
    # stepped inputs.
    return _reshape_and_sum(
        scaled_grads.view(mid_layer_gradients.shape) * summed_input_grads,
        n_steps,
        inputs.shape[0],
        mid_layer_gradients.shape[1:],
    )
def forward(self, x, n):
    batch_dim = x.shape[0]
    mu = self.mu_head(x)
    mu = torch.repeat_interleave(mu, n, 0)
    return mu
def _conductance_grads(self, forward_fn, input, target_ind=None):
    with torch.autograd.set_grad_enabled(True):
        # Set a forward hook on specified module and run forward pass to
        # get output tensor size.
        saved_tensor = None

        def forward_hook(module, inp, out):
            nonlocal saved_tensor
            saved_tensor = out

        hook = self.layer.register_forward_hook(forward_hook)
        output = forward_fn(input)

        # Compute layer output tensor dimensions and total number of units.
        # The hidden layer tensor is assumed to have dimension (num_hidden, ...)
        # where the product of the dimensions >= 1 correspond to the total
        # number of hidden neurons in the layer.
        layer_size = tuple(saved_tensor.size())[1:]
        layer_units = int(np.prod(layer_size))

        # Remove unnecessary forward hook.
        hook.remove()

        # Backward hook function to override gradients in order to obtain
        # just the gradient of each hidden unit with respect to input.
        saved_grads = None

        def backward_hook(grads):
            nonlocal saved_grads
            saved_grads = grads
            zero_mat = torch.zeros((1,) + layer_size)
            scatter_indices = torch.arange(0, layer_units).view_as(zero_mat)
            # Creates matrix with each layer containing a single unit with
            # value 1 and remaining zeros, which will provide gradients
            # with respect to each unit independently.
            to_return = torch.zeros((layer_units,) + layer_size).scatter(
                0, scatter_indices, 1
            )
            to_repeat = [1] * len(to_return.shape)
            to_repeat[0] = grads.shape[0] // to_return.shape[0]
            expanded = to_return.repeat(to_repeat)
            return expanded

        # Create a forward hook in order to attach backward hook to appropriate
        # tensor. Save backward hook in order to remove hook appropriately.
        back_hook = None

        def forward_hook_register_back(module, inp, out):
            nonlocal back_hook
            back_hook = out.register_hook(backward_hook)

        hook = self.layer.register_forward_hook(forward_hook_register_back)

        # Expand input to include layer_units copies of each input.
        # This allows obtaining gradient with respect to each hidden unit
        # in one pass.
        expanded_input = torch.repeat_interleave(input, layer_units, dim=0)
        output = forward_fn(expanded_input)
        hook.remove()
        output = output[:, target_ind] if target_ind is not None else output
        input_grads = torch.autograd.grad(torch.unbind(output), expanded_input)

        # Remove backwards hook
        back_hook.remove()

        # Remove duplicates in gradient with respect to hidden layer,
        # choose one for each layer_units indices.
        output_mid_grads = torch.index_select(
            saved_grads,
            0,
            torch.tensor(range(0, input_grads[0].shape[0], layer_units)),
        )
    return input_grads[0], output_mid_grads, layer_units
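# Shape sketch for the expansion in _conductance_grads above: each input
# example is repeated `layer_units` times so a single backward pass yields the
# gradient of every hidden unit with respect to its own copy of the input.
# Sizes are illustrative.
import torch

n_examples, n_features, layer_units = 2, 5, 3
inp = torch.randn(n_examples, n_features)
expanded = torch.repeat_interleave(inp, layer_units, dim=0)  # (6, 5)
# rows [0:3] are copies of example 0, rows [3:6] of example 1
assert torch.equal(expanded[0], expanded[layer_units - 1])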
def _gconv(self, inputs, adj_mx, global_embs, state, option='r'):
    r"""Graph attention network based convolution computation.

    Args:
        inputs: input vector, with shape (batch_size, self.num_nodes, self.rnn_units).
        adj_mx: adjacency matrix, with shape (self.num_nodes, self.num_nodes).
        global_embs: global embedding matrix, with shape (self.num_nodes, self.rnn_units).
        state: hidden vectors from the last unit, with shape (batch_size, self.num_nodes,
            self.rnn_units). If this is the first unit, state is usually a zero vector.
        option: whether the output is the reset gate vector ('r'), the update gate
            vector ('u'), or the candidate hidden vector ('c').

    Returns:
        out: output; the reset gate vector (option 'r'), the update gate vector
            (option 'u'), or the candidate hidden vector (option 'c').
    """
    batch_size = inputs.shape[0]
    num_nodes = self.num_nodes
    x = torch.cat([inputs, state], dim=-1)  # input_dim
    out = torch.zeros(size=(batch_size, num_nodes, self.rnn_units), device=device)
    # Select the parameter set for the requested gate; the three gates share the
    # same computation and differ only in their weights and biases.
    all_weights = {'r': self.r_weights, 'u': self.u_weights, 'c': self.c_weights}[option]
    all_biases = {'r': self.r_biases, 'u': self.u_biases, 'c': self.c_biases}[option]
    for relation_id in range(self.num_relation_types - 1):
        weights_left = all_weights[:2 * self.rnn_units, :, relation_id]
        biases_left = all_biases[:self.rnn_units, relation_id]
        weights_right = weights_left if self.share_attn_weights else all_weights[
            2 * self.rnn_units:, :, relation_id]
        biases_right = biases_left if self.share_attn_weights else all_biases[
            self.rnn_units:, relation_id]
        x_left = torch.matmul(x, weights_left) + biases_left
        x_right = torch.matmul(x, weights_right) + biases_right

        i, j = torch.nonzero(adj_mx[:, :, relation_id], as_tuple=True)
        i, j = i.to(device), j.to(device)
        x_left_per_edge = x_left.index_select(1, i)
        x_right_per_edge = x_right.index_select(1, j)
        x_per_edge = x_left_per_edge + x_right_per_edge
        x_per_edge = nn.functional.leaky_relu(x_per_edge, self.negative_slope)
        alpha = (x_per_edge * global_embs[i]).sum(dim=2)
        alpha = softmax(alpha, index=i, num_nodes=num_nodes, dim=1)

        # Scatter the per-edge attention coefficients into dense (B, N, N) matrices.
        attns = torch.zeros([batch_size, num_nodes, num_nodes], device=device)
        batch_idxs = torch.arange(batch_size, device=device)
        batch_expand = torch.repeat_interleave(batch_idxs, len(i), dim=0)
        i_expand = torch.repeat_interleave(i.view(1, -1), batch_size, dim=0).view(-1)
        j_expand = torch.repeat_interleave(j.view(1, -1), batch_size, dim=0).view(-1)
        indices = (batch_expand, i_expand, j_expand)
        attns.index_put_(indices, alpha.view(-1))

        # Nodes with no neighbors under this relation get uniform coefficients.
        zero_mask = (adj_mx[:, :, relation_id] == 0).unsqueeze(0).repeat_interleave(
            batch_size, dim=0)
        zero_coeffs = torch.ones([batch_size, num_nodes, num_nodes],
                                 device=device) / zero_mask.float().sum(dim=-1, keepdim=True)
        attns[zero_mask] = zero_coeffs[zero_mask]
        out += torch.bmm(adj_mx[:, :, relation_id] * attns, x_right) + x_left
    return out
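# A small self-contained sketch of the edge-to-dense scatter used in _gconv above:
# per-edge attention coefficients alpha (batch_size x num_edges) are written into a
# dense (batch_size, num_nodes, num_nodes) matrix via index_put_. Shapes are
# hypothetical; only torch is assumed.
import torch

batch_size, num_nodes = 2, 4
i = torch.tensor([0, 0, 2])            # source node of each edge
j = torch.tensor([1, 3, 2])            # target node of each edge
alpha = torch.rand(batch_size, len(i))
attns = torch.zeros(batch_size, num_nodes, num_nodes)
b = torch.repeat_interleave(torch.arange(batch_size), len(i))  # 0,0,0,1,1,1
i_expand = i.repeat(batch_size)        # edge rows tiled per batch element
j_expand = j.repeat(batch_size)
attns.index_put_((b, i_expand, j_expand), alpha.view(-1))
assert attns[1, 0, 3] == alpha[1, 1]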
def policy(
    self,
    predictor: Union[RLPredictor, OnPolicyPredictor, None],
    state,
    test,
    state_preprocessor=None,
) -> Tuple[torch.Tensor, float]:
    """
    Selects the next action.

    :param predictor: RLPredictor/OnPolicyPredictor object whose policy to
        follow. If set to None, use a random policy.
    :param state: State to evaluate predictor's policy on.
    :param test: Whether or not to bypass exploration (if predictor is not
        None). For discrete action problems, the exploration policy is
        epsilon-greedy. For continuous action problems, the exploration is
        achieved by adding noise to action outputs.
    :param state_preprocessor: State preprocessor to use to preprocess states
    """
    assert len(state.size()) == 1
    # Convert state to batch of size 1
    state = state.unsqueeze(0)
    if predictor is None or (not test
                             and self.action_type == EnvType.DISCRETE_ACTION
                             and float(torch.rand(1)) < self.epsilon):
        raw_action, _, action_probability = self.sample_policy(
            state=None, use_continuous_action=False)
        if self.action_type == EnvType.DISCRETE_ACTION:
            action = torch.zeros([self.action_dim])
            action[raw_action] = 1.0
            return action, action_probability
        return raw_action, action_probability
    action = torch.zeros([self.action_dim])
    if state_preprocessor:
        state = state_preprocessor.forward(state)
    if isinstance(predictor, DQNPredictor):
        action_probability = 1.0 if test else 1.0 - self.epsilon
        # Use DQNPredictor directly - useful to test caffe2 predictor
        # assumes state preprocessor already part of predictor net.
        sparse_states = predictor.in_order_dense_to_sparse(state)
        q_values = predictor.predict(sparse_states)
        action_idx = int(max(q_values[0], key=q_values[0].get)) - self.state_dim
        action[action_idx] = 1.0
        return action, action_probability
    elif isinstance(predictor, ParametricDQNPredictor):
        # Needs to get a list of candidate actions if actions are continuous
        if self.action_type == EnvType.CONTINUOUS_ACTION:
            raise NotImplementedError()
        action_probability = 1.0 if test else 1.0 - self.epsilon
        state = np.repeat(state, repeats=self.action_dim, axis=0)
        sparse_states = predictor.in_order_dense_to_sparse(state)
        sparse_actions = [{
            str(i + self.state_dim): 1
        } for i in range(self.action_dim)]
        q_values = predictor.predict(sparse_states, sparse_actions)
        q_values = np.fromiter(
            map(lambda x: x["Q"], q_values), float  # np.float is deprecated
        ).reshape(self.action_dim)
        action_idx = np.argmax(q_values)
        action[action_idx] = 1.0
        return action, action_probability
    elif predictor.policy_net():  # type: ignore
        action_set = predictor.policy(state)  # type: ignore
        action, action_probability = action_set.greedy, action_set.greedy_propensity
        action = action[0, :]
        return action, action_probability
    else:
        action_probability = 1.0 if test else 1.0 - self.epsilon
        if predictor.discrete_action():  # type: ignore
            policy_action_set = predictor.policy(  # type: ignore
                state, torch.ones([1, self.action_dim]))
        else:
            # torch.repeat_interleave takes dim, not the numpy-style axis keyword
            states_tiled = torch.repeat_interleave(state,
                                                   repeats=self.action_dim,
                                                   dim=0)
            policy_action_set = predictor.policy(  # type: ignore
                states_tiled,
                (torch.eye(self.action_dim), torch.ones((self.action_dim, 1))),
            )
        if self.softmax_policy:
            action[policy_action_set.softmax] = 1.0
        else:
            action[policy_action_set.greedy] = 1.0
        return action, action_probability
def beam_search(decoder: nn.Module,
                att_net: nn.Module,
                enc_out: th.Tensor,
                lm: Optional[LmType] = None,
                ctc_prob: Optional[th.Tensor] = None,
                lm_weight: float = 0,
                beam_size: int = 8,
                nbest: int = 1,
                max_len: int = -1,
                max_len_ratio: float = 1,
                min_len: int = 0,
                min_len_ratio: float = 0,
                sos: int = -1,
                eos: int = -1,
                unk: int = -1,
                len_norm: bool = True,
                end_detect: bool = False,
                ctc_weight: float = 0,
                len_penalty: float = 0,
                cov_penalty: float = 0,
                temperature: float = 1,
                allow_partial: bool = False,
                cov_threshold: float = 0.5,
                eos_threshold: float = 1) -> List[Dict]:
    """
    Vectorized beam search algorithm (see batch version beam_search_batch)
    Args
        decoder (nn.Module): decoder network
        att_net (nn.Module): attention network
        enc_out (Tensor): 1 x T x F, encoder output
    """
    if sos < 0 or eos < 0:
        raise RuntimeError(f"Invalid SOS/EOS ID: {sos:d}/{eos:d}")
    N, T, D_enc = enc_out.shape
    if N != 1:
        raise RuntimeError(
            f"Got batch size {N:d}, now only support one utterance")
    if not hasattr(decoder, "step"):
        raise RuntimeError("Function step should be defined in decoder network")
    if beam_size > decoder.vocab_size:
        raise RuntimeError(f"Beam size({beam_size}) > vocabulary size")

    min_len = max(min_len, int(min_len_ratio * T))
    max_len = min(max_len, int(max_len_ratio * T)) if max_len_ratio > 0 else T
    logger.info(f"--- shape of the encoder output: {T} x {D_enc}")
    logger.info("--- length constraint of the decoding " +
                f"sequence: ({min_len}, {max_len})")
    nbest = min(beam_size, nbest)
    device = enc_out.device
    att_ali = None
    dec_hid = None
    # N x T x F => N*beam x T x F
    enc_out = th.repeat_interleave(enc_out, beam_size, 0)
    att_ctx = th.zeros([N * beam_size, D_enc], device=device)
    proj = th.zeros([N * beam_size, D_enc], device=device)
    beam_param = BeamSearchParam(beam_size=beam_size,
                                 sos=sos,
                                 eos=eos,
                                 unk=unk,
                                 device=device,
                                 min_len=min_len,
                                 max_len=max_len,
                                 len_norm=len_norm,
                                 lm_weight=lm_weight,
                                 end_detect=end_detect,
                                 ctc_weight=ctc_weight,
                                 len_penalty=len_penalty,
                                 cov_penalty=cov_penalty,
                                 allow_partial=allow_partial,
                                 cov_threshold=cov_threshold,
                                 eos_threshold=eos_threshold,
                                 ctc_beam_size=int(beam_size * 1.5))
    beam_tracker = BeamTracker(beam_param, ctc_prob=ctc_prob)

    lm_state = None
    # clear states
    att_net.clear()
    # step by step
    stop = False
    while not stop:
        # beam
        pre_tok, point = beam_tracker[-1]
        # step forward
        dec_hid = adjust_hidden(point, dec_hid)
        att_ali = None if att_ali is None else att_ali[point]
        dec_out, att_ctx, dec_hid, att_ali, proj = decoder.step(
            att_net,
            pre_tok,
            enc_out,
            att_ctx[point],
            dec_hid=dec_hid,
            att_ali=att_ali,
            proj=proj[point])
        # compute prob: beam x V, negative (log-softmax scores)
        am_prob = tf.log_softmax(dec_out / temperature, dim=-1)
        if lm and beam_param.lm_weight > 0:
            # beam x V
            lm_prob, lm_state = lm_score_impl(lm, point, pre_tok, lm_state)
        else:
            lm_prob = 0
        # one beam search step
        stop = beam_tracker.step(am_prob, lm_prob, att_ali=att_ali)
    # return nbest
    return beam_tracker.nbest_hypos(nbest)
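# A quick illustration (hypothetical sizes, torch only) of the encoder expansion done
# at the top of beam_search: repeat_interleave along dim 0 turns N x T x F into
# N*beam x T x F so every beam hypothesis attends over its own copy of the encoding.
import torch as th

enc_out = th.randn(1, 5, 8)                # N=1 utterance, T=5, F=8
beam_size = 4
expanded = th.repeat_interleave(enc_out, beam_size, 0)
assert expanded.shape == (4, 5, 8)
assert th.equal(expanded[0], expanded[3])  # all beams see the same encoding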
def forward(self, x, globel_step): """ Forward pass :param x: (B, 3, H, W) :param globel_step: global step (training) :return: fg_likelihood: (B, 3, H, W) y_nobg: (B, 3, H, W), foreground reconstruction alpha_map: (B, 1, H, W) kl: (B,) total foreground kl boundary_loss: (B,) log: a dictionary containing anything we need for visualization """ B = x.size(0) # if globel_step: self.anneal(globel_step) # Everything is (B, G*G, D), where D varies z_pres, z_depth, z_scale, z_shift, z_where, \ z_pres_logits, z_depth_post, z_scale_post, z_shift_post = self.img_encoder(x, self.tau) # (B, 3, H, W) -> (B*G*G, 3, H, W). Note we must use repeat_interleave instead of repeat x_repeat = torch.repeat_interleave(x, arch.G**2, dim=0) # (B*G*G, 3, H, W), where G is the grid size # Extract glimpse x_att = spatial_transform( x_repeat, z_where.view(B * arch.G**2, 4), (B * arch.G**2, 3, arch.glimpse_size, arch.glimpse_size), inverse=False) # (B*G*G, D) z_what, z_what_post = self.z_what_net(x_att) # Decode z_what into small reconstructed glimpses # All (B*G*G, 3, H, W) o_att, alpha_att = self.glimpse_dec(z_what) # z_pres: (B, G*G, 1) -> (B*G*G, 1, 1, 1) alpha_att_hat = alpha_att * z_pres.view(-1, 1, 1, 1) # (B*G*G, 3, H, W) y_att = alpha_att_hat * o_att # Compute pixel-wise object weights # (B*G*G, 1, H, W). These are glimpse size importance_map = alpha_att_hat * 100.0 * torch.sigmoid( -z_depth.view(B * arch.G**2, 1, 1, 1)) # (B*G*G, 1, H, W). These are of full resolution importance_map_full_res = spatial_transform( importance_map, z_where.view(B * arch.G**2, 4), (B * arch.G**2, 1, *arch.img_shape), inverse=True) # (B*G*G, 1, H, W) -> (B, G*G, 1, H, W) importance_map_full_res = importance_map_full_res.view( B, arch.G**2, 1, *arch.img_shape) # Normalize (B, >G*G<, 1, H, W) importance_map_full_res_norm = torch.softmax(importance_map_full_res, dim=1) # To full resolution # (B*G*G, 3, H, W) -> (B, G*G, 3, H, W) y_each_cell = spatial_transform(y_att, z_where.view(B * arch.G**2, 4), (B * arch.G**2, 3, *arch.img_shape), inverse=True).view( B, arch.G**2, 3, *arch.img_shape) # Weighted sum, (B, 3, H, W) y_nobg = (y_each_cell * importance_map_full_res_norm).sum(dim=1) # To full resolution # (B*G*G, 1, H, W) -> (B, G*G, 1, H, W) alpha_map = spatial_transform(alpha_att_hat, z_where.view(B * arch.G**2, 4), (B * arch.G**2, 1, *arch.img_shape), inverse=True).view( B, arch.G**2, 1, *arch.img_shape) # Weighted sum, (B, 1, H, W) alpha_map = (alpha_map * importance_map_full_res_norm).sum(dim=1) # Everything is computed. 
    # Now let's compute loss

    # Compute KL divergences
    # (B, G*G, 1)
    kl_z_pres = kl_divergence_bern_bern(z_pres_logits, self.prior_z_pres_prob)
    # (B, G*G, 1)
    kl_z_depth = kl_divergence(z_depth_post, self.z_depth_prior)
    # (B, G*G, 2)
    kl_z_scale = kl_divergence(z_scale_post, self.z_scale_prior)
    kl_z_shift = kl_divergence(z_shift_post, self.z_shift_prior)

    # Reshape z_what and z_what_post
    # (B*G*G, D) -> (B, G*G, D)
    z_what = z_what.view(B, arch.G**2, arch.z_what_dim)
    z_what_post = Normal(*[
        x.view(B, arch.G**2, arch.z_what_dim)
        for x in [z_what_post.mean, z_what_post.stddev]
    ])
    # (B, G*G, D)
    kl_z_what = kl_divergence(z_what_post, self.z_what_prior)

    # dimensionality check
    assert ((kl_z_pres.size() == (B, arch.G**2, 1))
            and (kl_z_depth.size() == (B, arch.G**2, 1))
            and (kl_z_scale.size() == (B, arch.G**2, 2))
            and (kl_z_shift.size() == (B, arch.G**2, 2))
            and (kl_z_what.size() == (B, arch.G**2, arch.z_what_dim)))

    # Reduce (B, G*G, D) -> (B,)
    kl_z_pres, kl_z_depth, kl_z_scale, kl_z_shift, kl_z_what = [
        x.flatten(start_dim=1).sum(1)
        for x in [kl_z_pres, kl_z_depth, kl_z_scale, kl_z_shift, kl_z_what]
    ]
    # (B,)
    kl_z_where = kl_z_scale + kl_z_shift

    # Compute boundary loss
    # (1, 1, K, K)
    boundary_kernel = self.boundary_kernel[None, None].to(x.device)
    # (1, 1, K, K) * (B*G*G, 1, 1) -> (B*G*G, 1, K, K)
    boundary_kernel = boundary_kernel * z_pres.view(B * arch.G**2, 1, 1, 1)
    # (B, G*G, 1, H, W), to full resolution
    boundary_map = spatial_transform(boundary_kernel,
                                     z_where.view(B * arch.G**2, 4),
                                     (B * arch.G**2, 1, *arch.img_shape),
                                     inverse=True).view(
                                         B, arch.G**2, 1, *arch.img_shape)
    # (B, 1, H, W)
    boundary_map = boundary_map.sum(dim=1)
    # TODO: some magic number. For reproducibility I will keep it
    boundary_map = boundary_map * 1000
    # (B, 1, H, W) * (B, 1, H, W)
    overlap = boundary_map * alpha_map
    # TODO: another magic number. For reproducibility I will keep it
    p_boundary = Normal(0, 0.7)
    # (B, 1, H, W)
    boundary_loss = p_boundary.log_prob(overlap)
    # (B,)
    boundary_loss = boundary_loss.flatten(start_dim=1).sum(1)
    # NOTE: we want to minimize this
    boundary_loss = -boundary_loss

    # Compute foreground likelihood
    fg_dist = Normal(y_nobg, self.fg_sigma)
    fg_likelihood = fg_dist.log_prob(x)

    kl = kl_z_what + kl_z_where + kl_z_pres + kl_z_depth

    if not arch.boundary_loss or globel_step > arch.bl_off_step:
        boundary_loss = boundary_loss * 0.0

    # For visualization
    # Dimensionality check
    assert ((z_pres.size() == (B, arch.G**2, 1))
            and (z_depth.size() == (B, arch.G**2, 1))
            and (z_scale.size() == (B, arch.G**2, 2))
            and (z_shift.size() == (B, arch.G**2, 2))
            and (z_where.size() == (B, arch.G**2, 4))
            and (z_what.size() == (B, arch.G**2, arch.z_what_dim)))

    log = {
        'fg': y_nobg,
        'z_what': z_what,
        'z_where': z_where,
        'z_pres': z_pres,
        'z_scale': z_scale,
        'z_shift': z_shift,
        'z_depth': z_depth,
        'z_pres_prob': torch.sigmoid(z_pres_logits),
        'prior_z_pres_prob': self.prior_z_pres_prob.unsqueeze(0),
        'o_att': o_att,
        'alpha_att_hat': alpha_att_hat,
        'alpha_att': alpha_att,
        'alpha_map': alpha_map,
        'boundary_loss': boundary_loss,
        'boundary_map': boundary_map,
        'importance_map_full_res_norm': importance_map_full_res_norm,
        'kl_z_what': kl_z_what,
        'kl_z_pres': kl_z_pres,
        'kl_z_scale': kl_z_scale,
        'kl_z_shift': kl_z_shift,
        'kl_z_depth': kl_z_depth,
        'kl_z_where': kl_z_where,
    }
    return fg_likelihood, y_nobg, alpha_map, kl, boundary_loss, log
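# Why the comment in forward above insists on repeat_interleave rather than repeat:
# when a batch is expanded to B*G*G glimpse copies, repeat_interleave keeps copies of
# the same sample contiguous, matching z_where.view(B * G*G, 4). A tiny demonstration
# with hypothetical B=2 and G*G=3:
import torch

x = torch.tensor([[1.], [2.]])                        # B=2 samples
a = torch.repeat_interleave(x, 3, dim=0).squeeze(1)   # sample-major ordering
b = x.repeat(3, 1).squeeze(1)                         # interleaved ordering
assert a.tolist() == [1., 1., 1., 2., 2., 2.]
assert b.tolist() == [1., 2., 1., 2., 1., 2.]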
def radius_graph_pbc(data, radius, max_num_neighbors_threshold, device):
    batch_size = len(data.natoms)

    # position of the atoms
    atom_pos = data.pos

    # Before computing the pairwise distances between atoms, first create a list of atom indices to compare for the entire batch
    num_atoms_per_image = data.natoms
    num_atoms_per_image_sqr = (num_atoms_per_image ** 2).long()

    # index offset between images
    index_offset = (
        torch.cumsum(num_atoms_per_image, dim=0) - num_atoms_per_image
    )
    index_offset_expand = torch.repeat_interleave(
        index_offset, num_atoms_per_image_sqr
    )
    num_atoms_per_image_expand = torch.repeat_interleave(
        num_atoms_per_image, num_atoms_per_image_sqr
    )

    # Compute a tensor containing sequences of numbers that range from 0 to num_atoms_per_image_sqr for each image
    # that is used to compute indices for the pairs of atoms. This is a very convoluted way to implement
    # the following (but 10x faster since it removes the for loop)
    # for batch_idx in range(batch_size):
    #     batch_count = torch.cat([batch_count, torch.arange(num_atoms_per_image_sqr[batch_idx], device=device)], dim=0)
    num_atom_pairs = torch.sum(num_atoms_per_image_sqr)
    index_sqr_offset = (
        torch.cumsum(num_atoms_per_image_sqr, dim=0) - num_atoms_per_image_sqr
    )
    index_sqr_offset = torch.repeat_interleave(
        index_sqr_offset, num_atoms_per_image_sqr
    )
    atom_count_sqr = (
        torch.arange(num_atom_pairs, device=device) - index_sqr_offset
    )

    # Compute the indices for the pairs of atoms (using division and mod)
    # If the systems get too large this approach could run into numerical precision issues
    index1 = (
        atom_count_sqr // num_atoms_per_image_expand
    ).long() + index_offset_expand
    index2 = (
        atom_count_sqr % num_atoms_per_image_expand
    ).long() + index_offset_expand

    # Get the positions for each atom
    pos1 = torch.index_select(atom_pos, 0, index1)
    pos2 = torch.index_select(atom_pos, 0, index2)

    # Tensor of unit cells. Assumes 9 cells in -1, 0, 1 offsets in the x and y dimensions
    unit_cell = torch.tensor(
        [
            [-1, -1, 0],
            [-1, 0, 0],
            [-1, 1, 0],
            [0, -1, 0],
            [0, 0, 0],
            [0, 1, 0],
            [1, -1, 0],
            [1, 0, 0],
            [1, 1, 0],
        ],
        device=device,
    ).float()
    num_cells = len(unit_cell)
    unit_cell_per_atom = unit_cell.view(1, num_cells, 3).repeat(
        len(index2), 1, 1
    )
    unit_cell = torch.transpose(unit_cell, 0, 1)
    unit_cell_batch = unit_cell.view(1, 3, num_cells).expand(
        batch_size, -1, -1
    )

    # Compute the x, y, z positional offsets for each cell in each image
    data_cell = torch.transpose(data.cell, 1, 2)
    pbc_offsets = torch.bmm(data_cell, unit_cell_batch)
    pbc_offsets_per_atom = torch.repeat_interleave(
        pbc_offsets, num_atoms_per_image_sqr, dim=0
    )

    # Expand the positions and indices for the 9 cells
    pos1 = pos1.view(-1, 3, 1).expand(-1, -1, num_cells)
    pos2 = pos2.view(-1, 3, 1).expand(-1, -1, num_cells)
    index1 = index1.view(-1, 1).repeat(1, num_cells).view(-1)
    index2 = index2.view(-1, 1).repeat(1, num_cells).view(-1)
    # Add the PBC offsets for the second atom
    pos2 = pos2 + pbc_offsets_per_atom

    # Compute the squared distance between atoms
    atom_distance_sqr = torch.sum((pos1 - pos2) ** 2, dim=1)
    atom_distance_sqr = atom_distance_sqr.view(-1)

    # Remove pairs that are too far apart
    mask_within_radius = torch.le(atom_distance_sqr, radius * radius)
    # Remove pairs with the same atoms (distance = 0.0)
    mask_not_same = torch.gt(atom_distance_sqr, 0.0001)
    mask = torch.logical_and(mask_within_radius, mask_not_same)
    index1 = torch.masked_select(index1, mask)
    index2 = torch.masked_select(index2, mask)
    unit_cell = torch.masked_select(
        unit_cell_per_atom.view(-1, 3), mask.view(-1, 1).expand(-1, 3)
    )
    unit_cell = unit_cell.view(-1, 3)

    num_atoms = len(data.pos)
    num_neighbors = torch.zeros(num_atoms, device=device)
    num_neighbors.index_add_(0, index1, torch.ones(len(index1), device=device))
    num_neighbors = num_neighbors.long()
    max_num_neighbors = torch.max(num_neighbors).long()

    # Compute neighbors per image
    _max_neighbors = copy.deepcopy(num_neighbors)
    _max_neighbors[
        _max_neighbors > max_num_neighbors_threshold
    ] = max_num_neighbors_threshold
    _num_neighbors = torch.zeros(num_atoms + 1, device=device).long()
    _natoms = torch.zeros(data.natoms.shape[0] + 1, device=device).long()
    _num_neighbors[1:] = torch.cumsum(_max_neighbors, dim=0)
    _natoms[1:] = torch.cumsum(data.natoms, dim=0)
    num_neighbors_image = (
        _num_neighbors[_natoms[1:]] - _num_neighbors[_natoms[:-1]]
    )

    # If max_num_neighbors is below the threshold, return early
    if (
        max_num_neighbors <= max_num_neighbors_threshold
        or max_num_neighbors_threshold <= 0
    ):
        return torch.stack((index2, index1)), unit_cell, num_neighbors_image

    atom_distance_sqr = torch.masked_select(atom_distance_sqr, mask)

    # Create a tensor of size [num_atoms, max_num_neighbors] to sort the distances of the neighbors.
    # Fill with values greater than radius*radius so we can easily remove unused distances later.
distance_sort = torch.zeros( num_atoms * max_num_neighbors, device=device ).fill_(radius * radius + 1.0) # Create an index map to map distances from atom_distance_sqr to distance_sort index_neighbor_offset = torch.cumsum(num_neighbors, dim=0) - num_neighbors index_neighbor_offset_expand = torch.repeat_interleave( index_neighbor_offset, num_neighbors ) index_sort_map = ( index1 * max_num_neighbors + torch.arange(len(index1), device=device) - index_neighbor_offset_expand ) distance_sort.index_copy_(0, index_sort_map, atom_distance_sqr) distance_sort = distance_sort.view(num_atoms, max_num_neighbors) # Sort neighboring atoms based on distance distance_sort, index_sort = torch.sort(distance_sort, dim=1) # Select the max_num_neighbors_threshold neighbors that are closest distance_sort = distance_sort[:, :max_num_neighbors_threshold] index_sort = index_sort[:, :max_num_neighbors_threshold] # Offset index_sort so that it indexes into index1 index_sort = index_sort + index_neighbor_offset.view(-1, 1).expand( -1, max_num_neighbors_threshold ) # Remove "unused pairs" with distances greater than the radius mask_within_radius = torch.le(distance_sort, radius * radius) index_sort = torch.masked_select(index_sort, mask_within_radius) # At this point index_sort contains the index into index1 of the closest max_num_neighbors_threshold neighbors per atom # Create a mask to remove all pairs not in index_sort mask_num_neighbors = torch.zeros(len(index1), device=device).bool() mask_num_neighbors.index_fill_(0, index_sort, True) # Finally mask out the atoms to ensure each atom has at most max_num_neighbors_threshold neighbors index1 = torch.masked_select(index1, mask_num_neighbors) index2 = torch.masked_select(index2, mask_num_neighbors) unit_cell = torch.masked_select( unit_cell.view(-1, 3), mask_num_neighbors.view(-1, 1).expand(-1, 3) ) unit_cell = unit_cell.view(-1, 3) edge_index = torch.stack((index2, index1)) return edge_index, unit_cell, num_neighbors_image
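# A small numeric check (standalone, torch only) of the division/mod pairing used in
# radius_graph_pbc: for an image with n atoms, indices 0..n*n-1 decompose into all
# ordered atom pairs via (k // n, k % n).
import torch

n = 3                          # atoms in one (hypothetical) image
k = torch.arange(n * n)
index1 = k // n                # 0,0,0,1,1,1,2,2,2
index2 = k % n                 # 0,1,2,0,1,2,0,1,2
pairs = torch.stack((index1, index2), dim=1)
assert pairs.shape == (9, 2)   # every ordered pair appears exactly once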
def expand_mask(mask, block_rows, block_cols): mask = torch.repeat_interleave(mask, block_rows, dim=0) mask = torch.repeat_interleave(mask, block_cols, dim=1) return mask
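# Usage sketch for expand_mask: each entry of a block-level mask is tiled into a
# block_rows x block_cols block, e.g. to map a block-sparsity pattern onto a dense
# weight matrix. The lines below inline the equivalent of expand_mask(mask, 2, 3);
# the values are illustrative only.
import torch

mask = torch.tensor([[1, 0],
                     [0, 1]])
dense = torch.repeat_interleave(mask, 2, dim=0)
dense = torch.repeat_interleave(dense, 3, dim=1)
assert dense.shape == (4, 6)
assert dense[:2, :3].bool().all() and not dense[:2, 3:].bool().any()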
def cut_lonely_connections(self): govs = [] gov_in = None gov_out = None do_avg_pool = 0 for layer, (is_conv, next_is_conv) in lookahead_type(self.model.modules()): is_conv = isinstance(layer, nn.Conv2d) is_fc = isinstance(layer, nn.Linear) is_avgpool = isinstance(layer, nn.AdaptiveAvgPool2d) if is_avgpool: do_avg_pool = int(np.prod(layer.output_size)) elif is_conv or is_fc: out_dim, in_dim = layer.weight.shape[:2] if gov_in is None: gov_in = nn.Parameter(torch.ones(in_dim).to(self.device), requires_grad=True) govs.append(gov_in) else: gov_in = gov_out gov_out = nn.Parameter(torch.ones(out_dim).to(self.device), requires_grad=True) govs.append(gov_out) # substitute activation function if is_fc: if do_avg_pool > 0: layer.do_avg_pool = do_avg_pool do_avg_pool = 0 # layer.forward = types.MethodType(group_snip_forward_linear, layer) # if is_conv: # layer.forward = types.MethodType(group_snip_conv2d_forward, layer) indices = {} idx = 0 for id, layer in self.model.mask.items(): if 'conv' in id: # input input = [] for i in range(layer.shape[1]): if len(torch.nonzero(layer[:, i, :, :])) == 0: input.append(0) else: input.append(1) # output output = [] for i in range(layer.shape[0]): if len(torch.nonzero(layer[i, :, :, :])) == 0: output.append(0) else: output.append(1) else: # input input = [] for i in range(layer.shape[1]): if len(torch.nonzero(layer[:, i])) == 0: input.append(0) else: input.append(1) # output output = [] for i in range(layer.shape[0]): if len(torch.nonzero(layer[i, :])) == 0: output.append(0) else: output.append(1) # indices indices[(idx, id)] = torch.tensor(input) idx += 1 indices[(idx, id)] = torch.tensor(output) idx += 1 old_key = () old_length = 0 input = True for key, value in indices.items(): length = len(value) # TODO: Handle early in training by resetting the optimizer if input == True: # breakpoint() if length == old_length: indices[old_key] = value.__or__(indices[old_key]) indices[key] = value.__or__(indices[old_key]) elif old_length != 0 and length % old_length == 0 and ( 'fc' in key[1] or 'classifier' in key[1]): ratio = length // old_length new_indices = torch.repeat_interleave( indices[old_key], ratio) for i in range(old_length): if sum(new_indices[i * ratio:ratio * (i + 1)].__or__( value[i * ratio:ratio * (i + 1)])) == ratio: indices[old_key][i] = 1 else: indices[old_key][i] = 0 indices[key] = torch.repeat_interleave( indices[old_key], ratio) old_length = length old_key = key input = not input self.structured_prune(indices) return indices
def forward(self, x, y=None, x_raw=None):
    """
    Add loss for identifying signal in a frame?
    """
    batch_size = x.shape[0]
    n_timesteps = x.shape[1]
    prev_h = torch.zeros(batch_size, self.rnn_dim).to(self.device)
    prev_c = torch.zeros(batch_size, self.rnn_dim).to(self.device)
    if self.use_frame_loss:
        frame_logits = []
    for t in range(n_timesteps):
        x_t = x[:, t, :, :].unsqueeze(1)
        conv_t = self.conv_block(x_t).view(batch_size, -1)
        # conv_t = self.dropout(conv_t)
        proj_t = self.linear_projection(conv_t)
        # proj_t = self.dropout(proj_t)  # included in RNN block
        prev_h, prev_c = self.rnn_block(proj_t, prev_h, prev_c)
        if self.use_frame_loss:
            frame_logit = self.frame_linear(prev_h)
            frame_logits.append(frame_logit)
    logits = self.clf(prev_h)
    outputs = (logits, )
    if y is not None:
        loss_fct = CrossEntropyLoss(weight=self.loss_weight)
        loss = loss_fct(logits, y)
        if self.use_frame_loss:
            # (B,) -> (B, T): every frame inherits the clip label
            y_frame = torch.repeat_interleave(y.view(-1, 1), n_timesteps, dim=1)
            x_frame = x_raw.view(batch_size, n_timesteps, -1)
            x_max = torch.max(x_frame, dim=-1)[0]
            # Relabel near-silent frames of positive clips as negative
            y_frame[(y_frame == 1) & (x_max < 10)] = 0
            # Stack along time so rows align with y_frame after flattening:
            # (B, T, 2) -> (B*T, 2) matches y_frame.view(-1)
            frame_logits = torch.stack(frame_logits, dim=1).reshape(-1, 2)
            y_frame = y_frame.view(-1)
            frame_loss = loss_fct(frame_logits, y_frame)
            loss += self.frame_loss_weight * frame_loss
        outputs = (loss, ) + outputs
    return outputs
def test_graph_size_norm(): batch = torch.repeat_interleave(torch.full((10, ), 10, dtype=torch.long)) norm = GraphSizeNorm() out = norm(torch.randn(100, 16), batch) assert out.size() == (100, 16)
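# Note on the one-argument form used in test_graph_size_norm above: when
# torch.repeat_interleave receives only a 1-D integer tensor, it treats it as
# per-index repeat counts, so repeats of [10]*10 yield the batch assignment vector
# [0]*10 + [1]*10 + ... + [9]*10 of length 100.
import torch

batch = torch.repeat_interleave(torch.full((10, ), 10, dtype=torch.long))
assert batch.shape == (100, )
assert batch[:10].eq(0).all() and batch[-10:].eq(9).all()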
def prediction_from_trained_model_beam_Search(self, i, ys, score_1,
                                              AM_local_scores, beam, hyps,
                                              gamma, batch_size):
    """Vectorized beam search, i.e., beam search performed in parallel:

    1. Each prefix is treated as an individual sequence when given to the
       model, and the predictions for each prefix are obtained;
    2. Each prefix gets a beam of new candidate labels, so each prefix is
       repeated beam times and the new label is concatenated, as is its
       likelihood score;
    3. The resulting hyps * beam prefixes are pruned back down to hyps
       prefixes;
    4. An EOS threshold is applied if any of the predicted labels in the
       beam is EOS;
    5. If a hypothesis has already ended, duplication is avoided to keep
       the batches diverse.
    """
    if i == 0:
        # For the first step, just repeat the hyps and append the beam to the
        # hypothesis.
        local_best_scores, local_best_ids = torch.topk(AM_local_scores,
                                                       hyps,
                                                       dim=1,
                                                       largest=True,
                                                       sorted=True)
        present_ids = (local_best_ids[::hyps]).contiguous().view(-1, 1)
        present_scores = (local_best_scores[::hyps]).contiguous().view(-1, 1)
        ys = torch.cat((ys, present_ids), dim=1)
        score_1 = torch.cat((score_1, present_scores), dim=1)
        # Do not allow EOS as the first token: the first label cannot be EOS,
        # so penalize it heavily.
        mask = torch.eq(present_ids, self.eos_id)
        score_1 = score_1 - mask * 1000
        # Not corrected here: should be expanded and selected with a selection
        # index, but the model regenerates them with labels at steps i >= 1.
    else:
        local_best_scores, local_best_ids = torch.topk(AM_local_scores,
                                                       beam,
                                                       dim=1,
                                                       largest=True,
                                                       sorted=True)
        # ---------EOS threshold--------------------------------
        # If EOS occurred with a score less than the threshold, filter it out:
        # take the max of non-EOS entries (masking EOS to -1000) and the max of
        # EOS entries (masking non-EOS to -1000), then keep EOS only where
        # EOS > gamma * NON_EOS, applying the filter via an outer product.
        not_eos_mask = (local_best_ids == self.eos_id)
        NON_EOS_mask, NON_EOS_mask_ids = torch.max(
            local_best_scores * ~not_eos_mask + not_eos_mask * -1000, dim=1)
        EOS_mask, EOS_mask_ids = torch.max(
            local_best_scores * not_eos_mask + ~not_eos_mask * -1000, dim=1)
        EOS_out = EOS_mask > gamma * NON_EOS_mask
        EOS_SCORE_MASK = (not_eos_mask.transpose(0, 1) * EOS_out).transpose(0, 1)
        local_best_scores = local_best_scores - (not_eos_mask * 1 *
                                                 ~EOS_SCORE_MASK * 1000.0)
        # Repeat the prefixes beam times.
        ys_1 = torch.repeat_interleave(ys, beam, 0)
        score_2 = torch.repeat_interleave(score_1, beam, 0)
        present_ids = (local_best_ids).contiguous().view(-1, 1)
        present_scores = (local_best_scores).contiguous().view(-1, 1)
        # Concatenate labels and scores to the prefixes.
        ys = torch.cat((ys_1, present_ids), dim=1)
        score_1 = torch.cat((score_2, present_scores), dim=1)
        # Fold to hyps * beam per utterance and prune out the worst
        # hypotheses, keeping hyps prefixes.
        pres_acuml_score = torch.cumsum(score_1, dim=1)[:, -1]
        al1, al2 = torch.topk(pres_acuml_score.view(batch_size, hyps * beam, 1),
                              hyps,
                              dim=1,
                              largest=True,
                              sorted=True)
        selecting_index = torch.cat([al2] * ys.size(1), dim=2)
        # Regroup according to utterances after selecting the top K; this is
        # needed for gathering as per topk.
        ys = ys.view(batch_size, hyps * beam, -1)
        score_1 = score_1.view(batch_size, hyps * beam, -1)
        # Prune the output using gather: select the top labels and scores.
        ys = torch.gather(ys, 1, selecting_index)
        score_1 = torch.gather(score_1, 1, selecting_index)
        # Make it ready for the next iteration: convert the selected
        # hypotheses per utterance back to separate hypotheses so they can be
        # processed in parallel.
        ys = ys.view(batch_size * hyps, -1)
        score_1 = score_1.view(batch_size * hyps, -1)
        # If EOS survived the past iteration, it is an accepted EOS, so no new
        # labels may follow it; the score must be set to zero, otherwise we
        # get bad hypotheses.
        if i > 1:
            selected_EOS = torch.eq(ys[:, -2], self.eos_id)
            score_1[:, -1] = score_1[:, -1] * (~selected_EOS)
            ys[:, -1][selected_EOS] = self.eos_id
    return ys, score_1
def _repeat_one_sequence(self, x, d): """Repeat each frame according to duration for torch 1.1+.""" return torch.repeat_interleave(x, d, dim=0)
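# Usage sketch for _repeat_one_sequence, the duration regulator pattern used in
# duration-based TTS models: each frame x[t] is repeated d[t] times along dim 0.
# The values below are illustrative only.
import torch

x = torch.tensor([[0.1], [0.2], [0.3]])   # 3 input frames, feature dim 1
d = torch.tensor([2, 0, 3])               # per-frame durations
y = torch.repeat_interleave(x, d, dim=0)
expected = torch.tensor([0.1, 0.1, 0.3, 0.3, 0.3]).unsqueeze(1)
assert torch.equal(y, expected)           # frame 1 is dropped (duration 0)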
def _apply_texture( self, videos: "torch.Tensor", patch: "torch.Tensor", foreground: Optional["torch.Tensor"], patch_points: Optional[np.ndarray], ) -> "torch.Tensor": """ Apply texture over background and overlay foreground. :param videos: Video samples. :param patch: Patch to apply. :param foreground: Foreground mask. :param patch_points: Array of shape (nb_frames, 4, 2) containing four pairs of integers (height, width) corresponding to the coordinates of the four corners top-left, top-right, bottom-right, bottom-left of the transformed image in the coordinate-system of the original image. :return: Patched videos. """ import torch # lgtm [py/repeated-import] import torchvision nb_samples = videos.shape[0] nb_frames = videos.shape[1] frame_height = videos.shape[2] frame_width = videos.shape[3] image_mask = self._get_patch_mask(nb_samples=nb_samples) image_mask = image_mask.float() patch = patch.float() padded_patch = torch.stack([patch] * nb_samples) if patch_points is None: pad_h_before = self.x_min pad_h_after = int(videos.shape[self.i_h + 1] - pad_h_before - image_mask.shape[self.i_h_patch + 1]) pad_w_before = self.y_min pad_w_after = int(videos.shape[self.i_w + 1] - pad_w_before - image_mask.shape[self.i_w_patch + 1]) image_mask = image_mask.permute(0, 3, 1, 2) image_mask = torchvision.transforms.functional.pad( img=image_mask, padding=[pad_w_before, pad_h_before, pad_w_after, pad_h_after], fill=0, padding_mode="constant", ) image_mask = image_mask.permute(0, 2, 3, 1) image_mask = torch.unsqueeze(image_mask, dim=1) image_mask = torch.repeat_interleave(image_mask, dim=1, repeats=nb_frames) image_mask = image_mask.float() padded_patch = padded_patch.permute(0, 3, 1, 2) padded_patch = torchvision.transforms.functional.pad( img=padded_patch, padding=[pad_w_before, pad_h_before, pad_w_after, pad_h_after], fill=0, padding_mode="constant", ) padded_patch = padded_patch.permute(0, 2, 3, 1) padded_patch = torch.unsqueeze(padded_patch, dim=1) padded_patch = torch.repeat_interleave(padded_patch, dim=1, repeats=nb_frames) padded_patch = padded_patch.float() else: startpoints = [[0, 0], [frame_width, 0], [frame_width, frame_height], [0, frame_height]] endpoints = np.zeros_like(patch_points) endpoints[:, :, 0] = patch_points[:, :, 1] endpoints[:, :, 1] = patch_points[:, :, 0] image_mask = image_mask.permute(0, 3, 1, 2) image_mask = torchvision.transforms.functional.resize( img=image_mask, size=[int(videos.shape[2]), int(videos.shape[3])], interpolation=torchvision.transforms.InterpolationMode. BILINEAR, ) image_mask_list = [] for i_frame in range(nb_frames): image_mask_i = torchvision.transforms.functional.perspective( img=image_mask, startpoints=startpoints, endpoints=endpoints[i_frame], interpolation=torchvision.transforms.InterpolationMode. BILINEAR, fill=0, ) image_mask_i = image_mask_i.permute(0, 2, 3, 1) image_mask_list.append(image_mask_i) image_mask = torch.stack(image_mask_list, dim=1) image_mask = image_mask.float() padded_patch = padded_patch.permute(0, 3, 1, 2) padded_patch = torchvision.transforms.functional.resize( img=padded_patch, size=[int(videos.shape[2]), int(videos.shape[3])], interpolation=torchvision.transforms.InterpolationMode. BILINEAR, ) padded_patch_list = [] for i_frame in range(nb_frames): padded_patch_i = torchvision.transforms.functional.perspective( img=padded_patch, startpoints=startpoints, endpoints=endpoints[i_frame], interpolation=torchvision.transforms.InterpolationMode. 
BILINEAR, fill=0, ) padded_patch_i = padded_patch_i.permute(0, 2, 3, 1) padded_patch_list.append(padded_patch_i) padded_patch = torch.stack(padded_patch_list, dim=1) padded_patch = padded_patch.float() inverted_mask = (torch.from_numpy( np.ones(shape=image_mask.shape, dtype=np.float32)).to( self.estimator.device) - image_mask) if foreground is not None: combined = (videos * inverted_mask + padded_patch * image_mask - padded_patch * ~foreground.bool() + videos * ~foreground.bool() * image_mask) else: combined = videos * inverted_mask + padded_patch * image_mask return combined
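# A compact sketch of the compositing rule at the end of _apply_texture, with the
# foreground handling omitted: out = video * (1 - mask) + patch * mask, a per-pixel
# convex blend. The (B, T, H, W, C) channels-last layout below is an assumption for
# illustration; all tensors are placeholders.
import torch

videos = torch.rand(2, 4, 8, 8, 3)
patch = torch.rand(2, 4, 8, 8, 3)
mask = (torch.rand(2, 4, 8, 8, 1) > 0.5).float()
combined = videos * (1.0 - mask) + patch * mask
assert combined.shape == videos.shape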
def update_first_layer_conv(net, layers, layer_index, data_, dtype, scd_args,
                            criterion, target, device):
    layer = layers[layer_index]
    data = data_
    batch_size = scd_args.batch_size
    for idx in np.random.permutation(
            net._modules[layer].weight.shape[0])[:scd_args.updated_nodes]:
        net._modules[layer].bias[idx].zero_()
        # Get the global bias for this batch
        train_loss, global_bias = init_conv(net, data, layer, criterion,
                                            target, dtype, idx, scd_args)
        weights = net._modules[layer].weight
        weight_size = weights.size()[1:]
        n_nodes = weight_size[0] * weight_size[1] * weight_size[2]
        updated_features = min(n_nodes, scd_args.updated_conv_features)
        cords_index = np.random.choice(n_nodes, updated_features, False)
        cords = []
        for i in range(weight_size[0]):
            for j in range(weight_size[1]):
                for k in range(weight_size[2]):
                    cords.append([i, j, k])
        cords = torch.tensor(cords)[cords_index]
        best_w = weights[idx:idx + 1].clone()
        w_incs1 = torch.tensor([-1, 1]).type_as(best_w) * scd_args.w_inc1
        if 'si' in layer:
            inc = []
            for i in range(w_incs1.shape[0]):
                w_inc = w_incs1[i]
                w_ = torch.repeat_interleave(best_w, updated_features, dim=0)
                # use a distinct index so the outer loop variable is not shadowed
                for k in range(updated_features):
                    w_[k, cords[k][0], cords[k][1], cords[k][2]] += w_inc
                inc.append(w_)
            w_ = torch.cat(inc, dim=0)
            del inc
            if scd_args.normalize:
                w_ /= w_.view((updated_features * w_incs1.shape[0],
                               -1)).norm(dim=1).view((-1, 1, 1, 1))
            # w_ = torch.cat([w_, -1.0 * w_], dim=1)
        else:
            w_incs2 = -1
            w_ = torch.repeat_interleave(best_w, updated_features, dim=0)
            for k in range(updated_features):
                w_[k, cords[k][0], cords[k][1], cords[k][2]] *= w_incs2

        ic = updated_features * w_incs1.shape[0] if 'si' in layer else \
            updated_features
        temp_module = torch.nn.Conv2d(in_channels=data.size(1),
                                      out_channels=ic,
                                      kernel_size=list(weights.size()[2:]),
                                      padding=net._modules[layer].padding).to(
                                          dtype=dtype, device=device)
        temp_module.weight = nn.Parameter(w_)
        temp_module.bias.zero_()
        temp_module.requires_grad_(False)
        # projection's shape: nrows (1500) * ic (96) * H * W
        projection = temp_module(data)
        del temp_module
        new_projection, bias = update_conv_weight(projection, global_bias,
                                                  scd_args)
        del projection
        n_batch = data_.size(0) // batch_size
        yps = []
        for i in range(n_batch):
            new_projection_batch = new_projection[batch_size * i:
                                                  batch_size * (i + 1)]
            n_r = new_projection_batch.size(0)  # 1500
            n_w = new_projection_batch.size(1)  # 16
            n_b = new_projection_batch.size(2)  # 20
            height = new_projection_batch.size(3)  # 32
            width = new_projection_batch.size(4)
            new_projection_batch = new_projection_batch.reshape(
                (n_r, n_w * n_b, height, width))
            # new_projection 1500*16*20, bias 16*20
            # original projection fed into the next layer
            projection_batch = net(data[batch_size * i:batch_size * (i + 1)],
                                   input_=layer, layer=layer + '_projection')
            # replace projection[:, idx] after flattening the variations
            projection_batch = torch.repeat_interleave(
                projection_batch.unsqueeze_(dim=1), n_w * n_b, dim=1)
            projection_batch[:, :, idx] = new_projection_batch
            del new_projection_batch
            projection_batch = projection_batch.transpose_(0, 1).reshape(
                (-1, projection_batch.size(2), height, width))
            yp = net(projection_batch,
                     input_=layer + '_ap').reshape((n_w * n_b, n_r, -1))
            del projection_batch
            yp = yp.transpose_(0, 1).reshape((n_r, n_w, n_b))
            yps.append(yp)
        yps = torch.cat(yps, dim=0)
        loss_group = criterion(yps,
                               target[:n_batch * batch_size].unsqueeze(dim=1))
        loss_group = loss_group.cpu().numpy()
        new_loss = loss_group.min()
        if new_loss <= train_loss:
            row, col = np.unravel_index(loss_group.argmin(), loss_group.shape)
            net._modules[layer].weight[idx] = nn.Parameter(
                w_[row], requires_grad=False)
            net._modules[layer].bias[idx].fill_(bias[row, col])
    del w_, loss_group, bias

    return min(new_loss, train_loss)