def getDataInformation(dataset, num_filters=6):
    """Collect per-filter summary statistics for every light curve in ``dataset``.

    Each dataset item is read as ``item[0]`` (the light-curve tensor) and
    ``item[1]`` (the target).  The tensor is indexed as
    ``data[filter, channel, time]`` where, as used below, channel 0 is time,
    channel 1 is flux, channel 2 is the flux error and channel 3 is the
    validity mask (non-zero for real samples, zero for padding).

    Parameters
    ----------
    dataset : sized iterable of ``(tensor, target)`` pairs
    num_filters : int, optional
        Number of filters (first tensor dimension).  Defaults to 6.

    Returns
    -------
    list of numpy.ndarray
        ``[samplesByFilter, timeLength, errorMeanByFilter, fluxMeanByFilter,
        fluxStdByFilter, deltaTimeMean, targets]``, each of shape
        ``(n_kept, num_filters)``.  Light curves for which any filter has too
        few samples to compute statistics are dropped from every array.
    """
    totalSize = len(dataset)
    print("total light curves: ", totalSize)

    shape = (totalSize, num_filters)
    samplesByFilter = np.zeros(shape=shape)    # number of valid samples
    timeLength = np.zeros(shape=shape)         # last time - first time
    errorMeanByFilter = np.zeros(shape=shape)  # median flux error
    fluxMeanByFilter = np.zeros(shape=shape)   # median flux
    fluxStdByFilter = np.zeros(shape=shape)    # flux inter-quartile range
    deltaTimeMean = np.zeros(shape=shape)      # median sampling interval
    targets = np.zeros(shape=shape)            # target broadcast over filters

    arrays = [
        samplesByFilter, timeLength, errorMeanByFilter, fluxMeanByFilter,
        fluxStdByFilter, deltaTimeMean, targets
    ]
    # Statistics that cannot be computed for a short filter are set to NaN.
    nan_when_short = (timeLength, errorMeanByFilter, fluxMeanByFilter,
                      fluxStdByFilter, deltaTimeMean)

    for idx, data_ in enumerate(dataset):
        data = data_[0]
        samplesByFilter[idx, :] = torch.count_nonzero(
            data[:, 3, :], dim=1).cpu().numpy()
        targets[idx, :] = data_[1]

        for i in range(num_filters):
            # Index of the last valid (non-padded) sample of this filter.
            lastIndex = int(samplesByFilter[idx, i]) - 1

            # Fewer than two samples in the statistics window.  This also
            # covers an empty filter (lastIndex == -1): a negative slice end
            # would otherwise wrap around and compute statistics on padding.
            if lastIndex <= 1:
                for stat in nan_when_short:
                    stat[idx, i] = float("NaN")
                continue

            # NOTE(review): the window 0:lastIndex leaves out the last valid
            # sample, while timeLength does use it.  Kept as in the original
            # so existing statistics do not change; confirm whether intended.
            times = data[i, 0, 0:lastIndex]
            flux = data[i, 1, 0:lastIndex]
            errors = data[i, 2, 0:lastIndex]

            timeLength[idx, i] = (data[i, 0, lastIndex] - data[i, 0, 0]).item()
            errorMeanByFilter[idx, i] = torch.median(errors).item()
            fluxMeanByFilter[idx, i] = torch.median(flux).item()
            fluxStdByFilter[idx, i] = (
                torch.quantile(flux, 0.75, dim=0) -
                torch.quantile(flux, 0.25, dim=0)).item()
            deltaTimeMean[idx, i] = torch.median(times[1:] - times[:-1]).item()

    # Keep only the light curves for which every filter produced statistics.
    mask = ~np.isnan(deltaTimeMean).any(axis=1)
    for idx, array in enumerate(arrays):
        arrays[idx] = array[mask, :]
        print(arrays[idx].shape)

    return arrays
def main():
    """Count generated samples that a binary attribute classifier flags.

    Loads a generator checkpoint (optionally wrapped in DRS with its
    discriminator), loads a fine-tuned two-class attribute classifier from
    ``./convnet_celeba/<attr>.pth``, generates ``--num_samples`` images and
    appends ``[attr, #with attribute, #without attribute]`` to
    ``<work_dir>/<exp_name>/evaluate/step-<step>/count_attribute.csv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--work_dir", default="./exp_results", type=str, help="output dir")
    parser.add_argument("--exp_name", default="mimicry_pretrained-seed1", type=str, help="exp name")
    parser.add_argument("--model", default="sngan", type=str, help="network model")
    parser.add_argument("--loss_type", default="hinge", type=str, help="loss type")
    parser.add_argument("--classifier", default="vgg16", type=str, help="calssifier network model")
    parser.add_argument('--gpu', default='0', type=str, help='id(s) for CUDA_VISIBLE_DEVICES')
    parser.add_argument('--batch_size', default=100, type=int)
    parser.add_argument('--seed', default=1, type=int)
    parser.add_argument("--netG_ckpt_step", type=int)
    parser.add_argument("--netG_train_mode", action='store_true')
    parser.add_argument("--use_original_netD", action='store_true')
    parser.add_argument('--attr', default='Bald', type=str)
    parser.add_argument('--drs', action='store_true')
    parser.add_argument('--num_samples', default=50000, type=int)
    args = parser.parse_args()

    # Validate explicitly: ``assert`` disappears under ``python -O``.
    if not args.netG_ckpt_step:
        parser.error("--netG_ckpt_step is required")
    if args.batch_size <= 0:
        parser.error("--batch_size must be positive")

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_seed(args.seed)
    print(args)
    save_path = f'{args.work_dir}/{args.exp_name}'

    if torch.cuda.is_available():
        device = "cuda"
        cudnn.benchmark = True
    else:
        device = "cpu"

    # load model
    print(f'load model from {save_path} step: {args.netG_ckpt_step}')
    if args.drs:
        netG, _, netD_drs, _, _, _ = get_gan_model(dataset_name='celeba',
                                                   model=args.model,
                                                   loss_type=args.loss_type,
                                                   drs=True)
    else:
        netG, _, _, _ = get_gan_model(
            dataset_name='celeba',
            model=args.model,
            loss_type=args.loss_type,
        )

    netG.to(device)
    if not args.netG_train_mode:
        netG.eval()
    if args.drs:
        netD_drs.eval()
        netD_drs.to(device)

    ckpt_root = f'{args.work_dir}/{args.exp_name}/checkpoints'
    gan_ckpt = f'{ckpt_root}/netG/netG_{args.netG_ckpt_step}_steps.pth'
    if args.use_original_netD:
        netD_drs_ckpt = f'{ckpt_root}/netD/netD_{args.netG_ckpt_step}_steps.pth'
    else:
        netD_drs_ckpt = f'{ckpt_root}/netD_drs/netD_drs_{args.netG_ckpt_step}_steps.pth'

    print(gan_ckpt)
    netG.restore_checkpoint(ckpt_file=gan_ckpt)
    if args.drs:
        netD_drs.restore_checkpoint(ckpt_file=netD_drs_ckpt)
        netG = DRS(netG=netG, netD=netD_drs, device=device)

    # load classifier and replace its head with a 2-class layer
    print('Load classifier')
    if args.classifier == 'vgg16':
        model = models.vgg16(pretrained=True)
        in_features = model.classifier[6].in_features
        model.classifier[6] = nn.Linear(in_features, 2, bias=True)
    elif args.classifier in ('resnet18', 'inception'):
        # These architectures expose their final layer as ``fc`` and have no
        # ``classifier`` attribute.
        # NOTE(review): assumes the saved checkpoints were trained with the
        # same head replacement -- confirm against the training script.
        if args.classifier == 'resnet18':
            model = models.resnet18(pretrained=True)
        else:
            model = models.inception_v3(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, 2, bias=True)
    else:
        raise ValueError('model should be vgg16 or resnet18 or inception')

    classifier_path = './convnet_celeba'
    model.load_state_dict(
        torch.load(os.path.join(classifier_path, f'{args.attr}.pth')))
    model.to(device)
    # Inference mode: without this, dropout / batch-norm stay in training
    # behaviour and the predicted labels are not deterministic.
    model.eval()

    attr_num = 0
    not_attr_num = 0
    remaining = args.num_samples
    with torch.no_grad():
        while remaining > 0:
            # The last batch may be smaller so that exactly num_samples
            # images are evaluated.
            current = min(args.batch_size, remaining)
            img = netG.generate_images(current, device=device)
            answers = torch.argmax(model(img), dim=1)
            attr = torch.count_nonzero(answers).item()
            attr_num += attr
            not_attr_num += current - attr
            remaining -= current

    print(f'attr: {attr_num}')
    print(f'not attr: {not_attr_num}')

    output_dir = os.path.join(save_path, 'evaluate',
                              f'step-{args.netG_ckpt_step}')
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'count_attribute.csv')

    # Write the header only when the file is being created.
    write_header = not os.path.exists(output_file)
    with open(output_file, 'a', newline='') as f:
        wr = csv.writer(f)
        if write_header:
            wr.writerow(['', 'attr', 'not attr'])
        wr.writerow([args.attr, attr_num, not_attr_num])
def custom_loss(data, model, p):
    """
    Compute total loss: supervised + unsupervised consistency loss.

    The input batch is duplicated: the first half holds the original images,
    the second half the same images after a transformation.  The code slices
    ``[:n_labeled]``, i.e. it expects the labeled examples to come first in
    the batch.

    Parameters
    ----------
    data : tuple
        The output of the generator: ``(inputs, y, labeled)`` where
        ``inputs[0]`` is the image batch and ``inputs[1]`` the transformation
        parameters.
    model : callable
        Network applied to the stacked (original + transformed) batch.
    p : object
        Parameters; reads ``transform``, ``transform_output``, ``debug``,
        ``results_path``, ``exp_name`` and ``alpha``.

    Returns
    -------
    loss_value : scalar
        Total loss.
    loss_sup : scalar
        Supervised loss.
    loss_usup : scalar
        Unsupervised loss.
    pair_sup : a tuple of tensors
        Ground truth labels and predictions on labeled examples.
    pair_usup : a tuple of tensors
        Predictions on two differently transformed labeled and unlabeled examples.
    """
    inputs, y, labeled = data
    x = inputs[0]
    transform_parameters = inputs[1]

    # number of unique labeled and labeled+unlabeled images
    n_labeled = int(torch.count_nonzero(labeled))
    n = x.shape[0]

    # -- transformation --
    transform = getattr(func, 'get_batch_transform_' + p.transform.apply_func)(
        *transform_parameters, **p.transform.params_apply)
    t_x = torch.stack(transform(x))  # transform input images
    x = torch.cat((x, t_x), dim=0)  # form a batch to feed to the network

    # if network outputs and labels also need to be transformed
    # (as in the segmentation case):
    if p.transform_output:
        transform_output = getattr(
            func, 'get_batch_transform_' + p.transform_output.apply_func)(
                *transform_parameters, **p.transform_output.params_apply)
        t_y = transform_output(y)  # transform GT labels
        y = torch.cat((y, t_y), dim=0)  # form a batch corresponding to x
    else:
        y = torch.cat((y, y), dim=0)

    # save original and transformed inputs when in the debugging mode:
    if p.debug:
        debug_dir = os.path.join(p.results_path, p.exp_name)
        # detach + cpu so plotting also works for GPU / grad-tracking tensors
        x_np = x.detach().cpu().numpy()
        y_np = y.detach().cpu().numpy()
        improc.plot_batch_sample(
            p, x_np, y_np,
            os.path.join(p.results_path, p.exp_name, 'debug/model_input.png'))
        improc.plot_batch_sample(
            p, x_np[n:, ...], y_np[n:, ...],
            os.path.join(debug_dir, 'debug/model_transformed_input.png'))
    # -- end transformation --

    pred = model(x)

    # separate differently transformed
    pred1, pred2 = pred[:n, ...], pred[n:, ...]
    if p.transform_output:
        # transform the first half of the predictions to align it with the
        # second half:
        pred1 = transform_output(pred1)

    # separate labeled images from the rest.  The second half of the
    # predictions was made on transformed inputs, so it is paired with the
    # second half of ``y`` (transformed labels when ``p.transform_output`` is
    # set, a plain copy otherwise) rather than with the untransformed labels.
    yl = torch.cat((y[:n_labeled, ...], y[n:(n + n_labeled), ...]), dim=0)
    predl = torch.cat((pred[:n_labeled, ...], pred[n:(n + n_labeled), ...]),
                      dim=0)

    # supervised loss
    loss_sup = kl_divergence(yl, predl)

    # unsupervised loss made symmetric (e.g. KL divergence is not symmetric)
    loss_usup = (kl_divergence(pred1, pred2) + kl_divergence(pred2, pred1)) / 2

    # total loss: supervised + weight * unsupervised consistency
    loss_value = loss_sup + p.alpha * loss_usup

    return loss_value, loss_sup, loss_usup, (yl, predl), (pred1, pred2)
def get_costs(dataset, pi):
    """Evaluate the cost of each tour in ``pi`` for the given problem batch.

    ``pi[b, i]`` is the node picked at step ``i`` for batch entry ``b``;
    node 0 is the depot and node ``k >= 1`` encodes the pair
    ``(user, RB) = divmod(k - 1, n_RBs)``.  All steps except the last one are
    replayed, accumulating consumed power (in dB), per-user data rate and
    the bookkeeping of punctured / unused resource blocks.

    Returns a ``(cost, None)`` tuple where ``cost`` has one entry per batch
    element: consumed power / 60, plus the number of users with unmet
    demand, plus 1 when punctured RBs were used while unused ones remained.
    """
    if pi.size(-1) == 1:
        # In case all tours directly return to depot, prevent further problems
        assert (pi == 0).all(), "If all length 1 tours, they should be zero"
        return torch.zeros(pi.size(0), dtype=torch.float,
                           device=pi.device), None

    # Make sure each node is visited at most once (except for the depot)
    ordered = pi.data.sort(1)[0]
    later, earlier = ordered[:, 1:], ordered[:, :-1]
    assert ((later == 0) | (later > earlier)).all(), "Duplicates"

    chGain = dataset['chGain']  # (batch_size, n_loc)
    dataLoad = dataset['dataLoad']  # (batch_size, n_users)
    puncFlag = dataset['puncFlag']  # (batch_size, n_RBs)
    numerology = dataset['numerology']  # (batch_size)
    availRBNum = dataset['availRBNum']  # (batch_size)

    batch_size, _ = chGain.size()
    _, _ = dataLoad.size()
    _, n_RBs = puncFlag.size()

    # Channel condition with a zero-gain column prepended for the depot.
    depotChGain = torch.zeros(batch_size, dtype=torch.float,
                              device=chGain.device)
    chCond = torch.cat((depotChGain[:, None], chGain), -1)

    dataRate = torch.zeros_like(dataLoad)
    remainingDemands = dataLoad.clone()
    remainingUnusedRB = availRBNum - torch.count_nonzero(puncFlag, dim=-1)
    consumedPower = torch.zeros(batch_size, dtype=torch.float,
                                device=chGain.device)
    puncRBsNum = torch.zeros(batch_size, dtype=torch.int64,
                             device=chGain.device)
    ids = torch.arange(batch_size, dtype=torch.int64, device=chGain.device)

    # Does not depend on the step, so it is computed once.
    qFuncInv = math.sqrt(2) * erfinv(1 - (2 * LL.EPSILON))

    for step in range(pi.size(1) - 1):
        selected = pi[:, step]
        # Batch entries whose current pick is a real node, not the depot.
        active = ids[selected >= 1]
        node = selected[active] - 1  # -1 because of depot
        users = node // n_RBs
        rbs = node % n_RBs

        flags = puncFlag[active, rbs]
        puncRBsNum[active[flags == 1]] += 1
        remainingUnusedRB[active[flags == 0]] -= 1

        gain = chCond[ids, selected][active]
        power = LL.SNR_THR / gain
        consumedPower[active] += 10.0 * torch.log10(power)

        snr = power * gain
        dispersion = 1. - (1. / ((1. + snr)**2))
        chUse = torch.tensor([
            LL.CHANNEL_USE.get(value)
            for value in numerology[active].tolist()
        ]).to(chCond.device)

        dataRate[active, users] += (1. / math.log(2)) * (
            chUse * (torch.log(1 + snr)) -
            torch.sqrt(chUse * dispersion) * qFuncInv)
        remainingDemands[active, users] -= dataRate[active, users]
        remainingDemands[active, users] = torch.max(
            remainingDemands[active, users],
            torch.tensor([0]).to(chCond.device))

    unmet_users = torch.count_nonzero((remainingDemands > 0).float(), dim=-1)
    wasted_puncture = ((remainingUnusedRB > 0) & (puncRBsNum > 0)).float()
    return (consumedPower / 60.0 + unmet_users + wasted_puncture), None
# Experiment: does summing a complex tensor over the channel axis give the
# same numbers as summing its real (re, im) view?
shape = (42, 32, 10, 10)
x = torch.randn(shape)
z = torch.randn(shape)

# Cross-spectrum of x and z over the two trailing (spatial) dimensions.
x_spectrum = fft.rfftn(x, dim=[2, 3])
z_spectrum = fft.rfftn(z, dim=[2, 3])
tnew = x_spectrum * torch.conj(z_spectrum)

# Same reduction, once on the real view and once on the complex tensor.
sum_float = torch.sum(torch.view_as_real(tnew), dim=1, keepdim=True)
sum_complex = torch.sum(tnew, dim=1, keepdim=True)

# Element-wise exact matches, counted in both representations.
exact_as_real = torch.eq(sum_float, torch.view_as_real(sum_complex))
print('Equality as real: ', torch.count_nonzero(exact_as_real).item())
exact_as_complex = torch.eq(torch.view_as_complex(sum_float), sum_complex)
print('Equality as complex: ', torch.count_nonzero(exact_as_complex).item())

# Tolerance-based comparison in both representations.
print('Compared as real: ',
      torch.allclose(sum_float, torch.view_as_real(sum_complex)))
print('Compared as complex: ',
      torch.allclose(torch.view_as_complex(sum_float), sum_complex))
def get_aux_loss(self, aux_model: nn.Module, observations: ObservationType,
                 obs_embeds: torch.FloatTensor, actions: torch.FloatTensor,
                 beliefs: torch.FloatTensor, masks: torch.FloatTensor, *args,
                 **kwargs):
    """Auxiliary loss: predict the action taken between consecutive steps.

    For every step ``t < T-1`` the decoder receives the observation
    embeddings at ``t`` and ``t+1`` together with the propagated final belief
    at ``t`` and is trained with cross entropy to predict ``actions[t]``.
    Losses are masked with ``masks[1:]`` and, once at least
    ``self.subsample_min_num`` of them are valid, further sub-sampled with
    ``self.subsample_rate``.

    Returns ``(avg_loss, {"total": float})``.
    """
    num_steps, num_sampler = actions.shape  # T, B
    num_transitions = num_steps - 1

    # The last action of the rollout has no successor observation.
    target_actions = cast(torch.LongTensor, actions)[:-1]  # (T-1, B)

    # Find the final belief state of each episode based on the masks.
    step_masks = masks.squeeze(-1)  # (T, B)
    final_beliefs, _, _ = _propagate_final_beliefs_to_all_steps(
        beliefs,
        step_masks,
        num_sampler,
        num_steps,
    )

    decoder_in = torch.cat(
        [obs_embeds[:-1], obs_embeds[1:], final_beliefs[:-1]],
        dim=2)  # (T-1, B, *)
    preds = aux_model(decoder_in)  # (T-1, B, A)

    # Cross entropy expects the class dimension at position 1, so flatten
    # time and sampler into one axis and restore the layout afterwards.
    flat_loss = self.cross_entropy_loss(
        preds.view(num_transitions * num_sampler, -1),  # ((T-1)*B, A)
        target_actions.flatten(),  # ((T-1)*B,)
    )
    loss = flat_loss.view(num_transitions, num_sampler)  # (T-1, B)

    transition_masks = step_masks[1:]
    num_valid_losses = torch.count_nonzero(transition_masks)
    # Too few valid losses: keep all of them instead of sub-sampling.
    subsample_rate = (1.0 if num_valid_losses < self.subsample_min_num else
                      self.subsample_rate)

    loss_masks = transition_masks * _bernoulli_subsample_mask_like(
        transition_masks, subsample_rate)
    num_valid_losses = torch.count_nonzero(loss_masks)
    avg_loss = (loss * loss_masks).sum() / torch.clamp(num_valid_losses,
                                                       min=1.0)

    return (
        avg_loss,
        {
            "total": cast(torch.Tensor, avg_loss).item(),
        },
    )
def calc_sparsity(tensor):
    """Return the fraction of entries of ``tensor`` that are zero."""
    element_count = tensor.numel()
    nonzero_count = count_nonzero(tensor)
    return (element_count - nonzero_count) / element_count
def __calculate_recall_precision_scores(
    recall: Tensor,
    precision: Tensor,
    scores: Tensor,
    idx_cls: int,
    idx_bbox_area: int,
    idx_max_det_thrs: int,
    eval_imgs: list,
    rec_thresholds: Tensor,
    max_det: int,
    nb_imgs: int,
    nb_bbox_areas: int,
) -> Tuple[Tensor, Tensor, Tensor]:
    """Fill recall / precision / score tables for one class and area range.

    ``eval_imgs`` is a flat list laid out as
    ``[class][bbox area][image]``; the entries for ``idx_cls`` and
    ``idx_bbox_area`` are gathered, their detections merged and sorted by
    score, and the results are written in place into ``recall``,
    ``precision`` and ``scores`` at ``idx_max_det_thrs``.  The three tensors
    are returned unchanged when there is nothing to evaluate.
    """
    nb_rec_thrs = len(rec_thresholds)

    # Load all image evals for the current class and area range.
    offset = idx_cls * nb_bbox_areas * nb_imgs + idx_bbox_area * nb_imgs
    img_evals = [
        entry for entry in (eval_imgs[offset + i] for i in range(nb_imgs))
        if entry is not None
    ]
    if not img_evals:
        return recall, precision, scores

    det_scores = torch.cat([e["dtScores"][:max_det] for e in img_evals])
    order = torch.argsort(det_scores, descending=True)
    det_scores_sorted = det_scores[order]

    det_matches = torch.cat([e["dtMatches"][:, :max_det] for e in img_evals],
                            dim=1)[:, order]
    det_ignore = torch.cat([e["dtIgnore"][:, :max_det] for e in img_evals],
                           dim=1)[:, order]
    gt_ignore = torch.cat([e["gtIgnore"] for e in img_evals])

    # Number of ground truths that are not ignored.
    npig = torch.count_nonzero(gt_ignore == False)  # noqa: E712
    if npig == 0:
        return recall, precision, scores

    kept = torch.logical_not(det_ignore)
    tps = torch.logical_and(det_matches, kept)
    fps = torch.logical_and(torch.logical_not(det_matches), kept)
    tp_sum = torch.cumsum(tps, dim=1, dtype=torch.float)
    fp_sum = torch.cumsum(fps, dim=1, dtype=torch.float)

    eps = torch.finfo(torch.float64).eps
    for idx, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
        nd = len(tp)
        rc = tp / npig
        pr = tp / (fp + tp + eps)
        prec = torch.zeros((nb_rec_thrs, ))
        score = torch.zeros((nb_rec_thrs, ))

        recall[idx, idx_cls, idx_bbox_area,
               idx_max_det_thrs] = rc[-1] if nd else 0

        # Remove zigzags for AUC: keep lifting each precision value up to
        # its right neighbour until a pass changes nothing.
        tail = torch.zeros((1, ), device=pr.device)
        while True:
            diff = torch.clamp(torch.cat((pr[1:] - pr[:-1], tail), 0), min=0)
            pr += diff
            if torch.all(diff == 0):
                break

        inds = torch.searchsorted(rc,
                                  rec_thresholds.to(rc.device),
                                  right=False)
        # Thresholds beyond the highest reached recall keep zero entries.
        num_inds = inds.argmax() if inds.max() >= nd else nb_rec_thrs
        inds = inds[:num_inds]
        prec[:num_inds] = pr[inds]
        score[:num_inds] = det_scores_sorted[inds]

        precision[idx, :, idx_cls, idx_bbox_area, idx_max_det_thrs] = prec
        scores[idx, :, idx_cls, idx_bbox_area, idx_max_det_thrs] = score

    return recall, precision, scores
def forward(cxt,
            input,
            dim=None,
            quant=8,
            _scale=None,
            _initial_zero_point=None):
    """Fake-quantize ``input`` to ``quant`` bits and return the dequantized tensor.

    Values are mapped onto the integer grid ``[0, 2**quant - 1]`` with an
    affine transform (scale + zero point), rounded, clamped and mapped back.

    Parameters
    ----------
    cxt : unused by this function.
    input : torch.Tensor
    dim : int or None
        ``None`` derives one scale / zero point from the whole tensor;
        otherwise they are derived separately along ``dim``.
    quant : int
        Number of bits.
    _scale, _initial_zero_point :
        When ``_scale`` is given, both are used as-is instead of being
        derived from ``input``.

    Returns
    -------
    torch.Tensor
        The quantized-then-dequantized tensor, or ``input`` itself when it
        cannot be quantized (size < 2 along ``dim``, or zero value range).
    """
    qmax = 2**quant - 1

    if dim is not None and input.shape[dim] < 2:
        return input

    if _scale is not None:
        scale = _scale
        zero_point = _initial_zero_point
    elif dim is None:
        scale = (torch.max(input) - torch.min(input)) / qmax
        if scale == 0:
            return input
        zero_point = 0 - torch.min(input) / scale
        if torch.isnan(zero_point):
            zero_point = 0
        else:
            # The zero point lives on the integer grid, so it is limited to
            # [0, qmax].  Limiting it to ``qmax * scale`` (a value in input
            # units) collapses negative inputs towards zero.
            zero_point = int(torch.clamp(zero_point, min=0, max=qmax))
    else:
        scale = (1.0 / qmax) * (torch.max(input, dim=dim, keepdim=True)[0] -
                                torch.min(input, dim=dim, keepdim=True)[0])
        if torch.count_nonzero(scale) == 0:
            return input
        # Slices with zero range: avoid dividing by zero.
        scale[scale == 0] = 1
        zero_point = 0 + -1 * torch.div(
            torch.min(input, dim=dim, keepdim=True)[0], scale)
        zero_point[zero_point < 0] = 0
        zero_point[zero_point > qmax] = qmax
        zero_point[torch.isnan(zero_point)] = 0
        zero_point = zero_point.int()

    # ``input / scale + zero_point`` is a fresh tensor, so the in-place
    # round / clamp never modifies ``input``.
    quantized = (input / scale + zero_point).round_().clamp_(min=0, max=qmax)
    output = (quantized - zero_point) * scale
    return output
def check_inf_nan(input):
    """Print a message when ``input`` contains infinite and/or NaN entries."""
    checks = (
        ("Inf detected", torch.isinf),
        ("NaN detected", torch.isnan),
    )
    for message, predicate in checks:
        if bool(predicate(input).any()):
            print(message)
def optimize_model(self):
    """Run one optimisation step on a random batch of stored transitions.

    Does nothing until at least ``self.BATCH_SIZE`` transitions are stored.
    The loss is the Huber loss between ``Q(s, a)`` from ``self.policy_net``
    and ``r + GAMMA * max_a' Q(s', a')`` from ``self.model``; while
    ``self.steps_done <= self.imitationSteps`` an imitation term is added and
    ``self.accuracy`` is updated.  Gradients of ``self.policy_net`` are
    clamped to ``[-1, 1]`` before the optimiser step.
    """
    if len(self.transitions) < self.BATCH_SIZE:
        return

    batch = random.sample(list(self.transitions), self.BATCH_SIZE)
    batch = Transition(*zip(*batch))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # compute a mask of non-final states and prepare the batch elements
    non_final_next = [s for s in batch.next__to if s is not None]
    non_final_mask = torch.tensor([s is not None for s in batch.next__to],
                                  device=device,
                                  dtype=torch.bool)
    state_batch = torch.cat(batch.state).to(device)
    try:
        action_batch = torch.from_numpy(
            np.asarray(batch.action).astype("int64")).to(device)
    except ValueError:
        # Show the offending actions, then propagate: continuing would only
        # fail later with an unrelated NameError on ``action_batch``.
        print(batch.action)
        raise
    reward_batch = torch.from_numpy(np.asarray(
        batch.reward)).to(device).float()

    # compute Q(s_t, a)
    state_action_values = self.policy_net(state_batch).gather(
        1, action_batch.unsqueeze(1))

    # compute V(s_{t+1}) for all next states; final states keep value 0.
    next_state_values = torch.zeros(self.BATCH_SIZE, device=device)
    if non_final_next:  # torch.cat raises on an empty list
        non_final_next_states = torch.cat(non_final_next).to(device)
        next_state_values[non_final_mask] = self.model(
            non_final_next_states).max(1)[0].detach()

    # compute the expected Q values
    expected_state_action_values = (next_state_values *
                                    self.GAMMA) + reward_batch

    # compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    log_now = self.steps_done % 4000 == 0
    if log_now:
        print("Episode: ", self.i_episode)
        print("steps done: ", self.steps_done)
        print("Huber loss: ", loss)

    # compute imitation loss and combine losses
    if self.steps_done <= self.imitationSteps:
        if self.steps_done == self.imitationSteps:
            print("end imitation learning")
        if self.steps_done == self.BATCH_SIZE:
            print("start imitation learning")

        values = self.model(state_batch)
        a = values.gather(1, action_batch.unsqueeze(1))
        # NOTE(review): the original built a margin-augmented copy of
        # ``values`` (+0.8 on every non-demonstrated action) and then
        # overwrote it with the plain maximum below, so the margin never
        # took effect.  The effective behaviour is kept; confirm whether the
        # margin was meant to be used.
        output = values.max(1)[0].detach().unsqueeze(1)
        imitationLoss = output - a
        # Fraction of samples whose demonstrated action already has the
        # highest value (zero imitation loss).
        self.accuracy = 1 - (torch.count_nonzero(imitationLoss) /
                             self.BATCH_SIZE)
        imitationLoss = torch.sum(imitationLoss)
        if log_now:
            print("imitation loss: ", imitationLoss)
        loss += imitationLoss

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        # Parameters that did not take part in the loss have no gradient.
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
def train(self, batch: EpisodeBatch, t_env: int, episode_num: int):
    """Run one Q-learning update on a batch of episodes.

    Computes the chosen-action Q-values with ``self.mac`` and 1-step targets
    with ``self.target_mac`` (optionally double-Q and mixed through
    ``self.mixer`` / ``self.target_mixer``), minimises the masked squared TD
    error, periodically refreshes the target networks, reports the number of
    trained steps to the controller and logs statistics every
    ``learner_log_interval`` environment steps.
    """
    # Relevant batch quantities (the last time step has no successor).
    rewards = batch["reward"][:, :-1]
    actions = batch["actions"][:, :-1]
    terminated = batch["terminated"][:, :-1].float()
    avail_actions = batch["avail_actions"]

    # ``filled`` marks real (non-padded) steps; steps after termination are
    # masked out as well.
    mask = batch["filled"][:, :-1].float()
    mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1])

    timesteps = range(batch.max_seq_length)

    # Estimated Q-values for every time step, stacked over time.
    self.mac.init_hidden(batch.batch_size)
    mac_out = th.stack([self.mac.forward(batch, t=t) for t in timesteps],
                       dim=1)

    # Q-values of the actions actually taken by each agent.
    chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3,
                                    index=actions).squeeze(3)

    # Target network Q-values; the first time step is not needed.
    self.target_mac.init_hidden(batch.batch_size)
    target_outs = [self.target_mac.forward(batch, t=t) for t in timesteps]
    target_mac_out = th.stack(target_outs[1:], dim=1)

    # Unavailable actions get a very low utility.
    target_mac_out[avail_actions[:, 1:] == 0] = -9999999

    if self.args.double_q:
        # Pick the maximising actions with the live network.
        live_q = mac_out.clone().detach()
        live_q[avail_actions == 0] = -9999999
        cur_max_actions = live_q[:, 1:].max(dim=3, keepdim=True)[1]
        target_max_qvals = th.gather(target_mac_out, 3,
                                     cur_max_actions).squeeze(3)
    else:
        target_max_qvals = target_mac_out.max(dim=3)[0]

    # Mix
    if self.mixer is not None:
        chosen_action_qvals = self.mixer(chosen_action_qvals,
                                         batch["state"][:, :-1])
        target_max_qvals = self.target_mixer(target_max_qvals,
                                             batch["state"][:, 1:])

    # 1-step Q-learning targets and TD error.
    targets = rewards + self.args.gamma * (1 - terminated) * target_max_qvals
    td_error = (chosen_action_qvals - targets.detach())

    # Zero out the error coming from padded data.
    mask = mask.expand_as(td_error)
    masked_td_error = td_error * mask

    # L2 loss averaged over actual data only.
    loss = (masked_td_error**2).sum() / mask.sum()

    # Optimise
    self.optimiser.zero_grad()
    loss.backward()
    grad_norm = th.nn.utils.clip_grad_norm_(self.parameters(),
                                            self.args.grad_norm_clip)
    self.optimiser.step()

    # Refresh the target networks once enough episodes have passed.
    episodes_since_update = episode_num - self.last_target_update_episode
    if episodes_since_update / self.args.target_update_interval >= 1.0:
        self.update_targets()
        self.last_target_update_episode = episode_num

    # Tell the controller how many steps have been trained.
    trained_steps = th.count_nonzero(mask)
    self.mac.update_trained_steps(trained_steps.item())

    # Log learner stats in interval
    if t_env - self.log_stats_t >= self.args.learner_log_interval:
        log_stat = self.logger.log_stat
        log_stat(self.name + "loss", loss.item(), t_env)
        log_stat(self.name + "grad_norm", grad_norm.cpu().numpy(), t_env)
        mask_elems = mask.sum().item()
        per_agent_elems = mask_elems * self.args.n_agents
        log_stat(self.name + "td_error_abs",
                 (masked_td_error.abs().sum().item() / mask_elems), t_env)
        log_stat(self.name + "q_taken_mean",
                 (chosen_action_qvals * mask).sum().item() / per_agent_elems,
                 t_env)
        log_stat(self.name + "target_mean",
                 (targets * mask).sum().item() / per_agent_elems, t_env)
        self.log_stats_t = t_env
def angle_rotation_box(self, box, angle, box_flip):
    """Rotate ``box_flip`` by ``angle`` and score its overlap with ``box``.

    Returns the sum of the element-wise product of ``box`` and the rotated
    ``box_flip``, divided by the number of non-zero entries of the rotated
    tensor.
    """
    rotated = tv.transforms.functional.rotate(box_flip, angle)
    overlap = torch.mul(box, rotated).sum()
    return overlap / torch.count_nonzero(rotated)
pickle.dump({ 'trial settings': trial, 'initial weight': initial_W, 'initial mask': initial_W_mask}, F) Network.calculate_test_error() training_errors.append(Network.test_error) test_errors.append(Network.test_error) mean_dW = torch.zeros(Network.W.shape).to(trial['configuration']['device']) for epoch_idx in np.arange(n_epochs): print('Starting epoch %d.'%(epoch_idx+1)) t0 = time.time() Network.train_epoch() #Network.calculate_training_error() Network.calculate_test_error() states.append((int(torch.count_nonzero(Network.s==0).cpu()), int(torch.count_nonzero(Network.s==1).cpu()))) training_errors.append(Network.training_error) test_errors.append(Network.test_error) mean_dW = torch.abs(Network.mean_dW)/(epoch_idx+1) + (epoch_idx/(epoch_idx+1))*mean_dW for p in Network.per_layer_rates: per_layer_rates.append(p) print('\tDone.') print('\tTime taken:', (time.time()-t0)) print('\tTraining error:', Network.training_error) print('\tTest error:', Network.test_error) with open(os.path.join(filepath, 'e%d.pickle'%(epoch_idx)), 'wb') as F: pickle.dump({ 'training error': Network.training_error, 'test error': Network.test_error, #'true training error': Network.true_training_error, 'per-layer rates': per_layer_rates[-Network.dataset.n_trainb:],
std=[1 / 0.229, 1 / 0.224, 1 / 0.225]), transforms.Normalize(mean=[-0.485, -0.456, -0.406], std=[1., 1., 1.]), ]) for i, ((img1, img2), y, (class1, class2)) in enumerate(val_dataloader): print("[{} / {}]".format(i, len(val_dataloader))) img1, img2, y = map(lambda x: x.to(device), [img1, img2, y]) class1 = class1[0] class2 = class2[0] prob = model(img1, img2) loss = criterion(prob, y) losses.append(loss.item()) correct += torch.count_nonzero(y == (prob > 0.5)).item() total += len(y) fig = plt.figure("class1={}\tclass2={}".format(class1, class2), figsize=(4, 2)) plt.suptitle("cls1={} conf={:.2f} cls2={}".format( class1, prob[0][0].item(), class2)) # Apply inverse transform (denormalization) on the images to retrieve original images. img1 = inv_transform(img1).cpu().numpy()[0] img2 = inv_transform(img2).cpu().numpy()[0] # show first image ax = fig.add_subplot(1, 2, 1) plt.imshow(img1[0], cmap=plt.cm.gray) plt.axis("off")
def loss_single(self, grid_cell_centers, cls_score, bbox_pred, labels,
                bbox_targets, label_weights, stride):
    '''Compute the losses of a single feature level for one image.

    Args:
        grid_cell_centers [h * w, 2]
        cls_score [class_num, h, w]
        bbox_pred [4*(reg_max + 1), h, w]
        labels [h * w, 1]
        bbox_targets [h * w, 4]
        label_weights [h * w, 1]
        stride: divisor applied to ``bbox_targets`` before they are compared
            with the decoded predictions.
    Returns:
        tuple ``(loss_qfl, loss_bbox, loss_dfl, num_pos)``: the three summed
        losses and the number of positive locations as a 0-dim tensor.
    '''
    # Flatten the spatial dimensions: one row per grid location.
    cls_score = cls_score.permute(1, 2, 0).reshape(-1, cls_score.shape[-3])
    bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, bbox_pred.shape[-3])
    labels = labels.squeeze(dim=-1)

    # Positive locations carry a foreground label in [0, class_num).
    pos_inds = torch.nonzero((labels >= 0) & (labels < self.class_num),
                             as_tuple=False).squeeze(1)
    # Count the indices themselves.  ``count_nonzero(pos_inds)`` would count
    # non-zero index *values* and miss a positive at location 0.
    num_pos = pos_inds.new_tensor(pos_inds.numel())

    # Quality target for QFL: IoU at positive locations, 0 elsewhere.
    score = labels.new_zeros(labels.shape).float()

    if len(pos_inds) > 0:
        pos_bbox_targets = bbox_targets[pos_inds]
        pos_bbox_pred = bbox_pred[pos_inds]  # (n, 4 * (reg_max + 1))
        pos_grid_cell_centers = grid_cell_centers[pos_inds]

        # Weight each positive by its highest predicted class probability.
        weight_targets = cls_score.detach().sigmoid()
        weight_targets = weight_targets.max(dim=1)[0][pos_inds]

        pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)
        pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers,
                                             pos_bbox_pred_corners)
        pos_decode_bbox_targets = pos_bbox_targets / stride
        score[pos_inds] = bbox_overlaps(pos_decode_bbox_pred.detach().float(),
                                        pos_decode_bbox_targets,
                                        is_aligned=True)

        pred_corners = pos_bbox_pred.reshape(-1, self.gl_cfg.reg_max + 1)
        target_corners = bbox2distance(pos_grid_cell_centers,
                                       pos_decode_bbox_targets,
                                       self.gl_cfg.reg_max).reshape(-1)

        # regression loss
        loss_bbox = self.loss_bbox(pos_decode_bbox_pred,
                                   pos_decode_bbox_targets,
                                   weight=weight_targets,
                                   avg_factor=1.0)

        # dfl loss: one weight per box side
        loss_dfl = self.loss_dfl(pred_corners,
                                 target_corners,
                                 weight=weight_targets[:, None].expand(
                                     -1, 4).reshape(-1),
                                 avg_factor=4.0)
    else:
        # No positives: zero losses that stay connected to the graph.
        loss_bbox = bbox_pred.sum() * 0
        loss_dfl = bbox_pred.sum() * 0

    # qfl loss
    loss_qfl = self.loss_qfl(cls_score, (labels, score),
                             weight=label_weights)

    return loss_qfl.sum(), loss_bbox.sum(), loss_dfl.sum(), num_pos
for name, _ in model.named_parameters(): if "logit" in name: layer_name.append(name) for layer_index in range(num_of_layers): average_weights_shared_add = 0 average_weights_shared_mult = 0 average_add_size = 0 average_mult_size = 0 average_original_size = 0 #Loop N times since we're sampling from a distribution N = 100 for n in range(N): add_mask = gumbal(precision_add_mult[layer_index]) mult_mask = gumbal(precision_mult_add[layer_index]) add_size = torch.count_nonzero(add_mask) mult_size = torch.count_nonzero(mult_mask) shared_size = torch.count_nonzero(add_mask * mult_mask) # if n == 0: #plot first mask # encoding = add_mask + 2*mult_mask # if(encoding.shape.__len__() == 1): # encoding = encoding[None, :] #Extend dimension # cmap = colors.ListedColormap(['black', 'blue', 'red', 'purple']) # bounds = [0, 1, 2,3,4] # norm = colors.BoundaryNorm(bounds, cmap.N) # plt.figure(x) # if x % 2 != 0: #odd is bias layer # aspect_size = 100 # elif x == 0: # aspect_size = 5 # elif x == 2:
gpu_X = X.to(device) gpu_Y = Y.to(device) model.train() optimizer.zero_grad() hypothesis = model(gpu_X) cost = criterion(hypothesis, gpu_Y) cost.backward() avg_cost += (cost / total_batch) optimizer.step() model.eval() prediction = model(gpu_X) output = (prediction > accuracy_threshold).float() equal_matrix = torch.logical_and(output, gpu_Y) accuracy = torch.count_nonzero(equal_matrix) / torch.count_nonzero( gpu_Y) avg_acc += (accuracy / total_batch) print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost), 'acc =', '{:.9f}'.format(avg_acc)) if avg_acc > target_accuracy: break ## no Train Model Save model.eval() trace_input = torch.rand(1, 3, 224, 224).to(device, dtype=torch.float32) traced_script_module = torch.jit.trace(model, trace_input) traced_script_module.save( "C://Github//DeepLearningStudy//trained_model//TrainAnimalClassificationV1_FIAT.pt" )
def get_aux_loss(self, aux_model: nn.Module, observations: ObservationType,
                 obs_embeds: torch.FloatTensor, actions: torch.FloatTensor,
                 beliefs: torch.FloatTensor, masks: torch.FloatTensor, *args,
                 **kwargs):
    """Contrastive auxiliary loss over ``self.planning_steps`` future steps.

    Future contexts are produced autoregressively,
    c_{t+1:t+k} = GRU(b_t, a_{t:t+k-1}), where b_t = RNN(b_{t-1}, z_t, a_{t-1}).
    Each context is scored by ``aux_model.classifier`` against the real
    future observation embedding (target 1) and against an embedding drawn
    from a random permutation of the whole batch (target 0).

    Args:
        aux_model: Module providing ``action_embedder``, ``context_model``
            and ``classifier``.
        observations: Unused here.
        obs_embeds: Observation embeddings, unpacked as (T, N, H_O).
        actions: Actions fed to ``aux_model.action_embedder``.
        beliefs: Beliefs, viewed as (T * N, -1).
        masks: Squeezed on the last dim to (T, N); zeros mark timesteps that
            invalidate predictions (see the masking section below).

    Returns:
        ``(avg_loss, info)`` where ``info`` holds the Python floats
        ``"total"``, ``"positive_loss"`` and ``"negative_loss"``.
    """
    k = self.planning_steps
    T, N, embed_dim = obs_embeds.shape  # T, N, H_O
    assert 0 < k <= T
    flat = T * N

    ## negatives: the positives shuffled across all (timestep, sampler) pairs
    shuffled_rows = torch.randperm(flat).to(obs_embeds.device)
    negatives = torch.gather(
        obs_embeds.view(flat, -1),
        dim=0,
        index=shuffled_rows.view(flat, 1).expand(flat, obs_embeds.size(-1)),
    ).view(T, N, -1)  # (T, N, -1)

    ## action windows: pad with k-1 zero steps, then unfold into length-k runs
    act_embeds = aux_model.action_embedder(actions)  # (T, N, -1)
    act_dim = act_embeds.size(-1)
    act_pad = torch.zeros(k - 1, N, act_dim).to(act_embeds)  # (k-1, N, -1)
    act_padded = torch.cat((act_embeds, act_pad), dim=0)  # (T+k-1, N, -1)
    act_windows = act_padded.unfold(dimension=0, size=k, step=1)
    action_seq = act_windows.permute(3, 0, 1, 2).view(k, flat,
                                                      act_dim)  # (k, T*N, -1)

    beliefs = beliefs.view(flat, -1).unsqueeze(0)  # (1, T*N, -1)

    ## future contexts, starting at step t+1 (not t)
    contexts, _ = aux_model.context_model(action_seq, beliefs)  # (k, T*N, -1)
    contexts = contexts.view(k, T, N, -1).permute(1, 0, 2, 3)  # (T, k, N, -1)

    def _windowed_loss(embeds, make_target):
        """Score embeds[t+1 : t+1+k] against the contexts of every step t."""
        tail_pad = torch.zeros(k, N, embed_dim).to(embeds)  # (k, N, -1)
        shifted = torch.cat((embeds[1:], tail_pad), dim=0)  # (T+k-1, N, -1)
        windows = shifted.unfold(dimension=0, size=k,
                                 step=1).permute(0, 3, 1, 2)  # (T, k, N, -1)
        logits = aux_model.classifier(torch.cat([windows, contexts],
                                                -1))  # (T, k, N, 1)
        return self.cross_entropy_loss(logits, make_target(logits))

    positive_loss = _windowed_loss(obs_embeds, torch.ones_like)
    negative_loss = _windowed_loss(negatives, torch.zeros_like)

    ## validity masks
    # pred_masks is indexed by (absolute timestep, planning step, sampler);
    # the diagonals extracted below give, per query step, the k predictions
    # that are allowed to contribute to the loss.
    masks = masks.squeeze(-1)  # (T, N)
    pred_masks = torch.ones(
        T + k,
        k,
        N,
        1,
        dtype=torch.bool,
    ).to(beliefs.device)  # (T+k, k, N, 1)
    # Invalid once t >= T: there is no real z_{t+1} to compare with.
    pred_masks[T - 1:] = False

    # Rows of each sampler where masks == 0.
    zero_rows = [torch.where(masks[:, n] == 0)[0] for n in range(N)]
    for step in range(k):  # `step` is the 0-based index of (step+1)-step predictions
        # Remove the upper triangle above the diagonal.
        pred_masks[:step, step] = False
        for n in range(N):
            # A zero at row z blanks step+1 consecutive rows starting at
            # z - 1 (pred_masks is offset by one relative to masks).
            for z in zero_rows[n]:
                pred_masks[z - 1:z + step, step, n] = False

    # One length-k diagonal per query timestep.
    per_step_diagonals = [
        torch.diagonal(pred_masks, offset=-t) for t in range(T)
    ]
    valid_masks = torch.stack(per_step_diagonals,
                              dim=0).permute(0, 3, 1, 2).float()
    # (T, N, 1, k) -> (T, k, N, 1)

    loss_masks = valid_masks * _bernoulli_subsample_mask_like(
        valid_masks, self.subsample_rate)  # (T, k, N, 1)
    num_valid_losses = torch.count_nonzero(loss_masks)
    denominator = torch.clamp(num_valid_losses, min=1.0)

    avg_positive_loss = (positive_loss * loss_masks).sum() / denominator
    avg_negative_loss = (negative_loss * loss_masks).sum() / denominator
    avg_loss = avg_positive_loss + avg_negative_loss

    info = {
        "total": cast(torch.Tensor, avg_loss).item(),
        "positive_loss": cast(torch.Tensor, avg_positive_loss).item(),
        "negative_loss": cast(torch.Tensor, avg_negative_loss).item(),
    }
    return avg_loss, info
def patch_sampler(img_filenames,
                  labelmap_filenames,
                  patch_size,
                  sampler_type,
                  out_dir,
                  max_patches=None,
                  voxel_spacing=(),
                  patch_overlap=(0, 0, 0),
                  min_labeled_voxels=1.0,
                  label_prob=0.8,
                  save_patches=False,
                  batch_size=None,
                  prepare_batches=False,
                  inference=False):
    """Extract image / label-map patches from a list of volumes.

    Every image is rescaled to the intensity range (0, 1) and, when
    ``voxel_spacing`` is given, resampled to that spacing before patches are
    drawn with either a grid sampler or a label sampler.

    Parameters
    ----------
    img_filenames : list of strings
        Paths to the images to extract patches from.
    labelmap_filenames : list of strings
        Paths to the label maps, index-aligned with ``img_filenames``.
        Images are skipped when this is empty, because both samplers read
        the label map of every patch.
    patch_size : tuple of ints (patch_x, patch_y, patch_z)
        The dimensions of one patch.
    sampler_type : str
        ``'grid'`` selects the grid sampler; any other value selects the
        label sampler.
    out_dir
        Forwarded to ``save`` when ``save_patches`` is set.
    max_patches : int or None
        Upper bound on the total number of patches; it is divided evenly
        between the images.
        NOTE(review): with the label sampler and ``max_patches=None`` the
        sampler is called without a patch count — confirm that it
        terminates in that configuration.
    voxel_spacing : tuple
        Target spacing; empty means "do not resample".
    patch_overlap : tuple of ints
        Overlap handed to the grid sampler.
    min_labeled_voxels : float between 0 and 1, or None
        Grid sampler only: minimum fraction of labeled voxels for a patch to
        be kept. A patch whose center voxel is labeled is always kept; with
        ``None`` only the center-voxel criterion is used.
    label_prob : float
        Label sampler only: sampling probability assigned to label 1
        (label 0 gets ``1 - label_prob``).
    save_patches : bool
        When set, patches are handed to ``save`` and the ids it returns are
        returned instead of arrays.
    batch_size : int or None
        Used as the save size when ``prepare_batches`` is set.
    prepare_batches : bool
        Save in groups of ``batch_size`` instead of one patch at a time.
    inference : bool
        Forwarded to ``save``.

    Returns
    -------
    img_ids, label_ids
        When ``save_patches`` is set.
    img_patches, label_patches : arrays
        Otherwise. Shape ``(n_patches, patch_y, patch_z, 1)`` when
        ``patch_size[0] == 1`` and
        ``(n_patches, patch_x, patch_y, patch_z, 1)`` in every other case.
    """
    # Split the global budget evenly between the images (nothing to split
    # when there are no images).
    if max_patches is not None and img_filenames:
        max_patches = int(max_patches / len(img_filenames))

    img_patches = []
    label_patches = []
    patch_counter = 0
    save_counter = 0
    img_ids = []
    label_ids = []
    save_size = batch_size if prepare_batches else 1

    voxels_per_patch = patch_size[0] * patch_size[1] * patch_size[2]
    center_index = (0, int(patch_size[0] / 2), int(patch_size[1] / 2),
                    int(patch_size[2] / 2))

    print(f'\nExtracting patches from: {img_filenames}\n')
    for i in tqdm(range(len(img_filenames)), leave=False):
        if voxel_spacing:
            util.update_affine(img_filenames[i], labelmap_filenames[i])

        if not labelmap_filenames:
            # No label map means no subject can be built for this image.
            continue

        subject = tio.Subject(img=tio.Image(img_filenames[i],
                                            type=tio.INTENSITY),
                              labelmap=tio.LabelMap(labelmap_filenames[i]))

        # Apply transformations
        transformed = tio.RescaleIntensity((0, 1))(subject)
        if voxel_spacing:
            transformed = tio.Resample(voxel_spacing)(transformed)

        if sampler_type == 'grid':
            num_img_patches = 0
            sampler = tio.data.GridSampler(transformed, patch_size,
                                           patch_overlap)
            for patch in sampler:
                # Stop as soon as this image's share is used up, so that at
                # most max_patches patches are collected per image.
                if max_patches is not None and num_img_patches >= max_patches:
                    break

                img_patch = np.array(patch.img.data)
                label_patch = np.array(patch.labelmap.data)
                enough_labels = (
                    min_labeled_voxels is not None
                    and torch.count_nonzero(patch.labelmap.data) >=
                    voxels_per_patch * min_labeled_voxels)
                center_labeled = label_patch[center_index] != 0

                if enough_labels or center_labeled:
                    img_patches.append(img_patch)
                    label_patches.append(label_patch)
                    patch_counter += 1
                    num_img_patches += 1
                    if save_patches:
                        (img_patches, label_patches, img_ids, label_ids,
                         save_counter, patch_counter) = save(
                             img_patches, label_patches, img_ids, label_ids,
                             save_counter, patch_counter, save_size,
                             patch_size, inference, out_dir)
        else:
            # Define sampler
            label_probabilities = {0: 1.0 - label_prob, 1: label_prob}
            sampler = tio.data.LabelSampler(
                patch_size, label_probabilities=label_probabilities)
            if max_patches is None:
                generator = sampler(transformed)
            else:
                generator = sampler(transformed, max_patches)

            for patch in generator:
                img_patches.append(np.array(patch.img.data))
                label_patches.append(np.array(patch.labelmap.data))
                patch_counter += 1
                if save_patches:
                    (img_patches, label_patches, img_ids, label_ids,
                     save_counter, patch_counter) = save(
                         img_patches, label_patches, img_ids, label_ids,
                         save_counter, patch_counter, save_size, patch_size,
                         inference, out_dir)

    print('Finished extracting patches.')

    if save_patches:
        return img_ids, label_ids

    # 2D patches drop the singleton first axis; 3D patches keep all three.
    # Images and labels must be reshaped identically.
    if patch_size[0] == 1:
        patch_shape = (patch_size[1], patch_size[2], 1)
    else:
        patch_shape = (patch_size[0], patch_size[1], patch_size[2], 1)
    return (np.array(img_patches).reshape(len(img_patches), *patch_shape),
            np.array(label_patches).reshape(len(label_patches), *patch_shape))
def _prediction_loop(
        self,
        dataloader: DataLoader,
        description: str,
        prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels. Besides collecting predictions it
    runs the attention analysis selected by ``self.args.plot_fig``:

    * ``7`` - last-layer [CLS]-row attention aggregated per linguistic tag,
      saved as ``.pt`` files.
    * ``6`` - per-layer / per-head attention towards [CLS] and [SEP],
      saved as ``.pt`` files and heat-map ``.png`` files.
    * ``2`` - every attention map cropped at the [SEP] position, saved as
      ``.npy`` files.

    NOTE(review): the analysis code calls ``.numpy()`` on the inputs and mixes
    model outputs with CPU accumulators, so it presumably only works when
    ``self.args.device`` is the CPU - confirm before running on GPU.

    Args:
        dataloader: Batches to evaluate; each batch is a dict of tensors
            containing at least ``"input_ids"``.
        description: Label used for logging and the progress bar.
        prediction_loss_only: When true, predictions and label ids are not
            collected. Defaults to ``self.prediction_loss_only``.

    Returns:
        A ``PredictionOutput`` with predictions, label ids and metrics.
    """
    prediction_loss_only = (prediction_loss_only
                            if prediction_loss_only is not None else
                            self.prediction_loss_only)

    # multi-gpu eval
    if self.args.n_gpu > 1 and not isinstance(self.model,
                                              torch.nn.DataParallel):
        model = torch.nn.DataParallel(self.model)
    else:
        model = self.model
    model.to(self.args.device)

    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", len(dataloader.dataset))
    logger.info("  Batch size = %d", dataloader.batch_size)
    eval_losses: List[float] = []
    preds: np.ndarray = None
    label_ids: np.ndarray = None
    model.eval()

    # Accumulators indexed [layer, head].
    avg_attentions_CLS = torch.zeros([12, 12], dtype=torch.float64)
    avg_attentions_SEP = torch.zeros([12, 12], dtype=torch.float64)
    avg_attentions_SEP1 = torch.zeros([12, 12], dtype=torch.float64)
    avg_attentions_SEP2 = torch.zeros([12, 12], dtype=torch.float64)
    max_attention = torch.zeros([1], dtype=torch.float64)
    min_attention = torch.zeros([1], dtype=torch.float64) + 1
    count = 0  # number of batches seen

    # Token id of each negation word ([1] skips the leading special token).
    negation_words = [
        self.tokenizer.encode(word)[1]
        for word in ("neither", "nor", "not", "never", "none", "don't",
                     "won't", "didn't", "hadn't", "haven't", "can't", "isn't",
                     "wasn't", "shouldn't", "couldn't", "nothing", "nowhere")
    ]
    pos_tag = spacy.load("en_core_web_lg")

    # Values are tensor with 1 * 12
    tags = ["NOUN", "PRON", "VERB", "[SEP]", "[CLS]", "PUNCT"]
    ling_feature_attention = dict()
    ling_feature_count = dict()
    for tag in tags:
        ling_feature_attention[tag] = torch.zeros([12], dtype=torch.float64)
        ling_feature_count[tag] = 0

    number_of_nonzeros = 0
    number_of_larger_than_64 = 0
    examples = 0

    for inputs in tqdm(dataloader, desc=description):
        # Sequence-length statistics (non-zero ids are counted as tokens).
        for input_id in inputs['input_ids']:
            sentence_length = torch.count_nonzero(input_id)
            if sentence_length > 64:
                number_of_larger_than_64 += 1
            number_of_nonzeros += sentence_length
            examples += 1

        has_labels = any(
            inputs.get(k) is not None for k in ["labels", "masked_lm_labels"])
        count += 1
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        # Record the position of [SEP] (token id 102): (rows, columns).
        sep_indices = np.where(inputs["input_ids"].numpy() == 102)
        # True when the first two [SEP] hits are in different rows (or there
        # is only one hit), i.e. the batch is treated as one [SEP] per row.
        single_sep = sep_indices[0].size == 1 or (
            sep_indices[0].size > 1
            and sep_indices[0][0] != sep_indices[0][1])

        ## PoS tagging for the data
        batch_pos_index = [
            dict() for x in range(inputs["input_ids"].shape[0])
        ]
        for i, sentence in enumerate(inputs["input_ids"]):
            # Record [SEP]
            batch_pos_index[i]["[SEP]"] = np.where(
                inputs["input_ids"][i].numpy() == 102)[0].tolist()
            seps = batch_pos_index[i]["[SEP]"]

            # Record negation
            # NOTE(review): np.in1d(negation_words, sentence) yields indices
            # into `negation_words`, not token positions - confirm intent.
            neg_indices = np.where(np.in1d(negation_words, sentence))[0]
            if len(neg_indices) != 0:
                batch_pos_index[i]["NEG"] = neg_indices

            # PoS tag the sentence
            tagged_sentence = pos_tag(self.tokenizer.decode(sentence))
            for j, token in enumerate(tagged_sentence):
                index = 0
                if j >= 3 and j < seps[0] + 2:
                    # Between [CLS] and first [SEP]
                    matches = (sentence[j - 2:] == self.tokenizer.encode(
                        token.text)[1]).nonzero()
                    if matches.shape[0] != 0:
                        index = matches[0].numpy()[0] + j - 2
                elif (len(seps) == 2 and j >= seps[0] + 5
                      and j < seps[1] + 4):
                    matches = (sentence[j - 4:] == self.tokenizer.encode(
                        token.text)[1]).nonzero()
                    if matches.shape[0] != 0:
                        index = matches[0].numpy()[0] + j - 4

                if index > 0 and (index < seps[0]
                                  or len(seps) == 2 and index < seps[1]):
                    tag = tagged_sentence[j].pos_
                    if tag in tags:
                        if tag in batch_pos_index[i]:
                            batch_pos_index[i][tag].append(index)
                        else:
                            batch_pos_index[i][tag] = [index]

        with torch.no_grad():
            outputs = model(**inputs)

        # Code for Analysis
        if self.args.plot_fig == 7:
            # Get related attention scores from the last layer
            attention_matrix = outputs[2][11][:, :, 0, :]
            # Loop over the batch
            for batch in range(attention_matrix.shape[0]):
                for head in range(12):
                    for tag in tags:
                        if tag == '[CLS]':
                            ling_feature_attention[tag][
                                head] += attention_matrix[batch, head, 0]
                            ling_feature_count[tag] += 1
                        elif tag in batch_pos_index[batch]:
                            ling_feature_count[tag] += 1
                            ling_feature_attention[tag][head] += torch.max(
                                torch.index_select(
                                    attention_matrix[batch, head, :],
                                    dim=-1,
                                    index=torch.as_tensor(
                                        batch_pos_index[batch][tag])))

        if self.args.plot_fig != 7:
            for layer in range(12):  # Loop over the layer
                if self.args.plot_fig == 6:
                    layer_attention = outputs[2][layer]
                    local_max = torch.max(layer_attention)
                    local_min = torch.min(layer_attention)
                    max_attention = (local_max if local_max > max_attention
                                     else max_attention)
                    min_attention = (local_min if local_min < min_attention
                                     else min_attention)

                    # Mean over dim -2, computed once per layer.
                    mean_attention = torch.mean(layer_attention, dim=-2)
                    # 0 for [CLS]
                    avg_attentions_CLS[layer, :] += torch.mean(
                        mean_attention[:, :, 0], dim=-2)

                    batch_size = layer_attention.shape[0]
                    num_heads = layer_attention.shape[1]
                    # [8, 12] in our case
                    attention_SEP_over_batches = torch.zeros(
                        [batch_size, num_heads])
                    attention_SEP1_over_batches = torch.zeros(
                        [batch_size, num_heads])
                    attention_SEP2_over_batches = torch.zeros(
                        [batch_size, num_heads])
                    for j in range(batch_size):
                        if single_sep:
                            # One [SEP]
                            attention_SEP_over_batches[
                                j, :] += mean_attention[j, :,
                                                        sep_indices[1][j]]
                        else:
                            # Two [SEP]
                            first_sep = mean_attention[j, :,
                                                       sep_indices[1][j * 2]]
                            second_sep = mean_attention[
                                j, :, sep_indices[1][j * 2 + 1]]
                            attention_SEP1_over_batches[j, :] += first_sep
                            attention_SEP2_over_batches[j, :] += second_sep
                            attention_SEP_over_batches[j, :] += torch.max(
                                first_sep, second_sep)

                    avg_attentions_SEP[layer, :] += torch.mean(
                        attention_SEP_over_batches, dim=-2)
                    avg_attentions_SEP1[layer, :] += torch.mean(
                        attention_SEP1_over_batches, dim=-2)
                    # The second-[SEP] statistics must come from the SEP2
                    # buffer (previously SEP1 was accumulated twice).
                    avg_attentions_SEP2[layer, :] += torch.mean(
                        attention_SEP2_over_batches, dim=-2)
                elif self.args.plot_fig == 2:
                    # Save the attention map into dir
                    for head in range(12):
                        for batch in range(outputs[2][layer].shape[0]):
                            if single_sep:
                                # One [SEP]
                                end = sep_indices[1][batch]
                            else:
                                end = sep_indices[1][batch * 2 + 1]
                            attention_map = outputs[2][layer][batch,
                                                              head, :end, :end]
                            np.save(
                                '/mnt/d/glue_results/attentions/' +
                                self.args.attention_type + '/' +
                                self.args.dataset_name + '/' + str(layer) +
                                '_' + str(head) + '_' + str(batch) + '.npy',
                                attention_map.numpy())

        if has_labels:
            step_eval_loss, logits = outputs[:2]
            eval_losses += [step_eval_loss.mean().item()]
        else:
            logits = outputs[0]

        if not prediction_loss_only:
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds,
                                  logits.detach().cpu().numpy(),
                                  axis=0)
            if inputs.get("labels") is not None:
                if label_ids is None:
                    label_ids = inputs["labels"].detach().cpu().numpy()
                else:
                    label_ids = np.append(
                        label_ids,
                        inputs["labels"].detach().cpu().numpy(),
                        axis=0)

    # Report the length statistics instead of stopping in the debugger.
    if examples > 0:
        logger.info("  Average sequence length = %.2f",
                    float(number_of_nonzeros) / examples)
        logger.info("  Fraction of sequences longer than 64 = %.4f",
                    number_of_larger_than_64 / examples)

    results_dir = ('/mnt/c/Users/kaise/Desktop/researchData/' +
                   self.args.dataset_name)

    if self.args.plot_fig == 7:
        for tag in tags:
            ling_feature_attention[tag] /= ling_feature_count[tag]
            torch.save(
                ling_feature_attention[tag],
                results_dir + '/ling_feature_attentions/' +
                self.args.attention_type + '_' + tag + '.pt')

    if self.args.plot_fig == 6:
        ### Plotting for fig 6
        if self.args.attention_type == 'effective':
            torch.save(min_attention, results_dir + '/min_attention.pt')
            torch.save(max_attention, results_dir + '/max_attention.pt')

        # Averaged over number of data and batch
        avg_attentions_CLS /= count
        avg_attentions_SEP /= count
        avg_attentions_SEP1 /= count
        avg_attentions_SEP2 /= count
        torch.save(
            avg_attentions_CLS,
            results_dir + '/' + self.args.attention_type + '_CLS_' +
            self.args.dataset_name + '.pt')
        # NOTE(review): 'SEP_' has no leading underscore here, unlike the
        # '_CLS_' file above; kept as-is so existing files are still found.
        torch.save(
            avg_attentions_SEP,
            results_dir + '/' + self.args.attention_type + 'SEP_' +
            self.args.dataset_name + '.pt')

        # One heat map (layer x head) per accumulator.
        for matrix, suffix in ((avg_attentions_CLS, '_CLS_'),
                               (avg_attentions_SEP, '_SEP_'),
                               (avg_attentions_SEP1, '_SEP1_'),
                               (avg_attentions_SEP2, '_SEP2_')):
            plt.figure()
            plt.imshow(matrix.numpy(), vmin=0, vmax=1.0, cmap='Greens')
            plt.title(self.args.dataset_name)
            plt.ylabel("Layer")
            plt.xlabel("Head")
            plt.colorbar()
            plt.savefig(results_dir + '/' + self.args.attention_type +
                        suffix + self.args.dataset_name + '.png')

    if (self.compute_metrics is not None and preds is not None
            and label_ids is not None):
        metrics = self.compute_metrics(
            EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["loss"] = np.mean(eval_losses)

    return PredictionOutput(predictions=preds,
                            label_ids=label_ids,
                            metrics=metrics)
def quantize_model(model, bitwidth=8, layerwise_bitwidth=None, retrain=True, ref_model=None, flags=None, adaround=False, lr=0.00000001):
    """Simulate quantization of ``model``, optionally AdaRound and retrain it,
    and report the zlib-compressed size of the quantized Linear parameters.

    NOTE(review): ``dataloader``, ``coord_dataset`` and ``image_resolution``
    are not parameters; presumably they are module-level globals - confirm.

    Args:
        model: Model to quantize.
        bitwidth: Default parameter bit width given to the quantization sim.
        layerwise_bitwidth: Optional sequence of bit widths, indexed by the
            running counter over wrapped modules; overrides the weight and
            bias bit width of each wrapper.
        retrain: When true, ``retrain_model`` is run on the simulated model.
        ref_model: Optional reference model; when given, every parameter of
            the simulated model is replaced by its difference to the
            same-named reference parameter before export.
        flags: Optional mapping; only ``flags['l1_reg']`` is read.
        adaround: When true, the AdaRound steps are run and the sim is rebuilt
            from the adarounded model with frozen parameter encodings.
        lr: Learning rate forwarded to ``retrain_model``.

    Returns:
        ``(quantized_model, res, compressed_size, state_dict)`` where ``res``
        is the last ``check_metrics`` result, ``compressed_size`` is the
        length in bytes of the zlib-compressed parameters and ``state_dict``
        maps ``<module name>.weight`` / ``.bias`` to quantize-dequantized
        CPU tensors.
    """
    # Baseline metrics of the unquantized model.
    res = check_metrics(dataloader, model, image_resolution)
    print(res)
    input_shape = coord_dataset.mgrid.shape
    # Random dummy input in [-1, 1) with a leading batch dim, on the GPU.
    dummy_in = ((torch.rand(input_shape).unsqueeze(0) * 2) - 1).cuda()
    aimet_dataloader = DataLoader(AimetDataset(coord_dataset), shuffle=True, batch_size=1, pin_memory=True, num_workers=0)
    # Create QuantSim using adarounded_model
    sim = QuantizationSimModel(model, default_param_bw=bitwidth, default_output_bw=31, dummy_input=dummy_in)
    # Wrapped module types that are excluded from quantization.
    modules_to_exclude = ( Sine, ImageDownsampling, PosEncodingNeRF, FourierFeatureEncodingPositional, FourierFeatureEncodingGaussian)
    excl_layers = []
    for mod in sim.model.modules():
        if isinstance(mod, QcPostTrainingWrapper) and isinstance(mod._module_to_wrap, modules_to_exclude):
            excl_layers.append(mod)
    sim.exclude_layers_from_quantization(excl_layers)
    # Configure every remaining wrapper: activations off, symmetric
    # parameter encodings, bias quantizer on only for non-zero biases.
    i = 0
    for name, mod in sim.model.named_modules():
        if isinstance(mod, QcPostTrainingWrapper):
            mod.output_quantizer.enabled = False
            mod.input_quantizer.enabled = False
            weight_quantizer = mod.param_quantizers['weight']
            bias_quantizer = mod.param_quantizers['bias']
            weight_quantizer.use_symmetric_encodings = True
            bias_quantizer.use_symmetric_encodings = True
            # NOTE(review): assumes the wrapped module has a bias tensor.
            if torch.count_nonzero(mod._module_to_wrap.bias.data):
                mod.param_quantizers['bias'].enabled = True
            if layerwise_bitwidth:
                mod.param_quantizers['bias'].bitwidth = layerwise_bitwidth[i]
                mod.param_quantizers['weight'].bitwidth = layerwise_bitwidth[i]
            i += 1
    res = check_metrics(dataloader, sim.model, image_resolution)
    print(res)
    if adaround:
        params = AdaroundParameters(data_loader=aimet_dataloader, num_batches=1, default_num_iterations=500, default_reg_param=0.001, default_beta_range=(20, 2))
        # The public Adaround.apply_adaround(...) call is disabled; its
        # individual steps are invoked by hand below.
        # Compute only param encodings
        Adaround._compute_param_encodings(sim)
        # Get the module - activation function pair using ConnectedGraph
        module_act_func_pair = connectedgraph_utils.get_module_act_func_pair(model, dummy_in)
        Adaround._adaround_model(model, sim, module_act_func_pair, params, dummy_in)
        # Update every module (AdaroundSupportedModules) weight with Adarounded weight (Soft rounding)
        Adaround._update_modules_with_adarounded_weights(sim)
        # Empty export path; consumed by the encodings export further down.
        path=''
    # (Disabled experiments - cross-layer equalization, bias correction,
    # checkpoint saving, an early retrain pass and a hard-rounding export -
    # were commented out at this point.)
    quantized_model = sim.model
    if adaround:
        filename_prefix = 'adaround'
        # Export quantization encodings to JSON-formatted file
        Adaround._export_encodings_to_json(path, filename_prefix, sim)
        SaveUtils.remove_quantization_wrappers(sim.model)
        adarounded_model = sim.model
        # Rebuild the sim on top of the adarounded model.
        sim = QuantizationSimModel(adarounded_model, default_param_bw=bitwidth, default_output_bw=31, dummy_input=dummy_in)
        # NOTE(review): excl_layers is not reset, so it still contains the
        # wrappers collected from the previous sim - confirm this is intended.
        for mod in sim.model.modules():
            if isinstance(mod, QcPostTrainingWrapper) and isinstance(mod._module_to_wrap, modules_to_exclude):
                excl_layers.append(mod)
        sim.exclude_layers_from_quantization(excl_layers)
        # Same per-wrapper configuration as for the first sim.
        i = 0
        for name, mod in sim.model.named_modules():
            if isinstance(mod, QcPostTrainingWrapper):
                mod.output_quantizer.enabled = False
                mod.input_quantizer.enabled = False
                weight_quantizer = mod.param_quantizers['weight']
                bias_quantizer = mod.param_quantizers['bias']
                weight_quantizer.use_symmetric_encodings = True
                bias_quantizer.use_symmetric_encodings = True
                if torch.count_nonzero(mod._module_to_wrap.bias.data):
                    mod.param_quantizers['bias'].enabled = True
                if layerwise_bitwidth:
                    mod.param_quantizers['bias'].bitwidth = layerwise_bitwidth[i]
                    mod.param_quantizers['weight'].bitwidth = layerwise_bitwidth[i]
                i += 1
        # Load the encodings file written by the export above and freeze it.
        sim.set_and_freeze_param_encodings(encoding_path='adaround.encodings')
    # (sim.compute_encodings(...) is disabled here.)
    res = check_metrics(dataloader, sim.model, image_resolution)
    print(res)
    if retrain:
        # image_mse with its first argument fixed to None.
        loss_fn = partial(loss_functions.image_mse, None)
        # Fine-tune the model's parameter using training
        quantized_model = retrain_model(sim.model, dataloader, 1000, loss_fn, lr, flags['l1_reg'] if flags is not None else 0)
        res = check_metrics(dataloader, quantized_model, image_resolution)
        print('After retraining ',res)
    #Compute the difference for each parameter
    if ref_model is not None:
        new_state_dict=sim.model.state_dict()
        # Pair reference and simulated parameters by name, ignoring the
        # '._module_to_wrap' infix the wrappers add: [ref name, sim name,
        # ref param, sim param].
        lis = [[i, j, a, b] for i, a in ref_model.named_parameters() for j, b in sim.model.named_parameters() if i == j.replace('._module_to_wrap','')]
        for module in lis:
            new_state_dict[module[1]] = module[3] - module[2]
        sim.model.load_state_dict(new_state_dict)
    # Collect the quantized parameters of every wrapped Linear layer.
    quantized_dict = {}
    state_dict = {}
    for name, module in sim.model.named_modules():
        if isinstance(module, QcPostTrainingWrapper) and isinstance(module._module_to_wrap, torch.nn.Linear):
            weight_quantizer = module.param_quantizers['weight']
            bias_quantizer = module.param_quantizers['bias']
            weight_quantizer.enabled = True
            bias_quantizer.enabled = True
            wrapped_linear = module._module_to_wrap
            weight = wrapped_linear.weight
            bias = wrapped_linear.bias
            # Only a warning: weights outside the encoding range are still
            # processed.
            if not (torch.all(weight < weight_quantizer.encoding.max) and torch.all(weight > weight_quantizer.encoding.min)):
                print("not within bounds")
            state_dict[name + '.weight'] = weight_quantizer.quantize_dequantize(weight,weight_quantizer.round_mode).cpu().detach()
            state_dict[name + '.bias'] = bias_quantizer.quantize_dequantize(bias, bias_quantizer.round_mode).cpu().detach()
            quantized_weight = weight_quantizer.quantize(weight, weight_quantizer.round_mode).cpu().detach().numpy() + weight_quantizer.encoding.offset
            quantized_bias = bias_quantizer.quantize(bias, bias_quantizer.round_mode).cpu().detach().numpy() + bias_quantizer.encoding.offset
            # NOTE(review): the offset was already added to quantized_weight
            # above and weights_csc is never read afterwards - likely leftover.
            weights_csc = scipy.sparse.csc_matrix(quantized_weight + weight_quantizer.encoding.offset)
            quantized_dict[name] = {'weight': {'data': quantized_weight, 'encoding': weight_quantizer.encoding}, 'bias': {'data': quantized_bias, 'encoding': bias_quantizer.encoding}}
    # Cast every tensor to the smallest signed integer type that fits its
    # bit width and flatten it.
    weights_np = []
    for l in quantized_dict.values():
        w = l['weight']['data']
        b = l['bias']['data']
        Q = l['weight']['encoding'].bw
        if Q < 9:
            tpe = 'int8'
        elif Q < 17:
            tpe = 'int16'
        else:
            tpe = 'int32'
        w = w.astype(tpe).flatten()
        weights_np.append(w)
        # The bias is only included when it has an encoding.
        if l['bias']['encoding']:
            Q = l['bias']['encoding'].bw
            if Q < 9:
                tpe = 'int8'
            elif Q < 17:
                tpe = 'int16'
            else:
                tpe = 'int32'
            b = b.astype(tpe).flatten()
            weights_np.append(b)
    # NOTE(review): concatenating arrays of different integer widths promotes
    # them to a common dtype, which may inflate the size estimate - confirm.
    weights_np = np.concatenate(weights_np)
    comp = zlib.compress(weights_np, level=9)
    print(len(comp))
    # (sim.export(...) is disabled here.)
    print(res)
    return quantized_model, res, len(comp), state_dict
# --- Random sampling -------------------------------------------------------
# Draw binary random numbers (0 or 1) from a Bernoulli distribution.
print(torch.bernoulli(a))

# Create a tensor of weights and draw 2 indices from it.
weights = torch.tensor([0, 10, 3, 0], dtype=torch.float)
print(torch.multinomial(weights, 2))

# --- Reductions over a random vector ---------------------------------------
x = torch.randn(3)
print(x)
for reduction in (torch.mean, torch.sum, torch.median):
    print(reduction(x))
# print(torch.nanmedian(x))  # ignoring NaN values
for reduction in (torch.min, torch.max, torch.mode, torch.std, torch.var):
    print(reduction(x))
print(torch.quantile(x, 0.1))
print(x.nanquantile(0.1))
print(torch.nansum(x))  # treating NaN as zero

# --- Counting --------------------------------------------------------------
print(torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)))
# Count the frequency of each value in an array of non-negative ints.
print(torch.bincount(torch.randint(0, 8, (5, ), dtype=torch.int64)))
print(torch.histc(torch.tensor([1., 2, 1]), bins=4, min=0, max=3))

# Set the entries selected by a random mask to 1, then count them.
x = torch.zeros(3, 3)
x[torch.randn(3, 3) > 0.5] = 1
print(torch.count_nonzero(x))
tokens_tensors = tokens_tensors.to(device) segments_tensors = segments_tensors.to(device) masks_tensors = masks_tensors.to(device) labels = labels.to(device) # 將參數梯度歸零 optimizer.zero_grad() # forward pass outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors, attention_mask=masks_tensors, labels=labels) loss = outputs.loss acc += torch.count_nonzero(outputs.logits.argmax( axis=1) == labels).item() / labels.shape[0] c += 1 # backward loss.backward() optimizer.step() # 紀錄當前 batch loss running_loss += loss.item() # break # break CHECKPOINT_NAME = f"./model/{args.model_name}_{args.LM.replace('-', '_')}_E_{str(epoch+1)}.pt" torch.save(model.state_dict(), CHECKPOINT_NAME) timestamp(f"[epoch {epoch+1}] loss: {running_loss:.3f} acc: {acc/c}")