def __init__(
        self,
        file_name,
        sequence_len: int,
        hop: int,
        sr: int = 44100,
        fft_size: int = 4096,
        fft_hop: int = 441,
        n_freq_bins: int = 256,
        freq_compression: str = "linear",
        f_min: int = 200,
        f_max: int = 18000,
        cache_dir=None,  # added
):
    self.sequence_len = sequence_len
    self.hop = hop
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    self.n_frames = self.audio.shape[1]
    self.t = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=False),
    ]
    if freq_compression == "linear":
        self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
    elif freq_compression == "mel":
        self.t.append(T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        self.t.append(T.Compose(t_mel, T.M2MFCC()))
    else:
        # raising a bare string is invalid in Python 3; raise an exception
        raise ValueError("Undefined frequency compression")
    self.t.append(T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
    self.t.append(
        T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        ))
    # self.file_reader = AsyncFileReader()
    self.t = T.CachedSpectrogram(
        cache_dir=cache_dir,
        spec_transform=T.Compose(self.t),
        n_fft=fft_size,
        hop_length=fft_hop,
        # file_reader=AsyncFileReader()
    )
def _create_transform(self):
    print('Set Augmentation...')
    self.transforms = transforms.Compose([
        transforms.RandomCrop([self.opt.crop_height, self.opt.crop_width]),
        transforms.RandomVerticalFlip(),
        transforms.RandomHorizontalFlip()
    ])
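A minimal usage sketch for the augmentation above, assuming torchvision-style transforms operating on PIL images; `opt` here is a hypothetical stand-in for the real options object that `_create_transform` reads.

# Hypothetical usage (assumes torchvision transforms; `opt` is a stand-in).
from types import SimpleNamespace
from PIL import Image
from torchvision import transforms

opt = SimpleNamespace(crop_height=224, crop_width=224)
aug = transforms.Compose([
    transforms.RandomCrop([opt.crop_height, opt.crop_width]),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
])
img = Image.new('RGB', (321, 321))   # dummy input image
out = aug(img)                       # randomly cropped and flipped copy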
def __init__(
        self,
        file_name,
        sequence_len: int,
        hop: int,
        sr: int = 44100,
        fft_size: int = 4096,
        fft_hop: int = 441,
        n_freq_bins: int = 256,
        freq_compression: str = "linear",
        f_min: int = 200,
        f_max: int = 18000,
        center=True,
):
    self.sp = signal.signal_proc()
    self.hop = hop
    self.center = center
    self.filename = file_name
    self.sequence_len = sequence_len
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    self.n_frames = self.audio.shape[1]
    spec_t = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=self.center),
    ]
    self.spec_transforms = T.Compose(spec_t)
    if freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        self.t_compr_f = T.Compose(t_mel, T.M2MFCC())
    else:
        raise ValueError("Undefined frequency compression")
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
def evaluate(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    print('The image is {:}'.format(args.image))
    print('The model is {:}'.format(args.model))
    snapshot = Path(args.model)
    assert snapshot.exists(), 'The model path {:} does not exist'.format(snapshot)
    facebox = face_detect(args.image, args.face_detector)
    print('The face bounding box is {:}'.format(facebox))
    assert len(facebox) == 4, 'Invalid face input : {:}'.format(facebox)
    snapshot = torch.load(str(snapshot))
    # General data augmentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    param = snapshot['args']
    eval_transform = transforms.Compose(
        [transforms.PreCrop(param.pre_crop_expand),
         transforms.TrainScale2WH((param.crop_width, param.crop_height)),
         transforms.ToTensor(), normalize])
    model_config = load_configure(param.model_config, None)
    dataset = GeneralDataset(eval_transform, param.sigma, model_config.downsample,
                             param.heatmap_type, param.data_indicator)
    dataset.reset(param.num_pts)
    net = obtain_model(model_config, param.num_pts + 1)
    net = net.cuda()
    weights = remove_module_dict(snapshot['state_dict'])
    net.load_state_dict(weights)
    print('Prepare input data')
    [image, _, _, _, _, _, cropped_size], meta = dataset.prepare_input(args.image, facebox)
    inputs = image.unsqueeze(0).cuda()
    # network forward
    with torch.no_grad():
        batch_heatmaps, batch_locs, batch_scos = net(inputs)
    # obtain the locations on the image in the original size
    cpu = torch.device('cpu')
    np_batch_locs = batch_locs.to(cpu).numpy()
    np_batch_scos = batch_scos.to(cpu).numpy()
    cropped_size = cropped_size.numpy()
    locations = np_batch_locs[0, :-1, :]
    scores = np.expand_dims(np_batch_scos[0, :-1], -1)
    scale_h = cropped_size[0] * 1. / inputs.size(-2)
    scale_w = cropped_size[1] * 1. / inputs.size(-1)
    locations[:, 0] = locations[:, 0] * scale_w + cropped_size[2]
    locations[:, 1] = locations[:, 1] * scale_h + cropped_size[3]
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)
    print('the coordinates for {:} facial landmarks:'.format(param.num_pts))
    for i in range(param.num_pts):
        point = prediction[:, i]
        print('the {:02d}/{:02d}-th point : ({:.1f}, {:.1f}), score = {:.2f}'.format(
            i + 1, param.num_pts, float(point[0]), float(point[1]), float(point[2])))
    image = draw_image_by_points(args.image, prediction, 2, (255, 0, 0), facebox, None, None)
    image.show()
    image.save(args.image.split('.')[0] + '_result.jpg')
def build_transforms(config):
    transform_train = T.Compose([
        T.RandomCroping(config.DATA.HEIGHT, config.DATA.WIDTH, p=config.AUG.RC_PROB),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        T.RandomErasing(probability=config.AUG.RE_PROB)
    ])
    transform_test = T.Compose([
        T.Resize((config.DATA.HEIGHT, config.DATA.WIDTH)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return transform_train, transform_test
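A hedged usage sketch for build_transforms; the config object below is a hand-rolled stand-in that mirrors only the fields the function reads (DATA.HEIGHT/WIDTH, AUG.RC_PROB/RE_PROB), not the project's real config schema.

# Hypothetical config covering only the fields build_transforms reads.
from types import SimpleNamespace

config = SimpleNamespace(
    DATA=SimpleNamespace(HEIGHT=256, WIDTH=128),
    AUG=SimpleNamespace(RC_PROB=0.5, RE_PROB=0.5),
)
transform_train, transform_test = build_transforms(config)
# transform_train is applied per sample during training,
# transform_test during evaluation.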
def Generate_transform_Dict(origin_width=256, width=227, ratio=0.16, rot=0, args=None):
    std_value = 1.0 / 255.0
    if (args is not None) and ("ResNet" in args.net):
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        cc = []
    else:
        normalize = transforms.Normalize(
            mean=[104 / 255.0, 117 / 255.0, 128 / 255.0],
            std=[1.0 / 255, 1.0 / 255, 1.0 / 255])
        print("bgr init")
        cc = [transforms.CovertBGR()]
    transform_dict = {}
    transform_dict['rand-crop'] = transforms.Compose(cc + [
        transforms.Resize((origin_width)),
        transforms.RandomResizedCrop(scale=(ratio, 1), size=width),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_dict['center-crop'] = transforms.Compose(cc + [
        transforms.Resize((origin_width)),
        transforms.CenterCrop(width),
        transforms.ToTensor(),
        normalize,
    ])
    transform_dict['resize'] = transforms.Compose(cc + [
        transforms.Resize((width)),
        transforms.ToTensor(),
        normalize,
    ])
    return transform_dict
def __init__(self,
             file_name,
             sequence_len: int,
             hop: int,
             sr: int = 44100,
             fft_size: int = 4096,
             fft_hop: int = 441,
             n_freq_bins: int = 256,
             freq_compression: str = "linear",
             f_min: int = 200,
             f_max: int = 18000):
    self.sequence_len = sequence_len
    self.hop = hop
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    # total number of samples in the audio (transposed mono)
    self.n_frames = self.audio.shape[1]
    self.t = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=False),
    ]
    if freq_compression == "linear":
        self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
    elif freq_compression == "mel":
        self.t.append(T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        self.t.append(T.Compose(t_mel, T.M2MFCC()))
    else:
        raise ValueError("Undefined frequency compression")
    self.t.append(T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
    self.t.append(
        T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        ))
    self.t = T.Compose(self.t)
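The accessor for this strided dataset is not shown above. A plausible sketch, under the assumption that each index cuts one window of `sequence_len` samples advanced by `hop` samples and runs it through the pipeline built in `__init__`:

# Hypothetical accessors for the strided dataset above (assumptions, not
# the project's actual implementation).
from math import ceil

def __len__(self):
    return max(int(ceil((self.n_frames + 1 - self.sequence_len) / self.hop)), 1)

def __getitem__(self, idx):
    start = idx * self.hop
    end = min(start + self.sequence_len, self.n_frames)
    audio = self.audio[:, start:end]
    # composed pipeline: pre-emphasis -> STFT -> frequency compression
    # -> dB conversion -> normalization
    return self.t(audio)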
def get_transform(train, dataset_name):
    base_size = cfg.DATA_TRANSFORM.LOADSIZE
    crop_size = cfg.DATA_TRANSFORM.CROPSIZE
    ignore_label = cfg.DATASET.IGNORE_LABEL
    if dataset_name == cfg.DATASET.SOURCE:
        input_size = cfg.DATA_TRANSFORM.INPUT_SIZE_S
    else:
        input_size = cfg.DATA_TRANSFORM.INPUT_SIZE_T
    min_size = int((1.0 if train else 1.0) * base_size)
    max_size = int((1.3 if train else 1.0) * base_size)
    transforms = []
    if cfg.DATA_TRANSFORM.RANDOM_RESIZE_AND_CROP:
        if train:
            transforms.append(T.RandomResize(min_size, max_size))
            transforms.append(T.RandomHorizontalFlip(0.5))
            transforms.append(T.RandomCrop(crop_size, ignore_label=ignore_label))
        else:
            transforms.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))
    else:
        if train:
            transforms.append(T.Resize(input_size))
            transforms.append(T.RandomHorizontalFlip(0.5))
        else:
            transforms.append(T.Resize(input_size, True))
    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    transforms.append(T.LabelRemap(mapping[dataset_name]))
    transforms.append(T.ToTensor(cfg.DATASET.IMG_MODE))
    if cfg.DATASET.IMG_MODE == "BGR":
        mean = (104.00698793, 116.66876762, 122.67891434)
        std = (1.0, 1.0, 1.0)
    else:
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
    transforms.append(T.Normalize(mean, std))
    return T.Compose(transforms)
def get_transform(dataset_name):
    base_size = cfg.DATA_TRANSFORM.LOADSIZE
    ignore_label = cfg.DATASET.IGNORE_LABEL
    min_size = base_size
    max_size = base_size
    transforms = []
    transforms.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))
    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    transforms.append(T.LabelRemap(mapping[dataset_name]))
    transforms.append(T.ToTensor(cfg.DATASET.IMG_MODE))
    if cfg.DATASET.IMG_MODE == "BGR":
        mean = (104.00698793, 116.66876762, 122.67891434)
        std = (1.0, 1.0, 1.0)
    else:
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
    transforms.append(T.Normalize(mean, std))
    return T.Compose(transforms)
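A hedged usage sketch: since the pipeline includes LabelRemap, the project's T ops presumably operate on (image, label) pairs rather than images alone. Under that assumption:

# Hypothetical call for the target split; `image` and `label` would come
# from the dataset (assumed pair-wise transform API).
transform = get_transform(cfg.DATASET.TARGET)
image_t, label_t = transform(image, label)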
def __init__(
        self,
        # exp params
        exp_name="u50_block",
        # arch params
        backbone="resnet50",
        backbone_kwargs={},
        dim_embedding=256,
        feature_spatial_scale=0.25,
        max_junctions=512,
        junction_pooling_threshold=0.2,
        junc_pooling_size=15,
        attention_sigma=1.,
        junction_heatmap_criterion="binary_cross_entropy",
        block_inference_size=64,
        adjacency_matrix_criterion="binary_cross_entropy",
        # data params
        data_root=r"/home/ziheng/indoorDist_new2",
        img_size=416,
        junc_sigma=3.,
        batch_size=2,
        # train params
        gpus=[0, ],
        num_workers=5,
        resume_epoch="latest",
        is_train_junc=True,
        is_train_adj=True,
        # vis params
        vis_junc_th=0.3,
        vis_line_th=0.3):
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(c) for c in gpus)
    self.is_cuda = bool(gpus)
    self.model = LSDModule(
        backbone=backbone,
        dim_embedding=dim_embedding,
        backbone_kwargs=backbone_kwargs,
        junction_pooling_threshold=junction_pooling_threshold,
        max_junctions=max_junctions,
        feature_spatial_scale=feature_spatial_scale,
        junction_heatmap_criterion=junction_heatmap_criterion,
        junction_pooling_size=junc_pooling_size,
        attention_sigma=attention_sigma,
        block_inference_size=block_inference_size,
        adjacency_matrix_criterion=adjacency_matrix_criterion,
        weight_fn=weight_fn,
        is_train_adj=is_train_adj,
        is_train_junc=is_train_junc)
    self.exp_name = exp_name
    os.makedirs(os.path.join("log", exp_name), exist_ok=True)
    os.makedirs(os.path.join("ckpt", exp_name), exist_ok=True)
    self.writer = SummaryWriter(log_dir=os.path.join("log", exp_name))
    # checkpoints
    self.states = dict(last_epoch=-1, elapsed_time=0, state_dict=None)
    if resume_epoch and os.path.isfile(
            os.path.join("ckpt", exp_name, f"train_states_{resume_epoch}.pth")):
        states = torch.load(
            os.path.join("ckpt", exp_name, f"train_states_{resume_epoch}.pth"))
        print(f"resume training from epoch {states['last_epoch']}")
        self.model.load_state_dict(states["state_dict"])
        self.states.update(states)
    self.train_data = SISTLine(
        data_root=data_root,
        transforms=tf.Compose(
            tf.Resize((img_size, img_size)),
            tf.RandomHorizontalFlip(),
            tf.RandomColorAug()),
        phase="train",
        sigma_junction=junc_sigma,
        max_junctions=max_junctions)
    self.train_loader = DataLoader(
        self.train_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True)
    self.eval_data = SISTLine(
        data_root=data_root,
        transforms=tf.Compose(tf.Resize((img_size, img_size))),
        phase="val",
        sigma_junction=junc_sigma,
        max_junctions=max_junctions)
    self.eval_loader = DataLoader(
        self.eval_data,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True)
    self.vis_junc_th = vis_junc_th
    self.vis_line_th = vis_line_th
    self.block_size = block_inference_size
    self.max_junctions = max_junctions
    self.is_train_junc = is_train_junc
    self.is_train_adj = is_train_adj
def main():
    anchor_generator = AnchorGenerator(
        sizes=tuple([(16, 24, 32, 48, 96) for _ in range(5)]),
        aspect_ratios=tuple([(0.5, 1.0, 2.0) for _ in range(5)]))
    rpnhead = RPNHead(256, anchor_generator.num_anchors_per_location()[0])
    model = maskrcnn_resnet50_fpn(
        num_classes=2,
        pretrained_backbone=True,
        max_size=MAX_SIZE,
        rpn_head=rpnhead,
        rpn_anchor_generator=anchor_generator,
        rpn_pre_nms_top_n_train=12000,
        rpn_pre_nms_top_n_test=6000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=300,
        rpn_fg_iou_thresh=0.5,
        rpn_bg_iou_thresh=0.3,
        rpn_positive_fraction=0.7,
        bbox_reg_weights=(1.0, 1.0, 1.0, 1.0),
        box_batch_size_per_image=32)
    model.load_state_dict(
        torch.load('saved_models' + os.sep + '0_deeplesion.pth', map_location='cpu'))
    # identical preprocessing pipelines for all three splits
    data_transforms = {
        x: T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ])
        for x in ['train', 'val', 'test']
    }
    image_datasets = {
        x: DeepLesion(DIR_IN + os.sep + x, GT_FN_DICT[x], data_transforms[x])
        for x in ['train', 'val', 'test']
    }
    dataloaders = {
        x: DataLoader(image_datasets[x], batch_size=3, shuffle=True,
                      num_workers=0, collate_fn=BatchCollator)
        for x in ['train', 'val', 'test']
    }
    for batch_id, (inputs, targets) in enumerate(dataloaders['test']):
        outputs = test_model(model, inputs)
        outputs = remove_overlapping(outputs, 0.655)
        for image, target, output in zip(inputs, targets, outputs):
            img_copy = image.squeeze().numpy()
            images = [img_copy] * 3
            images = [im.astype(float) for im in images]
            img_copy = cv2.merge(images)
            # draw ground-truth boxes and masks in green/blue
            for bbox, pseudo_mask in zip(target["boxes"], target["masks"]):
                bbox = bbox.squeeze().numpy()
                bbox = np.int16(bbox)
                mask = pseudo_mask.squeeze().numpy()
                cv2.rectangle(img_copy, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                              (0, 255, 0), 1)
                msk_idx = np.where(mask == 1)
                img_copy[msk_idx[0], msk_idx[1], 0] = 255
            # draw predictions above the score threshold in red
            for predbox, predmask, score in zip(output['boxes'], output['masks'],
                                                output['scores']):
                if score < 0.655:
                    break
                predbox = predbox.numpy()
                predmask = predmask.squeeze().numpy()
                score = score.numpy()
                predmask = np.where(predmask > 0.5, 1, 0)
                cv2.rectangle(img_copy, (predbox[0], predbox[1]),
                              (predbox[2], predbox[3]), (0, 0, 255), 1)
                pmsk_idx = np.where(predmask == 1)
                img_copy[pmsk_idx[0], pmsk_idx[1], 2] = 255
                cv2.putText(img_copy, str(score),
                            (int(predbox[0]), int(predbox[1] - 5)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1,
                            cv2.LINE_AA)
            # cv2.imshow(str(target['image_id']), img_copy)
            cv2.imwrite(
                'simple_test' + os.sep +
                str(target['image_id']).replace(os.sep, '_') + '_pred.jpg',
                img_copy * 255)
def __init__(self,
             file_names: Iterable[str],
             working_dir=None,
             cache_dir=None,
             sr=44100,
             n_fft=4096,
             hop_length=441,
             freq_compression="linear",
             n_freq_bins=256,
             f_min=0,
             f_max=18000,
             seq_len=128,
             augmentation=False,
             noise_files=[],
             min_max_normalize=False,
             *args,
             **kwargs):
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.f_min = f_min
    self.f_max = f_max
    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if freq_compression not in valid_freq_compressions:
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                freq_compression, valid_freq_compressions))
    self.freq_compression = freq_compression
    self.possible_call_labels = re.compile("|".join(["call"]))
    self.possible_nocall_labels = re.compile("|".join(["noise"]))
    self._logger.debug("Number of files : {}".format(len(self.file_names)))
    _n_calls = 0
    for f in self.file_names:
        if self.is_call(f):
            _n_calls += 1
    self._logger.debug("Number of calls: {}".format(_n_calls))
    self._logger.debug(
        "Number of noise: {}".format(len(self.file_names) - _n_calls))
    self.augmentation = augmentation
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    self.file_reader = AsyncFileReader()
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )
    if augmentation:
        self._logger.debug("Init augmentation transforms for time and pitch shift")
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        self._logger.debug("Running without augmentation")
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
    else:
        raise ValueError("Undefined frequency compression")
    if augmentation:
        if noise_files:
            self._logger.debug(
                "Init augmentation transform for random noise addition")
            self.t_addnoise = T.RandomAddNoise(
                noise_files,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                return_original=True)
        else:
            self.t_addnoise = None
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    if min_max_normalize:
        self.t_norm = T.MinMaxNormalize()
        self._logger.debug("Init min-max-normalization activated")
    else:
        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
        self._logger.debug("Init 0/1-dB-normalization activated")
    self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
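The cache_dir toggle above follows a simple compute-or-load pattern. A minimal, self-contained sketch of the same idea (illustrative only, not the project's T.CachedSpectrogram, which also stores STFT metadata and uses an async file reader):

# Minimal compute-or-load cache in the spirit of T.CachedSpectrogram.
import hashlib
import os
import numpy as np

def cached_transform(fn, transform, cache_dir=None):
    if cache_dir is None:
        return transform(fn)                       # no caching requested
    key = hashlib.md5(fn.encode()).hexdigest()     # stable per-file key
    path = os.path.join(cache_dir, key + '.npy')
    if os.path.exists(path):
        return np.load(path)                       # cache hit
    spec = transform(fn)                           # cache miss: compute
    os.makedirs(cache_dir, exist_ok=True)
    np.save(path, spec)                            # and store the result
    return spec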
def main():
    logging.basicConfig(filename='logs' + os.sep + 'example.log', level=logging.DEBUG)
    # identical preprocessing pipelines for all three splits
    data_transforms = {
        x: T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ])
        for x in ['train', 'val', 'test']
    }
    logging.info('Loading data sets')
    image_datasets = {
        x: DeepLesion(DIR_IN + os.sep + x, GT_FN_DICT[x], data_transforms[x])
        for x in ['train', 'val', 'test']
    }
    logging.info('data sets loaded')
    logging.info('Loading data loaders')
    dl_dataloaders = {
        x: DataLoader(image_datasets[x], batch_size=3, shuffle=True,
                      num_workers=0, collate_fn=BatchCollator)
        for x in ['train', 'val', 'test']
    }
    logging.info('data loaders loaded\n')
    dl_dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
    # (ground-truth visualization, kept for reference)
    # for batch_id, (inputs, targets) in enumerate(dl_dataloaders['train']):
    #     for i, (image, target) in enumerate(zip(inputs, targets)):
    #         img_copy = image.squeeze().numpy()
    #         images = [im.astype(float) for im in [img_copy] * 3]
    #         img_copy = cv2.merge(images)
    #         for j, (bbox, pseudo_mask) in enumerate(zip(target["boxes"], target["masks"])):
    #             bbox = np.int16(target["boxes"][j].squeeze().numpy())
    #             mask = target["masks"][j].squeeze().numpy()
    #             cv2.rectangle(img_copy, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 1)
    #             msk_idx = np.where(mask == 1)
    #             img_copy[msk_idx[0], msk_idx[1], 0] = 255
    #         cv2.imshow(str(batch_id) + " " + str(i), img_copy)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()
    dl_model = get_model(False, True, 2)
    params = [p for p in dl_model.parameters() if p.requires_grad]
    # Observe that not all parameters are being optimized
    optimizer_ft = SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0001)
    # optimizer_ft = Adam(params, lr=0.001)
    # Decay LR by a factor of 0.1 every 7 epochs
    # exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=4, gamma=0.1)
    # exp_lr_scheduler = lr_scheduler.CosineAnnealingLR(optimizer_ft, T_max=100)
    num_epochs = 10
    since = time.time()
    # best_model_wts = copy.deepcopy(dl_model.state_dict())
    # best_llf = 0
    # best_nlf = 999
    logging.info('momentum:' +
                 str(optimizer_ft.state_dict()['param_groups'][0]['momentum']))
    logging.info('weight_decay:' +
                 str(optimizer_ft.state_dict()['param_groups'][0]['weight_decay']))
    # logging.info('LR decay gamma:' + str(exp_lr_scheduler.state_dict()['gamma']))
    # logging.info('LR decay step size:' + str(exp_lr_scheduler.state_dict()['step_size']) + '\n')
    for epoch in range(num_epochs):
        # deep_copy_flag = False
        logging.info('Epoch {}/{}'.format(epoch, num_epochs - 1))
        logging.info('-' * 20)
        train_one_epoc(dl_model, optimizer_ft, dl_dataloaders['train'],
                       dl_dataset_sizes['train'])
        llf, nlf = evaluate(dl_model, dl_dataloaders['val'])
        logging.info('LLF: {}'.format(llf))
        logging.info('NLF: {}'.format(nlf) + '\n')
        # exp_lr_scheduler.step()
        # if llf > best_llf:
        #     deep_copy_flag = True
        #     best_nlf = nlf
        #     best_llf = llf
        # elif (llf == best_llf) & (nlf < best_nlf):
        #     deep_copy_flag = True
        #     best_nlf = nlf
        # if deep_copy_flag:
        best_model_wts = copy.deepcopy(dl_model.state_dict())
        torch.save(best_model_wts,
                   'saved_models' + os.sep + str(epoch) + '_deeplesion.pth')
    time_elapsed = time.time() - since
    logging.info('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=2048,      # 4096
        hop_length=220,  # 441
        freq_compression="linear",
        n_freq_bins=256,  # determines the width of the image
        f_min=0,
        f_max=18000,
        # should be adjusted together with sequence_len in class
        # StridedAudioDataset (called by predict.py)
        seq_len=128,
        augmentation=False,
        noise_files=[],
        *args,
        **kwargs):
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.f_min = f_min
    self.f_max = f_max
    # mel: log transformation of frequency (Hz scale to Mel scale)
    # attention: Mel-spectrograms as a network input led to an excessive loss
    # of resolution in higher frequency bands, which was a big problem
    # considering the high-frequency pulsed calls and whistles.
    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if freq_compression not in valid_freq_compressions:
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                freq_compression, valid_freq_compressions))
    self.freq_compression = freq_compression
    # compile RegExp patterns into pattern objects for label matching
    self.possible_call_labels = re.compile("|".join(["call"]))
    self.possible_nocall_labels = re.compile("|".join(["noise"]))
    self._logger.debug("Number of files : {}".format(len(self.file_names)))
    _n_calls = 0
    for f in self.file_names:
        if self.is_call(f):
            _n_calls += 1
    self._logger.debug("Number of calls: {}".format(_n_calls))
    self._logger.debug(
        "Number of noise: {}".format(len(self.file_names) - _n_calls))
    self.augmentation = augmentation
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),  # returns a vector tensor
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    self.file_reader = AsyncFileReader()
    # if the user chooses not to cache .spec files by omitting the directory
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # where .spec is created and stored
        # n_fft, hop_length: metadata in spec_dict
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )
    if augmentation:
        self._logger.debug("Init augmentation transforms for time and pitch shift")
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        self._logger.debug("Running without augmentation")
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
            T.M2MFCC())
    else:
        raise ValueError("Undefined frequency compression")
    if augmentation:
        if noise_files:
            self._logger.debug(
                "Init augmentation transform for random noise addition")
            # if return_original=True, both augmented and original specs are returned
            self.t_addnoise = T.RandomAddNoise(
                noise_files,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                return_original=True)
        else:
            self.t_addnoise = None
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
    self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=4096,
        hop_length=441,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=0,
        f_max=18000,
        seq_len=128,
        augmentation=False,
        noise_files_train=[],
        noise_files_val=[],
        noise_files_test=[],
        random=False,
        *args,
        **kwargs):
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))
    self.sp = signal.signal_proc()
    self.df = 15.0
    self.exp_e = 0.1
    self.bin_pow = 2.0
    self.gaus_mean = 0.0
    self.gaus_stdv = 12.5
    self.poisson_lambda = 15.0
    self.orig_noise_value = -5
    self.f_min = f_min
    self.f_max = f_max
    self.n_fft = n_fft
    self.random = random
    self.hop_length = hop_length
    self.augmentation = augmentation
    self.file_reader = AsyncFileReader()
    self.noise_files_val = noise_files_val
    self.noise_files_test = noise_files_test
    self.freq_compression = freq_compression
    self.noise_files_train = noise_files_train
    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if self.freq_compression not in valid_freq_compressions:
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                self.freq_compression, valid_freq_compressions))
    self._logger.debug(
        "Number of files to denoise : {}".format(len(self.file_names)))
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader())
    if self.augmentation:
        self._logger.debug(
            "Init augmentation transforms for intensity, time, and pitch shift")
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        # only for noise augmentation during the validation phase; intensity,
        # time, and pitch augmentation is not used during validation/test
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
        self._logger.debug("Running without intensity, time, and pitch augmentation")
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
    else:
        raise ValueError("Undefined frequency compression")
    if self.augmentation and self.noise_files_train and self.dataset_name == "train":
        self._logger.debug("Init training real-world noise files for noise2noise adding")
        self.t_addnoise = T.RandomAddNoise(
            self.noise_files_train,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            min_snr=-2,
            max_snr=-8,
            return_original=True)
    elif not self.augmentation and self.noise_files_val and self.dataset_name == "val":
        self._logger.debug("Init validation real-world noise files for noise2noise adding")
        self.t_addnoise = T.RandomAddNoise(
            self.noise_files_val,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            min_snr=-2,
            max_snr=-8,
            return_original=True)
    elif not self.augmentation and self.noise_files_test and self.dataset_name == "test":
        self._logger.debug("Init test real-world noise files for noise2noise adding")
        self.t_addnoise = T.RandomAddNoise(
            self.noise_files_test,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            min_snr=-2,
            max_snr=-8,
            return_original=True)
    else:
        self.t_addnoise = None
        raise ValueError(
            "Init noise files for noise adding does not have a proper setup per split!")
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
    self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
def main():
    print('starting denoising')
    noise_sigma = 4e-5            # sigma for the noise simulation
    batch_size = 8                # number of images to run for each minibatch
    num_epochs = 200              # number of epochs to train
    validation_seed = 15          # rng seed for validation loop
    log_dir = 'logs/denoise/'     # log dir for models and tensorboard
    device = torch.device('cpu')  # model will run on this device
    dtype = torch.float           # dtype for data and model

    # set up tensorboard
    writer = SummaryWriter(log_dir=log_dir)

    # checkpoint file name
    checkpoint_file = os.path.join(log_dir, 'best_model.pt')

    # -------------------------------------------------------------------------
    # NOISE SIMULATION SETUP
    transform_list = [
        transforms.AddNoise(target_op=False, sigma=noise_sigma),
        transforms.Ifft(norm='ortho'),
        transforms.SquareRootSumSquare(),
        transforms.Normalize(),
        transforms.ToTensor(dat_complex=False, target_complex=False)
    ]

    # -------------------------------------------------------------------------
    # DATALOADER SETUP
    train_dataset = KneeDataSet('pytorch_tutorial_data/', 'train',
                                transform=transforms.Compose(transform_list))
    print('data set information:')
    print(train_dataset)
    val_dataset = KneeDataSet('pytorch_tutorial_data/', 'val',
                              transform=transforms.Compose(transform_list))
    # convert to a PyTorch dataloader
    # this handles batching, random shuffling, parallelization
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    display_dat = val_dataset[15]['dat'].unsqueeze(0).to(device=device, dtype=dtype)
    display_target = val_dataset[15]['target'].unsqueeze(0).to(device=device, dtype=dtype)
    display_vmax = np.max(np.squeeze(display_dat.cpu().numpy()))

    # -------------------------------------------------------------------------
    # MODEL SETUP
    model = DenoiseCnn(num_chans=64, num_layers=4,
                       magnitude_input=True, magnitude_output=True)
    model = model.to(device)
    model = model.train()
    print('CNN model information:')
    print(model)

    # -------------------------------------------------------------------------
    # OPTIMIZER SETUP
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = torch.nn.MSELoss()

    # -------------------------------------------------------------------------
    # LOAD PREVIOUS STATE
    start_epoch, model, optimizer, min_val_loss = load_checkpoint(
        checkpoint_file, model, optimizer)
    current_seed = 20

    # -------------------------------------------------------------------------
    # NETWORK TRAINING
    for epoch_index in range(start_epoch, num_epochs):
        print('epoch {} of {}'.format(epoch_index + 1, num_epochs))

        # ---------------------------------------------------------------------
        # TRAINING LOOP
        model = model.train()
        # rng seed for noise generation
        torch.manual_seed(current_seed)
        np.random.seed(current_seed)
        torch.cuda.manual_seed(current_seed)
        # batch loop
        losses = []
        for batch in train_loader:
            target = batch['target'].to(device=device, dtype=dtype)
            dat = batch['dat'].to(device=device, dtype=dtype)
            est = model(dat)             # forward propagation
            loss = loss_fn(est, target)  # calculate the loss
            optimizer.zero_grad()        # clear out old gradients
            loss.backward()              # back propagation
            optimizer.step()             # update the CNN weights
            # keep last 10 minibatches to compute training loss
            losses.append(loss.item())
            losses = losses[-10:]
        print('trailing training loss: {}'.format(np.mean(losses)))

        # ---------------------------------------------------------------------
        # EVALUATION LOOP
        model = model.eval()
        # rng seed for noise generation
        current_seed = np.random.get_state()[1][0]
        torch.manual_seed(validation_seed)
        np.random.seed(validation_seed)
        torch.cuda.manual_seed(validation_seed)
        # batch loop
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                target = batch['target'].to(device=device, dtype=dtype)
                dat = batch['dat'].to(device=device, dtype=dtype)
                est = model(dat)
                loss = loss_fn(est, target)
                val_losses.append(loss.item())
        print('validation loss: {}'.format(np.mean(val_losses)))

        # ---------------------------------------------------------------------
        # VISUALIZATIONS AND CHECKPOINTS
        if np.mean(val_losses) < min_val_loss:
            save_checkpoint(epoch_index, model, optimizer, np.mean(val_losses),
                            checkpoint_file)
        # write the losses
        writer.add_scalar('loss/train', np.mean(losses), epoch_index + 1)
        writer.add_scalar('loss/validation', np.mean(val_losses), epoch_index + 1)
        # show an example image from the validation data
        model = model.eval()
        with torch.no_grad():
            display_est = model(display_dat)
        writer.add_image('validation/dat', display_dat[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/cnn', display_est[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/target', display_target[0] / display_vmax,
                         global_step=epoch_index + 1)

    writer.close()
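load_checkpoint and save_checkpoint are project helpers not shown here. A minimal sketch of what they plausibly do with torch serialization, matching the call sites above (an assumption, not the tutorial's actual code):

# Plausible checkpoint helpers matching the signatures used above.
import os
import torch

def save_checkpoint(epoch, model, optimizer, val_loss, path):
    # persist everything needed to resume training (assumed contract)
    torch.save({
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'val_loss': val_loss,
    }, path)

def load_checkpoint(path, model, optimizer):
    # resume if a checkpoint exists; otherwise start fresh
    if not os.path.isfile(path):
        return 0, model, optimizer, float('inf')
    state = torch.load(path, map_location='cpu')
    model.load_state_dict(state['model_state'])
    optimizer.load_state_dict(state['optimizer_state'])
    return state['epoch'] + 1, model, optimizer, state['val_loss']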
args.checkname = args.arc
# Define Saver
saver = Saver(args)
saver.save_experiment_config()
# Define Tensorboard Summary
summary = TensorboardSummary(saver.experiment_dir)
writer = summary.create_summary()
# Data
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_trans = transforms.Compose([
    transforms.Resize(321),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])
val_trans = transforms.Compose([
    transforms.Resize(321),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
train_ds = VOCSBDClassification('/path/to/VOC',
                                '/path/to/SBD/benchmark_RELEASE/dataset',
                                transform=train_trans, image_set='train')
train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True,
                      num_workers=4, drop_last=True)
val_ds = VOCClassification('/path/to/VOC', transform=val_trans, image_set='val')
val_dl = DataLoader(val_ds, batch_size=8, shuffle=True, num_workers=2,
                    drop_last=True)
def initialize_data_loader(DatasetClass,
                           config,
                           phase,
                           threads,
                           shuffle,
                           repeat,
                           augment_data,
                           batch_size,
                           limit_numpoints,
                           input_transform=None,
                           target_transform=None):
    if isinstance(phase, str):
        phase = str2datasetphase_type(phase)
    if config["return_transformation"]:
        collate_fn = t.cflt_collate_fn_factory(limit_numpoints)
    else:
        collate_fn = t.cfl_collate_fn_factory(limit_numpoints)
    prevoxel_transform_train = []
    if augment_data:
        prevoxel_transform_train.append(
            t.ElasticDistortion(DatasetClass.ELASTIC_DISTORT_PARAMS))
    if len(prevoxel_transform_train) > 0:
        prevoxel_transforms = t.Compose(prevoxel_transform_train)
    else:
        prevoxel_transforms = None
    input_transforms = []
    if input_transform is not None:
        input_transforms += input_transform
    if augment_data:
        input_transforms += [
            t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS, DatasetClass.IS_TEMPORAL),
            t.ChromaticAutoContrast(),
            t.ChromaticTranslation(config["data_aug_color_trans_ratio"]),
            t.ChromaticJitter(config["data_aug_color_jitter_std"]),
            t.HueSaturationTranslation(config["data_aug_hue_max"],
                                       config["data_aug_saturation_max"]),
        ]
    if len(input_transforms) > 0:
        input_transforms = t.Compose(input_transforms)
    else:
        input_transforms = None
    dataset = DatasetClass(
        config,
        prevoxel_transform=prevoxel_transforms,
        input_transform=input_transforms,
        target_transform=target_transform,
        cache=config["cache_data"],
        augment_data=augment_data,
        phase=phase)
    data_args = {
        'dataset': dataset,
        'num_workers': threads,
        'batch_size': batch_size,
        'collate_fn': collate_fn,
    }
    if repeat:
        data_args['sampler'] = InfSampler(dataset, shuffle)
    else:
        data_args['shuffle'] = shuffle
    data_loader = DataLoader(**data_args)
    return data_loader
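A hedged invocation sketch, assuming a dict-style config carrying the keys the function reads; SemanticDataset is a hypothetical stand-in for a concrete DatasetClass from the project.

# Hypothetical call (SemanticDataset and config values are stand-ins).
train_loader = initialize_data_loader(
    SemanticDataset,
    config,
    phase='train',
    threads=4,
    shuffle=True,
    repeat=True,           # infinite sampler for iteration-based training
    augment_data=True,
    batch_size=8,
    limit_numpoints=1200000,
)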
# Define Saver
saver = Saver(args)
saver.save_experiment_config()
# Define Tensorboard Summary
summary = TensorboardSummary(saver.experiment_dir)
args.exp = saver.experiment_dir.split('_')[-1]
if args.train_dataset == 'cityscapes':
    # Data
    train_trans = transforms.Compose([
        transforms.ToPILImage(),
        # transforms.RandomResizedCrop((args.image_size, args.image_size), scale=(0.2, 2)),
        transforms.Resize((args.image_size, args.image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomAffine(22, scale=(0.75, 1.25)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[123.675, 116.28, 103.53],
                             std=[58.395, 57.12, 57.375])
        # transforms.NormalizeInstance()
    ])
    val_trans = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((args.image_size, args.image_size), do_mask=False),
        transforms.ToTensor(),
        transforms.Normalize(mean=[123.675, 116.28, 103.53],
                             std=[58.395, 57.12, 57.375])
        # transforms.NormalizeInstance()
    ])
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    prepare_seed(args.rand_seed)
    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python version : {}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow version : {}".format(PIL.__version__))
    logger.log("PyTorch version : {}".format(torch.__version__))
    logger.log("cuDNN version : {}".format(torch.backends.cudnn.version()))

    # General data augmentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    assert args.arg_flip == False, 'The flip is : {}, rotate is {}'.format(
        args.arg_flip, args.rotate_max)
    train_transform = [transforms.PreCrop(args.pre_crop_expand)]
    train_transform += [transforms.TrainScale2WH((args.crop_width, args.crop_height))]
    train_transform += [transforms.AugScale(args.scale_prob, args.scale_min, args.scale_max)]
    # if args.arg_flip:
    #     train_transform += [transforms.AugHorizontalFlip()]
    if args.rotate_max:
        train_transform += [transforms.AugRotate(args.rotate_max)]
    train_transform += [transforms.AugCrop(args.crop_width, args.crop_height,
                                           args.crop_perturb_max, mean_fill)]
    train_transform += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transform)
    eval_transform = transforms.Compose(
        [transforms.PreCrop(args.pre_crop_expand),
         transforms.TrainScale2WH((args.crop_width, args.crop_height)),
         transforms.ToTensor(), normalize])
    assert (args.scale_min + args.scale_max) / 2 == args.scale_eval, \
        'The scale is not ok : {},{} vs {}'.format(args.scale_min, args.scale_max, args.scale_eval)

    # Model Configure Load
    model_config = load_configure(args.model_config, logger)
    args.sigma = args.sigma * args.scale_eval
    logger.log('Real Sigma : {:}'.format(args.sigma))

    # Training Dataset
    train_data = GeneralDataset(train_transform, args.sigma, model_config.downsample,
                                args.heatmap_type, args.data_indicator)
    train_data.load_list(args.train_lists, args.num_pts, True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers,
                                               pin_memory=True)

    # Evaluation Dataloader
    eval_loaders = []
    if args.eval_ilists is not None:
        for eval_ilist in args.eval_ilists:
            eval_idata = GeneralDataset(eval_transform, args.sigma, model_config.downsample,
                                        args.heatmap_type, args.data_indicator)
            eval_idata.load_list(eval_ilist, args.num_pts, True)
            eval_iloader = torch.utils.data.DataLoader(eval_idata, batch_size=args.batch_size,
                                                       shuffle=False, num_workers=args.workers,
                                                       pin_memory=True)
            eval_loaders.append((eval_iloader, False))

    # Define network
    logger.log('configure : {:}'.format(model_config))
    net = obtain_model(model_config, args.num_pts + 1)
    assert model_config.downsample == net.downsample, \
        'downsample is not correct : {} vs {}'.format(model_config.downsample, net.downsample)
    logger.log("=> network :\n {}".format(net))
    logger.log('Training-data : {:}'.format(train_data))
    for i, eval_loader in enumerate(eval_loaders):
        eval_loader, is_video = eval_loader
        logger.log('The [{:2d}/{:2d}]-th testing-data [{:}] = {:}'.format(
            i, len(eval_loaders), 'video' if is_video else 'image', eval_loader.dataset))
    logger.log('arguments : {:}'.format(args))

    opt_config = load_configure(args.opt_config, logger)
    if hasattr(net, 'specify_parameter'):
        net_param_dict = net.specify_parameter(opt_config.LR, opt_config.Decay)
    else:
        net_param_dict = net.parameters()
    optimizer, scheduler, criterion = obtain_optimizer(net_param_dict, opt_config, logger)
    logger.log('criterion : {:}'.format(criterion))
    net, criterion = net.cuda(), criterion.cuda()
    net = torch.nn.DataParallel(net)

    last_info = logger.last_info()
    if last_info.exists():
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(str(last_info))
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        assert last_info['epoch'] == checkpoint['epoch'], \
            'Last-Info is not right {:} vs {:}'.format(last_info, checkpoint['epoch'])
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        logger.log("=> load-ok checkpoint '{:}' (epoch {:}) done".format(
            logger.last_info(), checkpoint['epoch']))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch = 0

    if args.eval_once:
        logger.log("=> only evaluate the model once")
        eval_results = eval_all(args, eval_loaders, net, criterion, 'eval-once',
                                logger, opt_config)
        logger.close()
        return

    # Main Training and Evaluation Loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, opt_config.epochs):
        scheduler.step()
        need_time = convert_secs2time(epoch_time.avg * (opt_config.epochs - epoch), True)
        epoch_str = 'epoch-{:03d}-{:03d}'.format(epoch, opt_config.epochs)
        LRs = scheduler.get_lr()
        logger.log('\n==>>{:s} [{:s}], [{:s}], LR : [{:.5f} ~ {:.5f}], Config : {:}'.format(
            time_string(), epoch_str, need_time, min(LRs), max(LRs), opt_config))
        # train for one epoch
        train_loss, train_nme = train(args, train_loader, net, criterion,
                                      optimizer, epoch_str, logger, opt_config)
        # log the results
        logger.log('==>>{:s} Train [{:}] Average Loss = {:.6f}, NME = {:.2f}'.format(
            time_string(), epoch_str, train_loss, train_nme * 100))
        # remember best prec@1 and save checkpoint
        save_path = save_checkpoint({
            'epoch': epoch,
            'args': deepcopy(args),
            'arch': model_config.arch,
            'state_dict': net.state_dict(),
            'scheduler': scheduler.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, str(logger.path('model') / '{:}-{:}.pth'.format(model_config.arch, epoch_str)), logger)
        last_info = save_checkpoint({
            'epoch': epoch,
            'last_checkpoint': save_path,
        }, str(logger.last_info()), logger)
        eval_results = eval_all(args, eval_loaders, net, criterion, epoch_str,
                                logger, opt_config)
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
    logger.close()
def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=1024,
        hop_length=512,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=None,
        f_max=18000,
        *args,
        **kwargs):
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))
    self.sp = signal.signal_proc()
    self.sr = sr
    self.f_min = f_min
    self.f_max = f_max
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.freq_compression = freq_compression
    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if self.freq_compression not in valid_freq_compressions:
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                self.freq_compression, valid_freq_compressions))
    self._logger.debug("Number of test files: {}".format(len(self.file_names)))
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False)
    ]
    self.file_reader = AsyncFileReader()
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
            T.M2MFCC())
    else:
        raise ValueError("Undefined frequency compression")
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
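The class's accessor is not shown above; a hedged sketch of how the pieces built in this `__init__` would plausibly chain together per file:

# Plausible accessor chaining the transforms built above (an assumption,
# not the project's actual implementation).
def __getitem__(self, idx):
    file_name = self.file_names[idx]
    spec = self.t_spectrogram(file_name)  # cached or freshly computed STFT
    spec = self.t_compr_f(spec)           # frequency compression
    spec = self.t_compr_a(spec)           # amplitude -> dB
    spec = self.t_norm(spec)              # dB normalization
    return spec, file_name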