def __init__(self, train_img_path, batch_size, shuffle=True, validation_split=0.0, num_workers=1, p_augment=0.0, training=True):
    """Data loader for spectrogram images stored in an ImageFolder layout.

    Args:
        train_img_path: root directory with one subdirectory per class.
        batch_size: samples per batch.
        shuffle: whether to shuffle the data.
        validation_split: fraction of the data held out for validation.
        num_workers: number of worker processes for loading.
        p_augment: probability with which each masking augmentation fires.
        training: accepted for interface compatibility; not used here.
    """
    # One of the two SpecAugment-style maskings is chosen at random per
    # sample; each is itself applied only with probability ``p_augment``.
    masking = transforms.RandomChoice([
        transforms.RandomApply(
            [audiotransforms.FrequencyMasking(freq_mask_param=50)], p=p_augment),
        transforms.RandomApply(
            [audiotransforms.TimeMasking(time_mask_param=100)], p=p_augment),
    ])
    pipeline = transforms.Compose([
        transforms.Resize((512, 300)),
        transforms.ToTensor(),  # masking operates on tensors, so convert first
        masking,
    ])
    self.train_img_path = train_img_path
    self.dataset = ImageFolder(root=self.train_img_path, transform=pipeline)
    super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers)
def train_input_per_sample_transform(self) -> Callable:
    """Build the per-sample training transform.

    Converts the input to a tensor, resizes it to ``self.spectrogram_size``,
    then optionally applies SpecAugment-style ``TimeMasking`` and
    ``FrequencyMasking``.

    Returns:
        A ``Compose`` callable over the configured transform steps.
    """
    # FIX: ToTensor/Resize must run first — the torchaudio masking
    # transforms operate on tensors, not PIL images, so composing them
    # before ToTensor (as before) would fail at runtime. Masking after the
    # resize also keeps the mask parameters relative to the final
    # spectrogram size. Local renamed from ``transforms`` to avoid
    # shadowing the common module name.
    transform_list = [T.ToTensor(), T.Resize(self.spectrogram_size)]
    if self.time_mask_param is not None:
        transform_list.append(TAudio.TimeMasking(time_mask_param=self.time_mask_param))
    if self.freq_mask_param is not None:
        transform_list.append(TAudio.FrequencyMasking(freq_mask_param=self.freq_mask_param))
    return T.Compose(transform_list)
def __init__(self, train_img_path, batch_size, shuffle=True, validation_split=0.0, num_workers=1, p_augment=0.0, training=True):
    """Data loader feeding an AlexNet-style model from an ImageFolder tree.

    Args:
        train_img_path: root directory with one subdirectory per class.
        batch_size: samples per batch.
        shuffle: whether to shuffle the data.
        validation_split: fraction of the data held out for validation.
        num_workers: number of worker processes for loading.
        p_augment: probability with which each masking augmentation fires.
        training: accepted for interface compatibility; not used here.
    """
    # NOTE(review): Resize(256) only scales the shorter edge; pretrained
    # AlexNet pipelines usually pair this with a 224x224 crop — confirm the
    # model accepts the resulting variable-sized inputs.
    alexnet_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.ToTensor(),
        # Randomly pick one of the two SpecAugment-style maskings; each is
        # itself applied only with probability ``p_augment``.
        transforms.RandomChoice([
            transforms.RandomApply(
                [audiotransforms.FrequencyMasking(freq_mask_param=50)], p=p_augment),
            transforms.RandomApply(
                [audiotransforms.TimeMasking(time_mask_param=100)], p=p_augment)
        ]),
        # ImageNet statistics expected by pretrained AlexNet weights.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # (Removed a large block of dead, commented-out albumentations /
    # alternative-pipeline code that was kept as a no-op string literal.)
    self.train_img_path = train_img_path
    self.dataset = ImageFolder(root=self.train_img_path, transform=alexnet_transforms)
    super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers)
def spec_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Apply SpecAugment-style frequency and time masking to a spectrogram.

    Args:
        spec: spectrogram tensor of shape ``(channels, n_mels, n_steps)``.
        max_mask_pct: maximum width of each mask as a fraction of its axis.
        n_freq_masks: number of frequency masks to apply.
        n_time_masks: number of time masks to apply.

    Returns:
        The augmented spectrogram; masked regions are filled with the
        spectrogram's mean value. The input tensor is left untouched.
    """
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()

    # FIX: clone so masking never mutates the caller's tensor — some
    # torchaudio versions mask a view of the input in place.
    aug_spec = spec.clone()

    # Mask widths are integer bin/frame counts, so floor the products.
    freq_mask_param = int(max_mask_pct * n_mels)
    for _ in range(n_freq_masks):
        aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = int(max_mask_pct * n_steps)
    for _ in range(n_time_masks):
        aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
    return aug_spec
def construct_transforms(log_compression, time_cutout, freq_cutout, n_mels=40, **kwargs):
    """Build train/dev transform pipelines for audio and target streams.

    The two audio pipelines are identical except that the training one adds
    SpecAugment-style time/frequency masking; the target pipeline is shared.
    Factored into helpers so the pipelines cannot drift apart.

    Args:
        log_compression: ratio passed to ``LogCompress``.
        time_cutout: ``time_mask_param`` for ``TimeMasking`` (train only).
        freq_cutout: ``freq_mask_param`` for ``FrequencyMasking`` (train only).
        n_mels: number of mel bins for the spectrogram.
        **kwargs: accepted and ignored, so a shared config dict can be splatted.

    Returns:
        Tuple ``(train_tfs, dev_tfs)`` of dicts with 'audio' and 'target' keys.
    """
    def _audio_pipeline(augment):
        # Shared front end: waveform -> mel spectrogram -> log compression.
        steps = [
            ToTensor(),
            transforms.MelSpectrogram(sample_rate=8192, n_fft=512, hop_length=128, n_mels=n_mels),
            LogCompress(ratio=log_compression),
        ]
        if augment:
            steps += [
                transforms.TimeMasking(time_cutout),
                transforms.FrequencyMasking(freq_cutout),
            ]
        steps.append(TorchUnsqueeze())
        return Compose(steps)

    def _target_pipeline():
        # Frame-level binary targets aligned with the spectrogram hop.
        return Compose([
            ShortTermAverageTransform(frame_length=512, hop_length=128, threshold=0.5),
            ToTensor()
        ])

    train_tfs = {'audio': _audio_pipeline(augment=True), 'target': _target_pipeline()}
    dev_tfs = {'audio': _audio_pipeline(augment=False), 'target': _target_pipeline()}
    return train_tfs, dev_tfs
def __init__(self, root_dir, transform=None, train=False):
    """
    Args:
        root_dir (string): Directory containing one subdirectory per speaker
            with their audio.
        transform (callable, optional): Optional transform applied on a sample.
        train (bool): If True use the "train" subdirectory, else "test".
    """
    split = "train" if train else "test"
    self.root_dir = os.path.join(root_dir, split)
    self.speaker_frame, self.name_dict = self._create_speaker_dataframe()
    self.transform = transform

    # Source recordings are resampled from 48 kHz down to 16 kHz.
    self.sample_rate = 16000
    self.resample_trans = torchaudio.transforms.Resample(48000, self.sample_rate)

    # SpecAugment maskings; iid_masks=True draws independent masks per example.
    self.freq_masking = T.FrequencyMasking(freq_mask_param=80, iid_masks=True)
    self.time_masking = T.TimeMasking(time_mask_param=80, iid_masks=True)
def __getitem__(self, index):
    """Load one example as a (log-)mel spectrogram with its label.

    Returns:
        Tuple ``(mel_spectrogram, label)`` for the file at ``index``.
    """
    audio, sr = load(self.file_paths[index])
    # Mix down to mono, keeping the channel dimension.
    audio = torch.mean(audio, dim=0, keepdim=True)
    if self.sr != sr:
        audio = transforms.Resample(sr, self.sr)(audio)

    mel_spectrogram = transforms.MelSpectrogram(
        sample_rate=self.sr,
        n_fft=self.n_fft,
        win_length=self.win_length,
        hop_length=self.hop_length,
        n_mels=self.n_mels,
        f_max=self.sr / 2)(audio)

    if self.log_mel:
        offset = 1e-6  # avoid log(0)
        mel_spectrogram = torch.log(mel_spectrogram + offset)
    else:
        mel_spectrogram = transforms.AmplitudeToDB(
            stype="power", top_db=80)(mel_spectrogram)

    if self.augment:
        # BUG FIX: masking was previously applied to the raw waveform
        # *after* the spectrogram had been computed, and the result was
        # assigned to ``audio`` which is never used again — the
        # augmentation had no effect. Time/frequency masking must operate
        # on the spectrogram itself.
        mel_spectrogram = transforms.FrequencyMasking(freq_mask_param=20)(mel_spectrogram)
        mel_spectrogram = transforms.TimeMasking(time_mask_param=10)(mel_spectrogram)

    label = self.labels[index]
    return mel_spectrogram, label
def train_default_transforms(
        spectrogram_size: Tuple[int, int],
        time_mask_param: Optional[int],
        freq_mask_param: Optional[int]) -> Dict[str, Callable]:
    """During training we apply the default transforms with optional ``TimeMasking`` and ``Frequency Masking``."""
    # Collect whichever SpecAugment maskings are enabled, each wrapped so it
    # only touches the INPUT key of the sample dict.
    augmentations = []
    if time_mask_param is not None:
        augmentations.append(
            ApplyToKeys(DefaultDataKeys.INPUT,
                        TAudio.TimeMasking(time_mask_param=time_mask_param)))
    if freq_mask_param is not None:
        augmentations.append(
            ApplyToKeys(DefaultDataKeys.INPUT,
                        TAudio.FrequencyMasking(freq_mask_param=freq_mask_param)))

    base = default_transforms(spectrogram_size)
    if not augmentations:
        return base
    # Run the maskings after tensor conversion, merged onto the defaults.
    return merge_transforms(
        base, {"post_tensor_transform": nn.Sequential(*augmentations)})
def test_TimeMasking(self):
    """TorchScript consistency check for TimeMasking with iid_masks disabled."""
    spec_batch = torch.rand((10, 2, 50, 10, 2))
    transform = T.TimeMasking(time_mask_param=30, iid_masks=False)
    self._assert_consistency(transform, spec_batch)
transform = { 'val': { 'base': base }, 'test': { 'base': base }, 'train': { 'base': base }, } # Augmentation logmel_aug = T.Compose([ TA.TimeMasking(time_mask_param=30), TA.FrequencyMasking(freq_mask_param=15) ]) logmel_A = T.Lambda(lambd=lambda x: torch.cat((x, logmel_aug(x)), dim=2)) augment = { 'val': { 'logmel': logmel_A }, 'test': { 'logmel': logmel_A }, 'train': { 'logmel': logmel_A },
# Stretch the spectrogram in time by a factor of 1/0.9 (slow down slightly).
rate = 0.9
spec_ = stretch(spec, rate)
# The stretched spectrogram is complex; plot its magnitude.
plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304)

######################################################################
# TimeMasking
# ~~~~~~~~~~~
#

# Fixed seed so the randomly placed mask is reproducible.
torch.random.manual_seed(4)

spec = get_spectrogram()
plot_spectrogram(spec[0], title="Original")

# Mask out a random span of up to 80 time frames.
masking = T.TimeMasking(time_mask_param=80)
spec = masking(spec)

plot_spectrogram(spec[0], title="Masked along time axis")

######################################################################
# FrequencyMasking
# ~~~~~~~~~~~~~~~~
#

# Same seed so the two masking demos start from identical spectrograms.
torch.random.manual_seed(4)

spec = get_spectrogram()
plot_spectrogram(spec[0], title="Original")
def __init__(
        self,
        wake_words_root_path="D:/Workspace/Projects/Voice2Command/recordings/positive",
        background_sounds_root_path="D:/Storage/UrbanSound8K/audio/fold1",
        max_length=3,
        sampling_rate=44100,  # 44.1 kHz
        testing=False):
    """Generate a wake-word dataset by overlaying recordings onto background noise.

    For each background clip: one positive sample is always produced (a
    positive wake word mixed into the clip), and a negative wake word is mixed
    into a separate copy roughly half the time. Both copies are converted to
    log-mel spectrograms, passed through SpecAugment masking, and stored with
    their labels (1/0) in ``self.generated_samples``.

    Args:
        wake_words_root_path: directory whose ``positive``/``negative``
            subfolders hold wake-word recordings.
            NOTE(review): the default already ends in ``/positive`` and
            ``/positive`` is appended again below — verify this path.
        background_sounds_root_path: directory of background ``.wav`` files.
        max_length: clip length in seconds.
        sampling_rate: audio sampling rate in Hz.
        testing: if True, only 50 background clips are used instead of 400.
    """
    self.wake_words_positive_root_path = wake_words_root_path + "/positive"
    self.wake_words_negative_root_path = wake_words_root_path + "/negative"
    self.sampling_rate = sampling_rate
    self.background_sounds_root_path = background_sounds_root_path
    self.generated_samples = []
    # Fixed sample count per clip: rate (samples/s) * duration (s).
    self.sample_size = sampling_rate * max_length
    self.background_noise_sound_paths = list(
        pathlib.Path(background_sounds_root_path).glob('*.wav'))
    self.wake_words = self._load_wake_words(
        self.wake_words_positive_root_path)
    self.wake_words_negative = self._load_wake_words(
        self.wake_words_negative_root_path)

    # SpecAugment transforms applied to every generated spectrogram.
    self.transforms = nn.Sequential(
        transforms.FrequencyMasking(freq_mask_param=2),
        transforms.TimeMasking(time_mask_param=4))

    # Smaller dataset in test mode to keep runs fast.
    number_of_samples = 400
    if testing == True:
        number_of_samples = 50

    for idx, path in enumerate(
            self.background_noise_sound_paths[:number_of_samples]):
        y, sr = librosa.core.load(path, sr=sampling_rate)
        # Pad or truncate every clip to exactly ``sample_size`` samples.
        if len(y) < self.sample_size:
            y = np.pad(y, (0, self.sample_size - len(y)))
        else:
            y = y[:self.sample_size]

        # ``y_false`` is a real copy; ``y_true`` aliases ``y``.
        y_false = np.array(y, copy=True)
        y_true = y

        #Positive
        wake_word = self.sample_wake_word(self.wake_words)
        interval = self._get_random_time_interval(
            len(wake_word), max_length * sampling_rate)
        # Passes a slice (numpy view) so the overlay presumably mutates
        # ``y_true`` in place — TODO confirm ``_overlay_wakeword`` writes
        # into its argument.
        self._overlay_wakeword(y_true[interval[0]:interval[1]], wake_word)
        # self._save_sound(y)
        S_true = librosa.feature.melspectrogram(y=y_true, sr=sr,
                                                hop_length=128)
        S_db_true = librosa.core.power_to_db(S_true)
        S_db_true = self.transforms(torch.from_numpy(S_db_true))

        #Negative
        # Only about half of the negative copies get a negative wake word
        # mixed in; the rest stay as plain background noise.
        if random.random() > 0.5:
            wake_word = self.sample_wake_word(self.wake_words_negative)
            interval = self._get_random_time_interval(
                len(wake_word), max_length * sampling_rate)
            self._overlay_wakeword(y_false[interval[0]:interval[1]],
                                   wake_word)
            self._save_sound(y_false)

        S_false = librosa.feature.melspectrogram(y=y_false, sr=sr,
                                                 hop_length=128)
        S_db_false = librosa.core.power_to_db(S_false)
        S_db_false = self.transforms(torch.from_numpy(S_db_false))

        # Labels for position detection of the wake word
        # label = np.zeros(sample_size)
        # label[interval[1]:interval[1] + 50] = 1

        # Store (1, n_mels, n_frames) float spectrograms with binary labels.
        self.generated_samples.append((S_db_true.unsqueeze(dim=0).float(),
                                       torch.tensor([1]).float()))
        self.generated_samples.append((S_db_false.unsqueeze(dim=0).float(),
                                       torch.tensor([0]).float()))
from convolution_net.learner import Learner
from convolution_net.load import fetch_dataloaders, build_register, train_dev_test

# environment
# Reproducibility seeds are intentionally left disabled; cudnn benchmarking
# is enabled for speed at the cost of determinism.
# torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_num_threads(4)
# np.random.seed(0)

# Training pipeline: waveform -> mel spectrogram -> log compression ->
# SpecAugment time/frequency masking -> channel dimension.
train_tfs = {
    'audio': Compose([
        ToTensor(),
        transforms.MelSpectrogram(sample_rate=8192, n_fft=512, hop_length=128, n_mels=40),
        LogCompress(ratio=1),
        transforms.TimeMasking(4),
        transforms.FrequencyMasking(4),
        TorchUnsqueeze()
    ]),
    'target': Compose([
        # Frame-level binary targets aligned with the spectrogram hop.
        ShortTermAverageTransform(frame_length=512, hop_length=128, threshold=0.5),
        # ThresholdPoolSequence(0.001),  # was 0.125
        ToTensor()
    ])
}
# Dev pipeline: identical front end but no masking augmentation.
dev_tfs = {
    'audio': Compose([
        ToTensor(),
        transforms.MelSpectrogram(sample_rate=8192, n_fft=512, hop_length=128, n_mels=40),
        LogCompress(ratio=1),
        TorchUnsqueeze(),
def main(args, config):
    """Train an audio-only model, then reload the best checkpoint and test it.

    Args:
        args: CLI namespace (arch, masking_time, masking_freq, batch_size,
            workers, lr, momentum, weight_decay).
        config: project config object providing loggers, loss/metric names,
            and the lr-scheduler factory.
    """
    model = AudioOnly(8, base_model=args.arch)

    # Optional SpecAugment maskings, controlled by CLI flags (0 disables).
    import torchaudio.transforms as at
    t = []
    if args.masking_time != 0:
        t.append(at.TimeMasking(args.masking_time))
    if args.masking_freq != 0:
        t.append(at.FrequencyMasking(args.masking_freq))
    transform = transforms.Compose(t)

    dataset = AudioDataSet("train", transform=transform)
    # Validation/test use no augmentation (empty compose).
    val_transform = transforms.Compose([
    ])

    sampler = None
    train_loader = torch.utils.data.DataLoader(
        dataset, sampler=sampler,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True,
        collate_fn=None, drop_last=False)
    val_loader = torch.utils.data.DataLoader(
        AudioDataSet("val",transform=val_transform),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, collate_fn=None)

    logger = config.get_logger('train')
    logger.info(model)

    # Losses/metrics are resolved by name from the config.
    criterion_categorical = getattr(module_loss, config['loss'])
    criterion_continuous = getattr(module_loss, config['loss_continuous'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]
    metrics_continuous = [getattr(module_metric, met) for met in config['metrics_continuous']]

    # policies = model.get_optim_policies(lr=args.lr)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer)
    for param_group in optimizer.param_groups:
        print(param_group['lr'])

    # Only the categorical head is trained (continuous=False).
    trainer = Trainer(model, criterion_categorical, criterion_continuous,
                      metrics, metrics_continuous, optimizer,
                      categorical=True, continuous=False,
                      config=config,
                      data_loader=train_loader,
                      valid_data_loader=val_loader,
                      lr_scheduler=lr_scheduler)
    trainer.train()

    test_loader = torch.utils.data.DataLoader(
        AudioDataSet("test",transform=val_transform),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, collate_fn=None)

    """ load best model and test """
    # Reload the best checkpoint saved during training before evaluating.
    cp = torch.load(str(trainer.checkpoint_dir / 'model_best.pth'))
    model.load_state_dict(cp['state_dict'],strict=True)
    print('loaded', str(trainer.checkpoint_dir / 'model_best.pth'),
          'best_epoch', cp['epoch'])

    # A fresh Trainer with the test loader as its validation set runs the
    # final evaluation.
    trainer = Trainer(model, criterion_categorical, criterion_continuous,
                      metrics, metrics_continuous, optimizer,
                      categorical=True, continuous=False,
                      config=config,
                      data_loader=train_loader,
                      valid_data_loader=test_loader,
                      lr_scheduler=lr_scheduler)
    trainer.test()