def __init__(
            self,
            file_name,
            sequence_len: int,
            hop: int,
            sr: int = 44100,
            fft_size: int = 4096,
            fft_hop: int = 441,
            n_freq_bins: int = 256,
            freq_compression: str = "linear",
            f_min: int = 200,
            f_max: int = 18000,
            cache_dir=None
    ):
        """Load one audio file and build its spectrogram transform pipeline.

        Args:
            file_name: path of the audio file to load.
            sequence_len: length (in spectrogram frames) of each sequence.
            hop: hop (in frames) between consecutive sequences.
            sr: target sample rate when loading the audio.
            fft_size: STFT window size.
            fft_hop: STFT hop length.
            n_freq_bins: number of output frequency bins.
            freq_compression: one of "linear", "mel", "mfcc".
            f_min: lower bound of the frequency band of interest.
            f_max: upper bound of the frequency band of interest.
            cache_dir: if given, computed spectrograms are cached there.

        Raises:
            ValueError: if ``freq_compression`` is not a supported mode.
        """
        self.sequence_len = sequence_len
        self.hop = hop

        # Load the whole file up front; sequences are sliced from it later.
        self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
        self.n_frames = self.audio.shape[1]

        self.t = [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=False),
        ]

        if freq_compression == "linear":
            self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
        elif freq_compression == "mel":
            self.t.append(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        elif freq_compression == "mfcc":
            t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
            self.t.append(T.Compose(t_mel, T.M2MFCC()))
        else:
            # Raising a plain string is a TypeError in Python 3; raise a
            # proper exception instead.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of "
                "['linear', 'mel', 'mfcc']".format(freq_compression))
        self.t.append(
            T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
        self.t.append(
            T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            ))

        if cache_dir is None:
            # No cache directory given: run the pipeline directly (matches
            # the guard used by the sibling dataset constructors).
            self.t = T.Compose(self.t)
        else:
            # Wrap the pipeline so computed spectrograms are cached on disk.
            self.t = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(self.t),
                n_fft=fft_size,
                hop_length=fft_hop,
            )
# --- Example #2 ---
    def __init__(self,
                 file_names: Iterable[str],
                 working_dir=None,
                 cache_dir=None,
                 sr=44100,
                 n_fft=4096,
                 hop_length=441,
                 freq_compression="linear",
                 n_freq_bins=256,
                 f_min=0,
                 f_max=18000,
                 seq_len=128,
                 augmentation=False,
                 noise_files=None,
                 min_max_normalize=False,
                 *args,
                 **kwargs):
        """Build the spectrogram transform pipeline for a call/noise dataset.

        Args:
            file_names: audio file paths making up the dataset.
            working_dir: base directory for the files (passed to super).
            cache_dir: if given, computed spectrograms are cached there.
            sr: target sample rate when loading audio.
            n_fft: STFT window size.
            hop_length: STFT hop length.
            freq_compression: one of "linear", "mel", "mfcc".
            n_freq_bins: number of output frequency bins.
            f_min: lower bound of the frequency band of interest.
            f_max: upper bound of the frequency band of interest.
            seq_len: length (in frames) of the sampled subsequence.
            augmentation: enable random amplitude/time/pitch/noise transforms.
            noise_files: optional noise recordings for noise augmentation.
            min_max_normalize: use min-max normalization instead of dB norm.

        Raises:
            ValueError: if ``freq_compression`` is not a supported mode.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        # Avoid the shared-mutable-default-argument pitfall.
        noise_files = noise_files if noise_files is not None else []

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if freq_compression not in valid_freq_compressions:
            # The original code passed the template and the builtin
            # format(a, b) as two separate ValueError args, so the message
            # was never actually formatted.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    freq_compression, valid_freq_compressions))
        self.freq_compression = freq_compression

        self.possible_call_labels = re.compile("|".join(["call"]))
        self.possible_nocall_labels = re.compile("|".join(["noise"]))

        self._logger.debug("Number of files : {}".format(len(self.file_names)))

        _n_calls = 0
        for f in self.file_names:
            if self.is_call(f):
                _n_calls += 1

        self._logger.debug("Number of calls: {}".format(_n_calls))
        self._logger.debug(
            "Number of noise: {}".format(len(self.file_names) - _n_calls))

        self.augmentation = augmentation

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]
        self.file_reader = AsyncFileReader()
        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )
        if augmentation:
            self._logger.debug(
                "Init augmentation transforms for time and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            self._logger.debug("Running without augmentation")
        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr,
                                   n_mels=n_freq_bins,
                                   f_min=f_min,
                                   f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
            self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
        else:
            # Raising a plain string is a TypeError in Python 3.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions))
        # Ensure the attribute exists even when augmentation is off, so later
        # reads cannot hit an AttributeError.
        self.t_addnoise = None
        if augmentation:
            if noise_files:
                self._logger.debug(
                    "Init augmentation transform for random noise addition")
                self.t_addnoise = T.RandomAddNoise(
                    noise_files,
                    self.t_spectrogram,
                    T.Compose(self.t_timestretch, self.t_pitchshift,
                              self.t_compr_f),
                    min_length=seq_len,
                    return_original=True)
        self.t_compr_a = T.Amp2Db(
            min_level_db=DefaultSpecDatasetOps["min_level_db"])

        if min_max_normalize:
            self.t_norm = T.MinMaxNormalize()
            self._logger.debug("Init min-max-normalization activated")
        else:
            self.t_norm = T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            )
            self._logger.debug("Init 0/1-dB-normalization activated")

        self.t_subseq = T.PaddedSubsequenceSampler(seq_len,
                                                   dim=1,
                                                   random=augmentation)
# --- Example #3 ---
    def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=1024,
        hop_length=512,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=None,
        f_max=18000,
        *args,
        **kwargs
    ):
        """Build the spectrogram transform pipeline for a test dataset.

        Args:
            file_names: audio file paths making up the dataset.
            working_dir: base directory for the files (passed to super).
            cache_dir: if given, computed spectrograms are cached there.
            sr: target sample rate when loading audio.
            n_fft: STFT window size.
            hop_length: STFT hop length.
            freq_compression: one of "linear", "mel", "mfcc".
            n_freq_bins: number of output frequency bins.
            f_min: lower bound of the frequency band of interest (may be None).
            f_max: upper bound of the frequency band of interest.

        Raises:
            ValueError: if ``freq_compression`` is not a supported mode.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        # Instantiated once (the original code created it twice).
        self.sp = signal.signal_proc()

        self.sr = sr
        self.f_min = f_min
        self.f_max = f_max
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.freq_compression = freq_compression

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if self.freq_compression not in valid_freq_compressions:
            # The original code passed the template and the builtin
            # format(a, b) as two separate ValueError args, so the message
            # was never actually formatted.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions))

        self._logger.debug(
            "Number of test files: {}".format(len(self.file_names))
        )

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False)
        ]

        self.file_reader = AsyncFileReader()

        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )

        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(
                n_freq_bins, sr, f_min, f_max
            )
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max), T.M2MFCC()
            )
        else:
            # Raising a plain string is a TypeError in Python 3.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions))

        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
    def __init__(
            self,
            file_names: Iterable[str],
            working_dir=None,
            cache_dir=None,
            sr=44100,
            n_fft=2048,  # 4096
            hop_length=220,  # 441
            freq_compression="linear",
            n_freq_bins=256,  # determines the width of the image
            f_min=0,
            f_max=18000,
            seq_len=128,  # shd be adjusted together with sequence_len in class StridedAudioDataset (called by predict.py)
            augmentation=False,
            noise_files=None,
            *args,
            **kwargs):
        """Build the spectrogram transform pipeline for a call/noise dataset.

        Args:
            file_names: audio file paths making up the dataset.
            working_dir: base directory for the files (passed to super).
            cache_dir: if given, computed spectrograms are cached there.
            sr: target sample rate when loading audio.
            n_fft: STFT window size.
            hop_length: STFT hop length.
            freq_compression: one of "linear", "mel", "mfcc".
            n_freq_bins: number of output frequency bins (image width).
            f_min: lower bound of the frequency band of interest.
            f_max: upper bound of the frequency band of interest.
            seq_len: length (in frames) of the sampled subsequence.
            augmentation: enable random amplitude/time/pitch/noise transforms.
            noise_files: optional noise recordings for noise augmentation.

        Raises:
            ValueError: if ``freq_compression`` is not a supported mode.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        # Avoid the shared-mutable-default-argument pitfall.
        noise_files = noise_files if noise_files is not None else []

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max

        # mel: log transformation of freq (Hz scale to Mel scale)
        # attention: Mel-spectrograms as a network input led to an excessive
        # loss of resolution in higher frequency bands, which was a big
        # problem considering the high-frequency pulsed calls and whistles.
        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if freq_compression not in valid_freq_compressions:
            # The original code passed the template and the builtin
            # format(a, b) as two separate ValueError args, so the message
            # was never actually formatted.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    freq_compression, valid_freq_compressions))
        self.freq_compression = freq_compression

        # combine a RegExp pattern into pattern objects for pattern matching
        self.possible_call_labels = re.compile("|".join(["call"]))
        self.possible_nocall_labels = re.compile("|".join(["noise"]))

        self._logger.debug("Number of files : {}".format(len(self.file_names)))

        _n_calls = 0
        for f in self.file_names:
            if self.is_call(f):
                _n_calls += 1

        self._logger.debug("Number of calls: {}".format(_n_calls))
        self._logger.debug(
            "Number of noise: {}".format(len(self.file_names) - _n_calls))

        self.augmentation = augmentation

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),  # return: a vector tensor
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]
        self.file_reader = AsyncFileReader()
        # if user chooses to not cache .spec by omitting the directory
        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            # where .spec is created and stored
            # n_fft, hop_length: meta in spec_dict
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )
        if augmentation:
            self._logger.debug(
                "Init augmentation transforms for time and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            self._logger.debug("Running without augmentation")
        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr,
                                   n_mels=n_freq_bins,
                                   f_min=f_min,
                                   f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
                T.M2MFCC())
        else:
            # Raising a plain string is a TypeError in Python 3.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions))
        # Ensure the attribute exists even when augmentation is off, so later
        # reads cannot hit an AttributeError.
        self.t_addnoise = None
        if augmentation:
            if noise_files:
                self._logger.debug(
                    "Init augmentation transform for random noise addition")
                self.t_addnoise = T.RandomAddNoise(
                    noise_files,
                    self.t_spectrogram,
                    T.Compose(self.t_timestretch, self.t_pitchshift,
                              self.t_compr_f),
                    min_length=seq_len,
                    return_original=True
                )  # if return_original = True, both augmented and original specs are returned
        self.t_compr_a = T.Amp2Db(
            min_level_db=DefaultSpecDatasetOps["min_level_db"])
        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
        self.t_subseq = T.PaddedSubsequenceSampler(seq_len,
                                                   dim=1,
                                                   random=augmentation)
# --- Example #5 ---
    def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=4096,
        hop_length=441,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=0,
        f_max=18000,
        seq_len=128,
        augmentation=False,
        noise_files_train=None,
        noise_files_val=None,
        noise_files_test=None,
        random=False,
        *args,
        **kwargs
    ):
        """Build the denoising (noise2noise) spectrogram transform pipeline.

        Args:
            file_names: audio file paths to denoise.
            working_dir: base directory for the files (passed to super).
            cache_dir: if given, computed spectrograms are cached there.
            sr: target sample rate when loading audio.
            n_fft: STFT window size.
            hop_length: STFT hop length.
            freq_compression: one of "linear", "mel", "mfcc".
            n_freq_bins: number of output frequency bins.
            f_min: lower bound of the frequency band of interest.
            f_max: upper bound of the frequency band of interest.
            seq_len: length (in frames) of the sampled subsequence.
            augmentation: enable intensity/time/pitch augmentation (train).
            noise_files_train: real-world noise files for the train split.
            noise_files_val: real-world noise files for the val split.
            noise_files_test: real-world noise files for the test split.
            random: random (vs. deterministic) behavior flag stored on self.

        Raises:
            ValueError: if ``freq_compression`` is invalid, or if the noise
                files do not match the dataset split (see below).
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        # Avoid the shared-mutable-default-argument pitfall.
        noise_files_train = noise_files_train if noise_files_train is not None else []
        noise_files_val = noise_files_val if noise_files_val is not None else []
        noise_files_test = noise_files_test if noise_files_test is not None else []

        self.sp = signal.signal_proc()

        # Parameters for the synthetic-noise distributions used elsewhere.
        self.df = 15.0
        self.exp_e = 0.1
        self.bin_pow = 2.0
        self.gaus_mean = 0.0
        self.gaus_stdv = 12.5
        self.poisson_lambda = 15.0
        self.orig_noise_value = -5

        self.f_min = f_min
        self.f_max = f_max
        self.n_fft = n_fft
        self.random = random
        self.hop_length = hop_length
        self.augmentation = augmentation
        self.file_reader = AsyncFileReader()
        self.noise_files_val = noise_files_val
        self.noise_files_test = noise_files_test
        self.freq_compression = freq_compression
        self.noise_files_train = noise_files_train

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if self.freq_compression not in valid_freq_compressions:
            # Fixes the original error path: it referenced the misspelled
            # attribute `self.freq_compressio` (NameError) and passed the
            # template plus builtin format(a, b) as two separate args.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions))

        self._logger.debug(
            "Number of files to denoise : {}".format(len(self.file_names))
        )

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]

        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader())

        if self.augmentation:
            self._logger.debug("Init augmentation transforms for intensity, time, and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            # only for noise augmentation during validation phase - intensity,
            # time and pitch augmentation is not used during validation/test
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
            self._logger.debug("Running without intensity, time, and pitch augmentation")

        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
            self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
        else:
            # Raising a plain string is a TypeError in Python 3.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions))

        # NOTE(review): min_snr (-2) is numerically greater than max_snr (-8)
        # in all three branches below — confirm RandomAddNoise's expected
        # argument ordering; values are kept unchanged here.
        if self.augmentation and self.noise_files_train and self.dataset_name == "train":
            self._logger.debug("Init training real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_train,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        elif not self.augmentation and self.noise_files_val and self.dataset_name == "val":
            self._logger.debug("Init validation real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_val,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        elif not self.augmentation and self.noise_files_test and self.dataset_name == "test":
            self._logger.debug("Init test real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_test,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        else:
            self.t_addnoise = None
            # Raising a plain string is a TypeError in Python 3; keep the
            # original message but raise a real exception.
            raise ValueError(
                "ERROR: Init noise files for noise adding does not have a proper setup per split!")

        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )

        self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)