def __init__(self, config: dict):
    """Create the spectrum and CMVN stages and validate the global-CMVN size.

    Args:
        config: Hyperparameter container passed unchanged to the parent
            constructor, to ``Spectrum`` and to ``CMVN``. It is read through
            attribute access (``config.type``, ``config.channel``, ...), so
            despite the ``dict`` annotation it is presumably an HParams-like
            object -- TODO confirm against callers.

    Raises:
        AssertionError: If ``config.type`` is ``'MelSpectrum'``,
            ``self.cmvn.global_cmvn`` is truthy, and
            ``filterbank_channel_count * channel`` differs from
            ``len(config.global_mean)``.
    """
    super().__init__(config)
    self.spect = Spectrum(config)
    self.cmvn = CMVN(config)

    # global cmvn dim == feature dim
    if config.type == 'MelSpectrum' and self.cmvn.global_cmvn:
        feature_dim = config.filterbank_channel_count * config.channel
        cmvn_dim = len(config.global_mean)
        if feature_dim != cmvn_dim:
            # Raised explicitly instead of via ``assert`` so the check is
            # still enforced under ``python -O``; the exception type and
            # message are the same as before.
            raise AssertionError(
                'Error, feature dim {} is not equals to cmvn dim {}'.format(
                    feature_dim, cmvn_dim))
def __init__(self, config: dict):
    """Create the spectrum and CMVN stages, validate sizes, and print params.

    Args:
        config: Hyperparameter container passed unchanged to the parent
            constructor, to ``Spectrum`` and to ``CMVN``. It is read through
            attribute access (``config.type``, ``config.channel``, ...), so
            despite the ``dict`` annotation it is presumably an HParams-like
            object -- TODO confirm against callers.

    Raises:
        AssertionError: If ``config.type`` is ``"Fbank"``,
            ``self.cmvn.global_cmvn`` is truthy, and
            ``filterbank_channel_count * channel`` differs from
            ``len(config.global_mean)``.
    """
    super().__init__(config)
    self.spect = Spectrum(config)
    self.cmvn = CMVN(config)

    # global cmvn dim == feature dim
    if config.type == "Fbank" and self.cmvn.global_cmvn:
        feature_dim = config.filterbank_channel_count * config.channel
        cmvn_dim = len(config.global_mean)
        if feature_dim != cmvn_dim:
            # Raised explicitly instead of via ``assert`` so the check is
            # still enforced under ``python -O``; the exception type and
            # message are the same as before.
            raise AssertionError(
                "Error, feature dim {} is not equals to cmvn dim {}".format(
                    feature_dim, cmvn_dim))

    # NOTE(review): unconditional stdout print; ``self.config`` is presumably
    # set by the parent constructor -- confirm.
    print("Fbank params: ", self.config)
def test_spectrum(self):
    """Run ``Spectrum`` on two example wavs and verify its output.

    For both files the output tensor must have rank 2. For the file bound to
    ``wav_path_16k`` the top-left 2x5 corner of the output is additionally
    compared against stored reference values.
    """
    main_root = Path(os.environ["MAIN_ROOT"])
    wav_path_16k = str(main_root.joinpath("examples/sm1_cln.wav"))
    wav_path_8k = str(main_root.joinpath("examples/english.wav"))

    # Reference values for the [0:2, 0:5] slice of the sm1_cln.wav output.
    expected_corner = np.array([
        [9.819611, 2.84503, 3.660894, 2.7779, 1.212233],
        [9.328745, 2.553949, 3.276319, 3.000918, 2.499342],
    ])

    def concrete(tensor):
        """Return the tensor's value via ``numpy()`` (eager) or ``eval()``."""
        if tf.executing_eagerly():
            return tensor.numpy()
        return tensor.eval()

    with self.session():
        for wav_file in (wav_path_8k, wav_path_16k):
            reader = ReadWav.params().instantiate()
            samples, rate = reader(wav_file)

            spectrum_params = Spectrum.params({
                "window_length": 0.025,
                "dither": 0.0
            })
            extractor = spectrum_params.instantiate()
            features = extractor(samples, rate)

            self.assertEqual(concrete(tf.rank(features)), 2)

            # Only the sm1_cln.wav file has stored reference values.
            if wav_file != wav_path_16k:
                continue
            self.assertAllClose(
                concrete(features)[0:2, 0:5],
                expected_corner,
                rtol=1e-05,
                atol=1e-05,
            )
def params(cls, config=None):
    """Build the Fbank hyperparameter set, optionally overridden by config.

    The ``Spectrum`` parameters are appended first (created with
    ``output_type=1`` and ``is_fbank=True``), then the parameters below are
    registered with their defaults.

    Args:
        config: Optional overrides, applied with ``hparams.parse(config, True)``
            after all defaults have been registered. Parameters registered
            by this method:
              upper_frequency_limit (default 0)
              lower_frequency_limit (default 60)
              filterbank_channel_count (default 40)
              delta_delta (default False)
              order (default 2)
              window (default 2)

    Returns:
        The ``HParams`` object. Its ``type`` is always set to "Fbank" and a
        ``channel`` parameter is added after parsing: 1, or ``order + 1``
        when ``delta_delta`` is truthy.
    """
    hparams = HParams(cls=cls)

    # spectrum
    hparams.append(Spectrum.params({"output_type": 1, "is_fbank": True}))

    # fbank parameters first, then delta parameters; registration order is
    # the same as before.
    defaults = (
        ("upper_frequency_limit", 0),
        ("lower_frequency_limit", 60),
        ("filterbank_channel_count", 40),
        ("delta_delta", False),
        ("order", 2),
        ("window", 2),
    )
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.parse(config, True)

    # Set after parsing, so these are not taken from config.
    hparams.type = "Fbank"
    hparams.add_hparam("channel", 1)
    if hparams.delta_delta:
        hparams.channel = hparams.order + 1

    return hparams
def params(cls, config=None):
    """Build the MelSpectrum hyperparameter set, optionally overridden.

    The ``Spectrum`` parameters are appended first, then the mel-spectrum and
    delta parameters are registered with their defaults.

    Args:
        config: Optional overrides, applied with ``hparams.parse(config, True)``
            after all defaults have been registered. Defaults quoted for
            parameters that this method does not set itself come from
            ``Spectrum.params`` and are not verified here.
              window_length: Window length in seconds.
                  (float, default = 0.025)
              frame_length: Hop length in seconds. (float, default = 0.010)
              snip_edges: If True, the last frame (shorter than
                  window_length) will be cut off. If False, 1 // 2
                  frame_length data will be padded to data.
                  (bool, default = True)
              raw_energy: If 1, compute frame energy before preemphasis and
                  windowing. If 2, compute frame energy after preemphasis
                  and windowing. (int, default = 1)
              preEph_coeff: Coefficient for use in frame-signal preemphasis.
                  (float, set to 0.0 here)
              window_type: Type of window
                  ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
                  (string, set to "hann" here)
              remove_dc_offset: Subtract mean from waveform on each frame.
                  (bool, set to False here)
              is_fbank: If true, compute power spectrum without frame
                  energy. If false, use the frame energy instead of the
                  square of the constant component of the signal.
                  (bool, set to True here)
              output_type: If 1, return power spectrum. If 2, return
                  log-power spectrum. If 3, return magnitude spectrum.
                  (int, set to 3 here)
              dither: Dithering constant (0.0 means no dither).
                  (float, set to 0.0 here)
              upper_frequency_limit: High cutoff frequency for mel bins
                  (if <= 0, offset from Nyquist). (default = 0)
              lower_frequency_limit: Low cutoff frequency for mel bins.
                  (default = 60)
              filterbank_channel_count: Number of triangular mel-frequency
                  bins. (default = 40)
              sample_rate: (default = -1)
              delta_delta: (default = False)
              order: (default = 2)
              window: (default = 2)

    Returns:
        The ``HParams`` object. Its ``type`` is always set to 'MelSpectrum'
        and a ``channel`` parameter is added after parsing: 1, or
        ``order + 1`` when ``delta_delta`` is truthy.
    """
    hparams = HParams(cls=cls)

    # spectrum
    spectrum_overrides = {
        'output_type': 3,
        'is_fbank': True,
        'preEph_coeff': 0.0,
        'window_type': 'hann',
        'dither': 0.0,
        'remove_dc_offset': False
    }
    hparams.append(Spectrum.params(spectrum_overrides))

    # mel_spectrum parameters first, then delta parameters; registration
    # order is the same as before.
    defaults = (
        ('upper_frequency_limit', 0),
        ('lower_frequency_limit', 60),
        ('filterbank_channel_count', 40),
        ('sample_rate', -1),
        ('delta_delta', False),
        ('order', 2),
        ('window', 2),
    )
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.parse(config, True)

    # Set after parsing, so these are not taken from config.
    hparams.type = 'MelSpectrum'
    hparams.add_hparam('channel', 1)
    if hparams.delta_delta:
        hparams.channel = hparams.order + 1

    return hparams
def params(cls, config=None):
    """Build the Fbank hyperparameter set, optionally overridden by config.

    The ``Spectrum`` parameters are appended first (created with
    ``output_type=1`` and ``is_fbank=True``), then the fbank and delta
    parameters are registered with their defaults.

    Args:
        config: Optional overrides, applied with ``hparams.parse(config, True)``
            after all defaults have been registered. Defaults quoted for
            parameters that this method does not set itself come from
            ``Spectrum.params`` and are not verified here.
              'window_length': Window length in seconds.
                  (float, default = 0.025)
              'frame_length': Hop length in seconds.
                  (float, default = 0.010)
              'snip_edges': If 1, the last frame (shorter than
                  window_length) will be cut off. If 2, 1 // 2 frame_length
                  data will be padded to data. (int, default = 1)
              'preEph_coeff': Coefficient for use in frame-signal
                  preemphasis. (float, default = 0.97)
              'window_type': Type of window
                  ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
                  (string, default = "povey")
              'remove_dc_offset': Subtract mean from waveform on each frame.
                  (bool, default = true)
              'is_fbank': If true, compute power spectrum without frame
                  energy. If false, use the frame energy instead of the
                  square of the constant component of the signal.
                  (bool, set to True here)
              'is_log10': If true, use log10 for fbank. If false, use loge.
                  (bool, default = False)
              'output_type': If 1, return power spectrum. If 2, return
                  log-power spectrum. (int, set to 1 here)
              'upper_frequency_limit': High cutoff frequency for mel bins
                  (if <= 0, offset from Nyquist). (default = 0)
              'lower_frequency_limit': Low cutoff frequency for mel bins.
                  (default = 60)
              'filterbank_channel_count': Number of triangular mel-frequency
                  bins. (default = 40)
              'dither': Dithering constant (0.0 means no dither).
                  (float, default = 1)
              'delta_delta': (default = False)
              'order': (default = 2)
              'window': (default = 2)

    Returns:
        The ``HParams`` object. Its ``type`` is always set to "Fbank" and a
        ``channel`` parameter is added after parsing: 1, or ``order + 1``
        when ``delta_delta`` is truthy.
    """
    hparams = HParams(cls=cls)

    # spectrum
    hparams.append(Spectrum.params({"output_type": 1, "is_fbank": True}))

    # fbank parameters first, then delta parameters; registration order is
    # the same as before.
    defaults = (
        ("upper_frequency_limit", 0),
        ("lower_frequency_limit", 60),
        ("filterbank_channel_count", 40),
        ("is_log10", False),
        ("delta_delta", False),
        ("order", 2),
        ("window", 2),
    )
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.parse(config, True)

    # Set after parsing, so these are not taken from config.
    hparams.type = "Fbank"
    hparams.add_hparam("channel", 1)
    if hparams.delta_delta:
        hparams.channel = hparams.order + 1

    return hparams