def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Three keys are registered:
          --sample_rate   : Waveform data sample frequency (must match the
                            waveform file, if specified there). (default = 16000)
          --window_length : Window length in seconds. (float, default = 0.030)
          --frame_length  : Hop length in seconds. (float, default = 0.010)
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('window_length', 0.030),
        ('frame_length', 0.010),
        ('sample_rate', 16000),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Three keys are registered:
        window_length(float, default=0.030),
        frame_length(float, default=0.010),
        sample_rate(default=16000).
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('window_length', 0.030),
        ('frame_length', 0.010),
        ('sample_rate', 16000),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config: dict = None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. One key is registered:
        embedding_size(int, default=512).
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    hparams = HParams(cls=cls)
    hparams.add_hparam('embedding_size', 512)

    # Nothing to override: hand back the defaults untouched.
    if config is None:
        return hparams

    hparams.override_from_dict(config)
    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Two keys are registered:
        audio_channels(int, default=1),
        sample_rate(float, default=16000.0).
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('audio_channels', 1),
        ('sample_rate', 16000.0),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def create_estimator(self):
    """
    Build a `tf.estimator.Estimator` from the solver section of `self.config`.

    Session and run settings are read from `self.config['solver']['run_config']`;
    checkpoint/summary settings and the model directory are read from
    `self.config['solver']['saver']`. The distribution strategy is obtained
    from `utils.get_distribution_strategy` using the GPU count reported by
    `utils.gpu_device_names`.

    :return: the constructed `tf.estimator.Estimator`.
    """
    # Model params: an empty HParams object handed to the estimator as `params`.
    estimator_params = HParams()

    # Model function.
    model_fn = self.model_fn()

    # Multi-GPU setup.
    devices, num_gpu = utils.gpu_device_names()
    strategy = utils.get_distribution_strategy(num_gpu)
    logging.info('Device: {}/{}'.format(num_gpu, devices))

    # Run / saver configuration sections.
    run_conf = self.config['solver']['run_config']
    saver_conf = self.config['solver']['saver']

    session_config = tf.ConfigProto(
        allow_soft_placement=run_conf['allow_soft_placement'],
        log_device_placement=run_conf['log_device_placement'],
        intra_op_parallelism_threads=run_conf['intra_op_parallelism_threads'],
        inter_op_parallelism_threads=run_conf['inter_op_parallelism_threads'],
        gpu_options=tf.GPUOptions(allow_growth=run_conf['allow_growth']),
    )

    run_config = tf.estimator.RunConfig(  #pylint: disable=no-member
        tf_random_seed=run_conf['tf_random_seed'],
        session_config=session_config,
        save_summary_steps=saver_conf['save_summary_steps'],
        keep_checkpoint_max=saver_conf['max_to_keep'],
        log_step_count_steps=run_conf['log_step_count_steps'],
        train_distribute=strategy,
        device_fn=None,
        protocol=None,
        eval_distribute=None,
        experimental_distribute=None,
    )

    # Instantiate the Estimator and return it directly.
    return tf.estimator.Estimator(  #pylint: disable=no-member
        model_fn=model_fn,
        model_dir=saver_conf['model_path'],
        config=run_config,
        params=estimator_params,
        warm_start_from=None,
    )
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. One key is registered:
        sample_rate(int, default=16000).
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    hparams = HParams(cls=cls)
    hparams.add_hparam('sample_rate', 16000)

    # Nothing to override: hand back the defaults untouched.
    if config is None:
        return hparams

    hparams.override_from_dict(config)
    return hparams
def speech_params(sr=16000,
                  bins=40,
                  dither=True,
                  use_delta_deltas=True,
                  cmvn=False,
                  cmvn_path=''):
    '''
    Build the speech feature hyperparameters.

    :param sr: stored as `audio_sample_rate`; `sr / 2.0` is stored as
        `audio_upper_edge_hertz`.
    :param bins: stored as `audio_num_mel_bins`.
    :param dither: if truthy, `audio_dither` is `1.0 / np.iinfo(np.int16).max`,
        otherwise 0.0.
    :param use_delta_deltas: stored as `audio_add_delta_deltas`.
    :param cmvn: stored as `audio_global_cmvn`.
    :param cmvn_path: stored as `audio_cmvn_path`.
    :return: an HParams object holding the feature settings.
    '''
    # np.iinfo is only consulted when dithering is requested.
    dither_amount = 1.0 / np.iinfo(np.int16).max if dither else 0.0

    # (name, value) pairs, registered in this order.
    settings = (
        ("audio_sample_rate", sr),
        ("audio_channels", 1),
        ("audio_preemphasis", 0.97),
        ("audio_dither", dither_amount),
        ("audio_frame_length", 25.0),
        ("audio_frame_step", 10.0),
        ("audio_lower_edge_hertz", 20.0),
        ("audio_upper_edge_hertz", sr / 2.0),
        ("audio_num_mel_bins", bins),
        ("audio_add_delta_deltas", use_delta_deltas),
        ("num_zeropad_frames", 0),
        ("audio_global_cmvn", cmvn),
        ("audio_cmvn_path", cmvn_path),
    )

    hparams = HParams()
    for name, value in settings:
        hparams.add_hparam(name, value)
    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Fourteen keys are registered:
          --sample_rate              : Sample frequency of waveform data.
                                       (int, default = 16000)
          --window_length            : Window length in seconds.
                                       (float, default = 0.025)
          --frame_length             : Hop length in seconds.
                                       (float, default = 0.010)
          --snip_edges               : If True, the last frame (shorter than
                                       window_length) will be cutoff. If False,
                                       1 // 2 frame_length data will be padded
                                       to data. (bool, default = True)
          --raw_energy               : If 1, compute frame energy before
                                       preemphasis and windowing. If 2, compute
                                       frame energy after preemphasis and
                                       windowing. (int, default = 1)
          --preeph_coeff             : Coefficient for use in frame-signal
                                       preemphasis. (float, default = 0.0)
          --window_type              : Type of window
                                       ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
                                       (string, default = "hann")
          --remove_dc_offset         : Subtract mean from waveform on each frame.
                                       (bool, default = False)
          --is_fbank                 : If true, compute power spectrum without
                                       frame energy. If false, use the frame
                                       energy instead of the square of the
                                       constant component of the signal.
                                       (bool, default = True)
          --output_type              : If 1, return power spectrum. If 2, return
                                       log-power spectrum. If 3, return
                                       magnitude spectrum. (int, default = 3)
          --upper_frequency_limit    : High cutoff frequency for mel bins
                                       (if <= 0, offset from Nyquist).
                                       (default = 0)
          --lower_frequency_limit    : Low cutoff frequency for mel bins.
                                       (default = 60)
          --filterbank_channel_count : Number of triangular mel-frequency bins.
                                       (default = 40)
          --dither                   : Dithering constant (0.0 means no dither).
                                       (float, default = 0.0)
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    hparams = HParams(cls=cls)

    window_length = 0.025
    frame_length = 0.010
    output_type = 3
    is_fbank = True
    preeph_coeff = 0.0
    window_type = 'hann'
    dither = 0.0
    remove_dc_offset = False
    upper_frequency_limit = 0
    lower_frequency_limit = 60
    filterbank_channel_count = 40
    sample_rate = 16000
    snip_edges = True
    raw_energy = 1

    hparams.add_hparam('window_length', window_length)
    hparams.add_hparam('snip_edges', snip_edges)
    hparams.add_hparam('raw_energy', raw_energy)
    hparams.add_hparam('frame_length', frame_length)
    hparams.add_hparam('output_type', output_type)
    hparams.add_hparam('is_fbank', is_fbank)
    hparams.add_hparam('preeph_coeff', preeph_coeff)
    hparams.add_hparam('window_type', window_type)
    hparams.add_hparam('dither', dither)
    hparams.add_hparam('remove_dc_offset', remove_dc_offset)
    hparams.add_hparam('upper_frequency_limit', upper_frequency_limit)
    hparams.add_hparam('lower_frequency_limit', lower_frequency_limit)
    hparams.add_hparam('filterbank_channel_count', filterbank_channel_count)
    hparams.add_hparam('sample_rate', sample_rate)

    # Apply caller-supplied overrides; without this step `config` would be
    # accepted but silently ignored.
    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Ten keys are registered:
          --sample_rate      : Sample frequency of waveform data.
                               (int, default = 16000)
          --window_length    : Window length in seconds. (float, default = 0.025)
          --frame_length     : Hop length in seconds. (float, default = 0.010)
          --snip_edges       : If 1, the last frame (shorter than window_length)
                               will be cutoff. If 2, 1 // 2 frame_length data
                               will be padded to data. (int, default = 2)
          --raw_energy       : If 1, compute frame energy before preemphasis and
                               windowing. If 2, compute frame energy after
                               preemphasis and windowing. (int, default = 1)
          --preeph_coeff     : Coefficient for use in frame-signal preemphasis.
                               (float, default = 0.97)
          --window_type      : Type of window
                               ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
                               (string, default = "povey")
          --remove_dc_offset : Subtract mean from waveform on each frame.
                               (bool, default = True)
          --is_fbank         : If true, compute power spectrum without frame
                               energy. If false, use the frame energy instead of
                               the square of the constant component of the
                               signal. (bool, default = False)
          --output_type      : If 1, return power spectrum. If 2, return
                               log-power spectrum. (int, default = 2)
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('window_length', 0.025),
        ('frame_length', 0.010),
        ('output_type', 2),
        ('sample_rate', 16000),
        ('snip_edges', 2),
        ('raw_energy', 1),
        ('preeph_coeff', 0.97),
        ('window_type', 'povey'),
        ('remove_dc_offset', True),
        ('is_fbank', False),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Registered keys:

        Framing, spectrum and filterbank
          --sample_rate : Sample rate of the signal we are working with.
              (int, default = 16000)
          --window_length : Window length in seconds. (float, default = 0.025)
          --frame_length : Hop length in seconds. (float, default = 0.010)
          --snip_edges : If true, the last frame (shorter than window_length)
              will be cutoff. If false, 1 // 2 frame_length data will be padded
              to data. (bool, default = True)
          --raw_energy : If 1, compute frame energy before preemphasis and
              windowing. If 2, compute frame energy after preemphasis and
              windowing. (int, default = 1)
          --preeph_coeff : Coefficient for use in frame-signal preemphasis.
              (float, default = 0.97)
          --window_type : Type of window
              ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
              (string, default = "povey")
          --remove_dc_offset : Subtract mean from waveform on each frame.
              (bool, default = True)
          --is_fbank : If true, compute power spectrum without frame energy.
              If false, use the frame energy instead of the square of the
              constant component of the signal. (bool, default = True)
          --output_type : If 1, return power spectrum. If 2, return log-power
              spectrum. (int, default = 1)
          --upper_frequency_limit : High cutoff frequency for mel bins
              (if <= 0, offset from Nyquist). (default = 0)
          --lower_frequency_limit : Low cutoff frequency for mel bins.
              (float, default = 20.0)
          --filterbank_channel_count : Number of triangular mel-frequency bins.
              (float, default = 23.0)
          --dither : Dithering constant (0.0 means no dither).
              (float, default = 0.0)
          --is_log10 : (bool, default = False)

        Pitch extraction
          --preemph_coeff : Coefficient for use in signal preemphasis
              (deprecated). Distinct from `preeph_coeff` above.
              (float, default = 0.0)
          --min_f0 : min. F0 to search for (Hz). (float, default = 50.0)
          --max_f0 : max. F0 to search for (Hz). (float, default = 400.0)
          --soft_min_f0 : Minimum f0, applied in soft way, must not exceed
              min_f0. (float, default = 10.0)
          --penalty_factor : Cost factor for F0 change. (float, default = 0.1)
          --lowpass_cutoff : Cutoff frequency for low-pass filter (Hz).
              (float, default = 1000.0)
          --resample_freq : Frequency that we down-sample the signal to. Must be
              more than twice lowpass_cutoff. (float, default = 4000.0)
          --delta_pitch : Smallest relative change in pitch that the algorithm
              measures. (float, default = 0.005)
          --nccf_ballast : Increasing this factor reduces NCCF for quiet frames.
              (float, default = 7000.0)
          --lowpass_filter_width : Integer that determines filter width of the
              low-pass filter, more gives sharper filter. (int, default = 1)
          --upsample_filter_width : Integer that determines filter width when
              upsampling NCCF. (int, default = 5)
          --max_frames_latency : Maximum number of frames of latency that pitch
              tracking may introduce into the feature processing (affects output
              only if frames_per_chunk > 0 and simulate_first_pass_online is
              true). (int, default = 0)
          --frames_per_chunk : Only relevant for offline pitch extraction; can
              be set to a small nonzero value, such as 10, for better feature
              compatibility with online decoding (affects energy normalization
              in the algorithm). (int, default = 0)
          --simulate_first_pass_online : If true, output features that
              correspond to what an online decoder would see in the first pass
              of decoding, not the final version of the features. Relevant if
              frames_per_chunk > 0. (bool, default = False)
          --recompute_frame : Only relevant for online pitch extraction, or for
              compatibility with it. The frame at which some of the forward
              pointers are recomputed after revising the estimate of the signal
              energy. Relevant if frames_per_chunk > 0. (int, default = 500)
          --nccf_ballast_online : Mainly useful for debug; affects how the NCCF
              ballast is computed. (bool, default = False)

        Additional pitch-feature settings (not described in the original notes
        for this function)
          --pitch_scale (float, default = 2.0)
          --pov_offset (float, default = 0.0)
          --pov_scale (float, default = 2.0)
          --delta_pitch_scale (float, default = 10.0)
          --delta_pitch_noise_stddev (float, default = 0.005)
          --normalization_left_context (int, default = 75)
          --normalization_right_context (int, default = 75)
          --delta_window (int, default = 2)
          --delay (int, default = 0)
          --add_pov_feature (bool, default = True)
          --add_normalized_log_pitch (bool, default = True)
          --add_delta_pitch (bool, default = True)
          --add_raw_log_pitch (bool, default = False)
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('sample_rate', 16000),
        ('snip_edges', True),
        ('preemph_coeff', 0.0),
        ('dither', 0.0),
        ('min_f0', 50.0),
        ('max_f0', 400.0),
        ('soft_min_f0', 10.0),
        ('penalty_factor', 0.1),
        ('lowpass_cutoff', 1000.0),
        ('resample_freq', 4000.0),
        ('delta_pitch', 0.005),
        ('nccf_ballast', 7000.0),
        ('lowpass_filter_width', 1),
        ('upsample_filter_width', 5),
        ('max_frames_latency', 0),
        ('frames_per_chunk', 0),
        ('simulate_first_pass_online', False),
        ('recompute_frame', 500),
        ('nccf_ballast_online', False),
        ('upper_frequency_limit', 0),
        ('lower_frequency_limit', 20.0),
        ('filterbank_channel_count', 23.0),
        ('window_length', 0.025),
        ('frame_length', 0.010),
        ('output_type', 1),
        ('raw_energy', 1),
        ('preeph_coeff', 0.97),
        ('window_type', 'povey'),
        ('remove_dc_offset', True),
        ('is_fbank', True),
        ('is_log10', False),
        ('pitch_scale', 2.0),
        ('pov_offset', 0.0),
        ('pov_scale', 2.0),
        ('delta_pitch_scale', 10.0),
        ('delta_pitch_noise_stddev', 0.005),
        ('normalization_left_context', 75),
        ('normalization_right_context', 75),
        ('delta_window', 2),
        ('delay', 0),
        ('add_pov_feature', True),
        ('add_normalized_log_pitch', True),
        ('add_delta_pitch', True),
        ('add_raw_log_pitch', False),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Seven keys are registered:
          --norm_means : Flag of norm_means. (bool, default = True)
          --norm_vars  : Flag of norm_vars. (bool, default = False)
          --utt2spk    : Use for speaker CMVN. (string, default = None)
          --spk2utt    : Rspecifier for speaker to utterance-list map.
                         (string, default = None)
          --reverse    : Flag of reverse. (bool, default = False)
          --std_floor  : Floor to std. (float, default = 1.0e-20)
          --filetype   : Type of input file. (string, default = 'mat')
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('norm_means', True),
        ('norm_vars', False),
        ('utt2spk', None),
        ('spk2utt', None),
        ('reverse', False),
        ('std_floor', 1.0e-20),
        ('filetype', 'mat'),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Registered keys:

        Pitch extraction
          --sample_rate : Waveform data sample frequency (must match the
              waveform file, if specified there). (default = 16000)
          --window_length : Frame length in seconds. (float, default = 0.025)
          --frame_length : Frame shift in seconds. (float, default = 0.010)
          --snip_edges : If this is set to false, the incomplete frames near the
              ending edge won't be snipped, so that the number of frames is the
              file size divided by the frame-shift. This makes different types
              of features give the same number of frames.
              (bool, default = True)
          --preemph_coeff : Coefficient for use in signal preemphasis
              (deprecated). (float, default = 0.0)
          --min_f0 : min. F0 to search for (Hz). (float, default = 50.0)
          --max_f0 : max. F0 to search for (Hz). (float, default = 400.0)
          --soft_min_f0 : Minimum f0, applied in soft way, must not exceed
              min_f0. (float, default = 10.0)
          --penalty_factor : Cost factor for F0 change. (float, default = 0.1)
          --lowpass_cutoff : Cutoff frequency for low-pass filter (Hz).
              (float, default = 1000.0)
          --resample_freq : Frequency that we down-sample the signal to. Must be
              more than twice lowpass_cutoff. (float, default = 4000.0)
          --delta_pitch : Smallest relative change in pitch that the algorithm
              measures. (float, default = 0.005)
          --nccf_ballast : Increasing this factor reduces NCCF for quiet frames.
              (float, default = 7000.0)
          --lowpass_filter_width : Integer that determines filter width of the
              low-pass filter, more gives sharper filter. (int, default = 1)
          --upsample_filter_width : Integer that determines filter width when
              upsampling NCCF. (int, default = 5)
          --max_frames_latency : Maximum number of frames of latency that pitch
              tracking may introduce into the feature processing (affects output
              only if frames_per_chunk > 0 and simulate_first_pass_online is
              true). (int, default = 0)
          --frames_per_chunk : Only relevant for offline pitch extraction; can
              be set to a small nonzero value, such as 10, for better feature
              compatibility with online decoding (affects energy normalization
              in the algorithm). (int, default = 0)
          --simulate_first_pass_online : If true, output features that
              correspond to what an online decoder would see in the first pass
              of decoding, not the final version of the features. Relevant if
              frames_per_chunk > 0. (bool, default = False)
          --recompute_frame : Only relevant for online pitch extraction, or for
              compatibility with it. The frame at which some of the forward
              pointers are recomputed after revising the estimate of the signal
              energy. Relevant if frames_per_chunk > 0. (int, default = 500)
          --nccf_ballast_online : Mainly useful for debug; affects how the NCCF
              ballast is computed. (bool, default = False)

        Pitch post-processing
          --pitch_scale : Scaling factor for the final normalized log-pitch
              value. (float, default = 2.0)
          --pov_offset : Can be used to add an offset to the POV feature.
              Intended for use in online decoding as a substitute for CMN.
              (float, default = 0.0)
          --pov_scale : Scaling factor for final POV (probability of voicing)
              feature. (float, default = 2.0)
          --delta_pitch_scale : Term to scale the final delta log-pitch feature.
              (float, default = 10.0)
          --delta_pitch_noise_stddev : Standard deviation for noise added to the
              delta log-pitch (before scaling); should be about the same as the
              delta_pitch option to pitch creation. The purpose is to get rid of
              peaks in the delta-pitch caused by discretization of pitch values.
              (float, default = 0.005)
          --normalization_left_context : Left-context (in frames) for moving
              window normalization. (int, default = 75)
          --normalization_right_context : Right-context (in frames) for moving
              window normalization. (int, default = 75)
          --delta_window : Number of frames on each side of central frame, to
              use for delta window. (int, default = 2)
          --delay : Number of frames by which the pitch information is delayed.
              (int, default = 0)
          --add_pov_feature : If true, the warped NCCF is added to output
              features. (bool, default = True)
          --add_normalized_log_pitch : (bool, default = True)
          --add_delta_pitch : If true, time derivative of log-pitch is added to
              output features. (bool, default = True)
          --add_raw_log_pitch : If true, log(pitch) is added to output features.
              (bool, default = False)
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('window_length', 0.025),
        ('frame_length', 0.010),
        ('sample_rate', 16000),
        ('snip_edges', True),
        ('preemph_coeff', 0.0),
        ('min_f0', 50.0),
        ('max_f0', 400.0),
        ('soft_min_f0', 10.0),
        ('penalty_factor', 0.1),
        ('lowpass_cutoff', 1000.0),
        ('resample_freq', 4000.0),
        ('delta_pitch', 0.005),
        ('nccf_ballast', 7000.0),
        ('lowpass_filter_width', 1),
        ('upsample_filter_width', 5),
        ('max_frames_latency', 0),
        ('frames_per_chunk', 0),
        ('simulate_first_pass_online', False),
        ('recompute_frame', 500),
        ('nccf_ballast_online', False),
        ('pitch_scale', 2.0),
        ('pov_offset', 0.0),
        ('pov_scale', 2.0),
        ('delta_pitch_scale', 10.0),
        ('delta_pitch_noise_stddev', 0.005),
        ('normalization_left_context', 75),
        ('normalization_right_context', 75),
        ('delta_window', 2),
        ('delay', 0),
        ('add_pov_feature', True),
        ('add_normalized_log_pitch', True),
        ('add_delta_pitch', True),
        ('add_raw_log_pitch', False),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Nine keys are registered:
          --sample_rate     : Sample frequency of waveform data.
                              (int, default = 16000)
          --if_add_rir      : If true, add rir to audio data.
                              (bool, default = False)
          --rir_filelist    : FileList path of rir.
                              (string, default = 'rirlist.scp')
          --if_add_noise    : If true, add random noise to audio data.
                              (bool, default = False)
          --snr_min         : Minimum SNR adds to signal. (default = 0)
          --snr_max         : Maximum SNR adds to signal. (default = 30)
          --noise_filelist  : FileList path of noise.
                              (string, default = 'noiselist.scp')
          --if_add_aecres   : If true, add aecres to audio data.
                              (bool, default = False)
          --aecres_filelist : FileList path of aecres.
                              (string, default = 'aecreslist.scp')
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('sample_rate', 16000),
        ('if_add_rir', False),
        ('if_add_noise', False),
        ('rir_filelist', 'rirlist.scp'),
        ('noise_filelist', 'noiselist.scp'),
        ('snr_min', 0),
        ('snr_max', 30),
        ('if_add_aecres', False),
        ('aecres_filelist', 'aecreslist.scp'),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Twenty keys are registered:
          --sample_rate : Waveform data sample frequency (must match the
              waveform file, if specified there). (default = 16000)
          --window_length : Frame length. (float, default = 0.025; presumably
              seconds, i.e. 25 ms - confirm against the consumer)
          --frame_length : Frame shift. (float, default = 0.010; presumably
              seconds, i.e. 10 ms - confirm against the consumer)
          --snip_edges : If this is set to false, the incomplete frames near the
              ending edge won't be snipped, so that the number of frames is the
              file size divided by the frame-shift. This makes different types
              of features give the same number of frames.
              (bool, default = True)
          --preemph_coeff : Coefficient for use in signal preemphasis
              (deprecated). (float, default = 0.0)
          --min_f0 : min. F0 to search for (Hz). (float, default = 50.0)
          --max_f0 : max. F0 to search for (Hz). (float, default = 400.0)
          --soft_min_f0 : Minimum f0, applied in soft way, must not exceed
              min_f0. (float, default = 10.0)
          --penalty_factor : Cost factor for F0 change. (float, default = 0.1)
          --lowpass_cutoff : Cutoff frequency for low-pass filter (Hz).
              (float, default = 1000.0)
          --resample_freq : Frequency that we down-sample the signal to. Must be
              more than twice lowpass_cutoff. (float, default = 4000.0)
          --delta_pitch : Smallest relative change in pitch that the algorithm
              measures. (float, default = 0.005)
          --nccf_ballast : Increasing this factor reduces NCCF for quiet frames.
              (float, default = 7000.0)
          --lowpass_filter_width : Integer that determines filter width of the
              low-pass filter, more gives sharper filter. (int, default = 1)
          --upsample_filter_width : Integer that determines filter width when
              upsampling NCCF. (int, default = 5)
          --max_frames_latency : Maximum number of frames of latency that pitch
              tracking may introduce into the feature processing (affects output
              only if frames_per_chunk > 0 and simulate_first_pass_online is
              true). (int, default = 0)
          --frames_per_chunk : Only relevant for offline pitch extraction; can
              be set to a small nonzero value, such as 10, for better feature
              compatibility with online decoding (affects energy normalization
              in the algorithm). (int, default = 0)
          --simulate_first_pass_online : If true, output features that
              correspond to what an online decoder would see in the first pass
              of decoding, not the final version of the features. Relevant if
              frames_per_chunk > 0. (bool, default = False)
          --recompute_frame : Only relevant for online pitch extraction, or for
              compatibility with it. The frame at which some of the forward
              pointers are recomputed after revising the estimate of the signal
              energy. Relevant if frames_per_chunk > 0. (int, default = 500)
          --nccf_ballast_online : Mainly useful for debug; affects how the NCCF
              ballast is computed. (bool, default = False)
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('window_length', 0.025),
        ('frame_length', 0.010),
        ('sample_rate', 16000),
        ('snip_edges', True),
        ('preemph_coeff', 0.0),
        ('min_f0', 50.0),
        ('max_f0', 400.0),
        ('soft_min_f0', 10.0),
        ('penalty_factor', 0.1),
        ('lowpass_cutoff', 1000.0),
        ('resample_freq', 4000.0),
        ('delta_pitch', 0.005),
        ('nccf_ballast', 7000.0),
        ('lowpass_filter_width', 1),
        ('upsample_filter_width', 5),
        ('max_frames_latency', 0),
        ('frames_per_chunk', 0),
        ('simulate_first_pass_online', False),
        ('recompute_frame', 500),
        ('nccf_ballast_online', False),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters, optionally overridden by `config`.

    :param config: optional dict of overrides, applied on top of the defaults
        through `HParams.override_from_dict`. Seven keys are registered:
        norm_means(default=True), norm_vars(default=False),
        utt2spk(default=None), spk2utt(default=None),
        reverse(default=False), std_floor(default=1.0e-20),
        filetype(default='mat').
    :return: An object of class HParams, which is a set of hyperparameters as
        name-value pairs.
    """
    # (name, default) pairs, registered in this order.
    defaults = (
        ('norm_means', True),
        ('norm_vars', False),
        ('utt2spk', None),
        ('spk2utt', None),
        ('reverse', False),
        ('std_floor', 1.0e-20),
        ('filetype', 'mat'),
    )

    hparams = HParams(cls=cls)
    for name, value in defaults:
        hparams.add_hparam(name, value)

    if config is not None:
        hparams.override_from_dict(config)

    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters and apply optional overrides.

    :param config: contains sixteen optional parameters (``None`` keeps the
        defaults). Names are spelled exactly as they are registered.
        --sample_rate : Sample frequency of waveform data.
                        (int, default = 16000)
        --window_length : Window length in seconds. (float, default = 0.025)
        --frame_length : Hop length in seconds. (float, default = 0.010)
        --snip_edges : (default = True)
                       NOTE(review): earlier documentation described integer
                       values (1 = the last frame shorter than window_length
                       is cut off, 2 = data is padded); confirm which
                       encoding the consumer expects.
        --raw_energy : If 1, compute frame energy before preemphasis and
                       windowing. If 2, compute frame energy after
                       preemphasis and windowing. (int, default = 1)
        --preeph_coeff : Coefficient for use in frame-signal preemphasis.
                         (float, default = 0.97)
        --window_type : Type of window
                        ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
                        (string, default = "povey")
        --remove_dc_offset : Subtract mean from waveform on each frame.
                             (bool, default = True)
        --is_fbank : If true, compute power spectrum without frame energy.
                     If false, use the frame energy instead of the square of
                     the constant component of the signal.
                     (bool, default = True)
        --output_type : If 1, return power spectrum. If 2, return log-power
                        spectrum. (int, default = 1)
        --upper_frequency_limit : High cutoff frequency for mel bins
                                  (if < 0, offset from Nyquist).
                                  (float, default = 0.0)
        --lower_frequency_limit : Low cutoff frequency for mel bins.
                                  (float, default = 20.0)
        --filterbank_channel_count : Number of triangular mel-frequency bins.
                                     (float, default = 23.0)
        --coefficient_count : Number of cepstra in MFCC computation.
                              (int, default = 13)
        --cepstral_lifter : Constant that controls scaling of MFCCs.
                            (float, default = 22.0)
        --use_energy : (bool, default = True)
    :return: An object of class HParams, which is a set of hyperparameters
        as name-value pairs.
    """
    # Registration order follows the insertion order of this mapping.
    defaults = {
        'upper_frequency_limit': 0.0,
        'lower_frequency_limit': 20.0,
        'filterbank_channel_count': 23.0,
        'window_length': 0.025,
        'frame_length': 0.010,
        'output_type': 1,
        'sample_rate': 16000,
        'snip_edges': True,
        'raw_energy': 1,
        'preeph_coeff': 0.97,
        'window_type': 'povey',
        'remove_dc_offset': True,
        'is_fbank': True,
        'cepstral_lifter': 22.0,
        'coefficient_count': 13,
        'use_energy': True,
    }

    hparams = HParams(cls=cls)
    for key, value in defaults.items():
        hparams.add_hparam(key, value)

    if config is not None:
        hparams.override_from_dict(config)
    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters and apply optional overrides.

    :param config: contains five optional parameters (``None`` keeps the
        defaults):
        --sample_rate : Waveform data sample frequency (must match the
                        waveform file, if specified there).
                        (default = 16000)
        --window_length : Window length in seconds. (float, default = 0.025)
        --frame_length : Hop length in seconds. (float, default = 0.010)
        --snip_edges : If True, the last frame (shorter than window_length)
                       will be cutoff. If False, 1 // 2 frame_length data
                       will be padded to data. (bool, default = True)
        --remove_dc_offset : Subtract mean from waveform on each frame.
                             (bool, default = True)
    :return: An object of class HParams, which is a set of hyperparameters
        as name-value pairs.
    """
    # (name, default) pairs, listed in registration order.
    defaults = (
        ('window_length', 0.025),
        ('frame_length', 0.010),
        ('snip_edges', True),
        ('remove_dc_offset', True),
        ('sample_rate', 16000),
    )

    hparams = HParams(cls=cls)
    for key, value in defaults:
        hparams.add_hparam(key, value)

    if config is None:
        return hparams
    hparams.override_from_dict(config)
    return hparams
def params(cls, config=None):
    """
    Return an HParams object that carries only ``cls``.

    No hyperparameters are registered here.

    :param config: accepted for signature compatibility; it is not read.
    :return: An object of class HParams created with ``cls=cls``.
    """
    return HParams(cls=cls)
def params(cls, config=None):
    """
    Build the default hyperparameters and apply optional overrides.

    :param config: contains twenty-nine optional parameters (``None`` keeps
        the defaults): sample_rate(default=16000), snip_edges(default=True),
        preemph_coeff(default=0.0), min_f0(default=50.0),
        max_f0(default=400.0), soft_min_f0(default=10.0),
        penalty_factor(default=0.1), lowpass_cutoff(default=1000.0),
        resample_freq(default=4000.0), delta_pitch(default=0.005),
        nccf_ballast(default=7000.0), lowpass_filter_width(default=1),
        upsample_filter_width(default=5), max_frames_latency(default=0),
        frames_per_chunk(default=0),
        simulate_first_pass_online(default=False),
        recompute_frame(default=500), nccf_ballast_online(default=False),
        upper_frequency_limit(default=0),
        lower_frequency_limit(default=20.0),
        filterbank_channel_count(default=23.0),
        window_length(default=0.025), frame_length(default=0.010),
        output_type(default=1), raw_energy(default=1),
        preeph_coeff(default=0.97), window_type(default='povey'),
        remove_dc_offset(default=True), is_fbank(default=True).
    :return: An object of class HParams, which is a set of hyperparameters
        as name-value pairs.
    """
    # Registration order follows the insertion order of this mapping.
    # 'preemph_coeff' and 'preeph_coeff' are two distinct keys with different
    # defaults; both are registered on purpose to mirror the original.
    defaults = {
        'sample_rate': 16000,
        'snip_edges': True,
        'preemph_coeff': 0.0,
        'min_f0': 50.0,
        'max_f0': 400.0,
        'soft_min_f0': 10.0,
        'penalty_factor': 0.1,
        'lowpass_cutoff': 1000.0,
        'resample_freq': 4000.0,
        'delta_pitch': 0.005,
        'nccf_ballast': 7000.0,
        'lowpass_filter_width': 1,
        'upsample_filter_width': 5,
        'max_frames_latency': 0,
        'frames_per_chunk': 0,
        'simulate_first_pass_online': False,
        'recompute_frame': 500,
        'nccf_ballast_online': False,
        'upper_frequency_limit': 0,
        'lower_frequency_limit': 20.0,
        'filterbank_channel_count': 23.0,
        'window_length': 0.025,
        'frame_length': 0.010,
        'output_type': 1,
        'raw_energy': 1,
        'preeph_coeff': 0.97,
        'window_type': 'povey',
        'remove_dc_offset': True,
        'is_fbank': True,
    }

    hparams = HParams(cls=cls)
    for key, value in defaults.items():
        hparams.add_hparam(key, value)

    if config is not None:
        hparams.override_from_dict(config)
    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters and apply optional overrides.

    :param config: contains five optional parameters (``None`` keeps the
        defaults):
        --sample_rate : Waveform data sample frequency (must match the
                        waveform file, if specified there).
                        (default = 16000)
        --window_length : Window length in seconds. (float, default = 0.025)
        --frame_length : Hop length in seconds. (float, default = 0.010)
        --ceps_subband_num : Number of Ceps_subband. (int, default = 13)
        --tag_ceps_mean_norm : Flag of tag_ceps_mean_norm.
                               (bool, default = True)
    :return: An object of class HParams, which is a set of hyperparameters
        as name-value pairs.
    """
    hparams = HParams(cls=cls)

    # (name, default) pairs, listed in registration order.
    for key, value in (
        ('window_length', 0.025),
        ('frame_length', 0.010),
        ('ceps_subband_num', 13),
        ('tag_ceps_mean_norm', True),
        ('sample_rate', 16000),
    ):
        hparams.add_hparam(key, value)

    if config is not None:
        hparams.override_from_dict(config)
    return hparams
def test_hparams(self):
    """
    Exercise the HParams container API end to end.

    Covers construction from keyword arguments, attribute access,
    add/set/get/delete of a single hyperparameter, JSON export, the mapping
    protocol (``in``, ``[]`` read and write), deep copying, and the ``==`` /
    ``!=`` operators.
    """
    hparams = HParams(cls=self.__class__, name='fbank', n_mels=40)
    # Remove the 'cls' entry so values() and to_json() below contain only
    # 'name' and 'n_mels'.
    hparams.del_hparam('cls')
    self.assertEqual(hparams.name, 'fbank')
    self.assertEqual(hparams.n_mels, 40)
    self.assertDictEqual(hparams.values(), {'name': 'fbank', 'n_mels': 40})

    # add -> set -> get -> delete round trip on a temporary entry.
    hparams.add_hparam('sr', 8000)
    self.assertEqual(hparams.sr, 8000)
    hparams.set_hparam('sr', 16000)
    self.assertEqual(hparams.sr, 16000)
    self.assertEqual(hparams.get('sr'), 16000)
    hparams.del_hparam('sr')
    self.assertJsonEqual(hparams.to_json(), '{"name": "fbank", "n_mels": 40}')

    # Mapping protocol: membership, item read, item write.
    self.assertIn('name', hparams)
    self.assertEqual(hparams['name'], 'fbank')
    self.assertEqual(hparams['n_mels'], 40)
    hparams['n_mels'] = 80
    self.assertEqual(hparams['n_mels'], 80)

    # Spell out `==` and `!=` explicitly so that both operators are
    # exercised, for an equal pair and for an unequal pair.
    hparams2 = copy.deepcopy(hparams)
    self.assertTrue(hparams == hparams2)
    self.assertFalse(hparams != hparams2)
    hparams2['name'] = 'MFCC'
    self.assertFalse(hparams == hparams2)
    self.assertTrue(hparams != hparams2)
def params(cls, config=None):
    """
    Build the default hyperparameters and apply optional overrides.

    :param config: contains seven optional parameters (``None`` keeps the
        defaults): upper_frequency_limit(float, default=4000.0),
        lower_frequency_limit(float, default=20.0),
        filterbank_channel_count(float, default=40.0),
        window_length(float, default=0.025),
        frame_length(float, default=0.010),
        output_type(int, default=2),
        sample_rate(float, default=16000.0).
    :return: An object of class HParams, which is a set of hyperparameters
        as name-value pairs.
    """
    # Registration order follows the insertion order of this mapping.
    defaults = {
        'upper_frequency_limit': 4000.0,
        'lower_frequency_limit': 20.0,
        'filterbank_channel_count': 40.0,
        'window_length': 0.025,
        'frame_length': 0.010,
        'output_type': 2,
        'sample_rate': 16000.0,
    }

    hparams = HParams(cls=cls)
    for key, value in defaults.items():
        hparams.add_hparam(key, value)

    if config is None:
        return hparams
    hparams.override_from_dict(config)
    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters and apply optional overrides.

    :param config: contains five optional parameters (``None`` keeps the
        defaults): window_length(float, default=0.025),
        frame_length(float, default=0.010),
        sample_rate(float, default=16000.0),
        ceps_subband_num(int, default=13),
        tag_ceps_mean_norm(bool, default=True).
    :return: An object of class HParams, which is a set of hyperparameters
        as name-value pairs.
    """
    # Parallel tuples, zipped below; listed in registration order.
    names = ('window_length', 'frame_length', 'ceps_subband_num',
             'tag_ceps_mean_norm', 'sample_rate')
    values = (0.025, 0.010, 13, True, 16000.0)

    hparams = HParams(cls=cls)
    for key, value in zip(names, values):
        hparams.add_hparam(key, value)

    if config is not None:
        hparams.override_from_dict(config)
    return hparams
def params(cls, config=None):
    """
    Build the default hyperparameters and apply optional overrides.

    :param config: contains fourteen optional parameters (``None`` keeps the
        defaults): upper_frequency_limit(float, default=8000.0),
        lower_frequency_limit(float, default=20.0),
        filterbank_channel_count(float, default=23.0),
        window_length(float, default=0.025),
        frame_length(float, default=0.010),
        output_type(int, default=1), sample_rate(int, default=16000),
        snip_edges(int, default=2), raw_energy(int, default=1),
        preeph_coeff(float, default=0.97),
        window_type(string, default='povey'),
        remove_dc_offset(bool, default=True), is_fbank(bool, default=True),
        thres_autoc(float, default=0.3).
    :return: An object of class HParams, which is a set of hyperparameters
        as name-value pairs.
    """
    # Registration order follows the insertion order of this mapping.
    defaults = {
        'upper_frequency_limit': 8000.0,
        'lower_frequency_limit': 20.0,
        'filterbank_channel_count': 23.0,
        'window_length': 0.025,
        'frame_length': 0.010,
        'output_type': 1,
        'sample_rate': 16000,
        'snip_edges': 2,
        'raw_energy': 1,
        'preeph_coeff': 0.97,
        'window_type': 'povey',
        'remove_dc_offset': True,
        'is_fbank': True,
        'thres_autoc': 0.3,
    }

    hparams = HParams(cls=cls)
    for key, value in defaults.items():
        hparams.add_hparam(key, value)

    if config is not None:
        hparams.override_from_dict(config)
    return hparams