示例#1
0
def apply_augmentation(audio_path: str, preproc_cfg: dict,
                       logger: Logger) -> Tuple[np.ndarray, int]:
    """
    Loads an audio file and randomly applies the augmentations enabled in `preproc_cfg`.

    Each augmentation is gated twice: by its enable flag in `preproc_cfg` and by a
    Bernoulli draw using the matching `*_prob` entry. The augmentations are applied
    in this order: tempo/gain/pitch perturbation, synthetic gaussian noise,
    background-noise injection.

    Arguments:
        audio_path - str: path to the audio file
        preproc_cfg - dict: configuration; the keys read here are
            'tempo_gain_pitch_perturb', 'tempo_gain_pitch_prob', 'tempo_range',
            'gain_range', 'pitch_range', 'augment_from_normal',
            'synthetic_gaussian_noise', 'gauss_noise_prob', 'gauss_snr_db_range',
            'background_noise', 'background_noise_prob', 'background_noise_dir',
            'background_noise_range'
        logger - Logger: receives progress messages (required, used unconditionally)
    Returns:
        tuple(np.ndarray, int): the (possibly augmented) audio data and the sample rate
    """
    logger.info(f"audio_path: {audio_path}")

    # `and` short-circuits, so the random draw only happens when the flag is enabled
    perturb = (preproc_cfg['tempo_gain_pitch_perturb']
               and np.random.binomial(1, preproc_cfg['tempo_gain_pitch_prob']))
    if perturb:
        aug_data, samp_rate = tempo_gain_pitch_perturb(
            audio_path,
            tempo_range=preproc_cfg['tempo_range'],
            gain_range=preproc_cfg['gain_range'],
            pitch_range=preproc_cfg['pitch_range'],
            augment_from_normal=preproc_cfg['augment_from_normal'],
            logger=logger)
    else:
        # no perturbation selected: load the recording unchanged
        aug_data, samp_rate = array_from_wave(audio_path)

    if (preproc_cfg['synthetic_gaussian_noise']
            and np.random.binomial(1, preproc_cfg['gauss_noise_prob'])):
        aug_data = synthetic_gaussian_noise_inject(
            aug_data,
            preproc_cfg['gauss_snr_db_range'],
            preproc_cfg['augment_from_normal'],
            logger=logger)

    if preproc_cfg['background_noise']:
        if np.random.binomial(1, preproc_cfg['background_noise_prob']):
            logger.info("noise injected")
            aug_data = inject_noise(aug_data, samp_rate,
                                    preproc_cfg['background_noise_dir'],
                                    preproc_cfg['background_noise_range'],
                                    preproc_cfg['augment_from_normal'], logger)
        else:
            logger.info("noise not injected")

    return aug_data, samp_rate
示例#2
0
def test_for_nan_values():
    """
    Runs spec_augment over every (audio file, fixed parameter set) combination
    and asserts that the augmented features never contain a nan value.
    """
    # level=10 is logging.DEBUG
    logging.basicConfig(filename=None, filemode='w', level=10)
    logger = logging.getLogger("train_log")

    nan_audio_paths = get_nan_audio()
    nan_param_sets = get_nan_parameters()

    for audio_count, wav_path in enumerate(nan_audio_paths):
        for params_count, fixed in enumerate(nan_param_sets):
            samples, rate = array_from_wave(wav_path)
            spectrogram = log_spectrogram_from_data(samples,
                                                    rate,
                                                    window_size=32,
                                                    step_size=16)
            # spec_augment is fed the transposed spectrogram as a torch tensor
            augmented = spec_augment(
                torch.from_numpy(spectrogram.T),
                time_warping_para=fixed["W"],
                frequency_masking_para=fixed["frequency_masking_para"],
                time_masking_para=fixed["time_masking_para"],
                frequency_mask_num=len(fixed["f"]),
                time_mask_num=len(fixed["t"]),
                logger=logger,
                fixed_params=fixed)
            result = to_numpy(augmented).T

            # a non-zero count of True entries means at least one nan is present
            nan_total = np.isnan(result).sum()
            assert nan_total == 0, f"nan value found in audio {audio_count}, params {params_count}"
示例#3
0
def test_time_masking(logger: Logger = None):
    """
    Checks that the number of time-masked rows never exceeds the maximum allowed,
    i.e. (max mask width) * (number of masks).

    Each entry of `test_tuples` is ordered as:
    ('time_warping_para', 'frequency_masking_para', 'time_masking_para',
     'frequency_mask_num', 'time_mask_num')
    """
    test_tuples = [
        (0, 0, 60, 0, 1),  # 1 mask with max width of 60
        (0, 0, 30, 0, 2),
        (0, 0, 20, 0, 3)
    ]
    audio_paths = get_all_test_audio()
    number_of_tests = 10  # repeated because the mask selection is random

    for _ in range(number_of_tests):
        for wav_path in audio_paths:
            for params in test_tuples:
                _, _, time_mask_size, _, num_time_masks = params

                samples, rate = array_from_wave(wav_path)
                spectrogram = log_spectrogram_from_data(samples,
                                                        rate,
                                                        window_size=32,
                                                        step_size=16)
                masked = spec_augment(torch.from_numpy(spectrogram.T), *params)
                masked_rows = count_time_mask(to_numpy(masked))

                upper_bound = time_mask_size * num_time_masks
                assert masked_rows <= upper_bound
示例#4
0
def check_audio_with_sox():
    """
    Runs audio_with_sox over every noise file, sliding windows of the lengths
    in `data_lens` (seconds) across the whole file, to find files/windows for
    which audio_with_sox raises.

    Raises:
        AssertionError, FileNotFoundError, Exception: re-raised with a message
            naming the offending noise file and window length; the original
            exception is chained as the cause so its traceback is preserved.
    """
    noise_dataset = dataset_info.NoiseDataset()
    noise_files = noise_dataset.files_from_pattern()
    data_lens = [0.5, 5, 50]  # in secs
    step_size = 0.05
    print(
        f"\n Test Full Noise File: testing {len(noise_files)} noise files...")

    for file_count, noise_file in enumerate(noise_files):
        print(f"Processing file {file_count}: {noise_file}")
        audio, samp_rate = array_from_wave(noise_file)
        noise_len = audio.shape[0] / samp_rate  # duration in seconds
        for data_len in data_lens:
            start_end_tups = calc_start_end(noise_len, data_len, step_size)
            for noise_start, noise_end in start_end_tups:
                context = f"noise:{noise_file}, data_len: {data_len}"
                try:
                    audio_with_sox(noise_file, samp_rate, noise_start,
                                   noise_end)
                except AssertionError as err:
                    raise AssertionError(context) from err
                except FileNotFoundError as err:
                    raise FileNotFoundError(context) from err
                except Exception as err:
                    # `Exception` rather than a bare except, so that
                    # KeyboardInterrupt / SystemExit still propagate untouched
                    raise Exception(context) from err
def test_gain_pitch_same_size():
    """
    Tests that varying the gain and the pitch (with tempo fixed at 1.0) has no
    effect on the size of the audio data.
    """
    tempo = 1.0
    gain_pitch_tuples = [
        (0, 0),  # no augmentation
        (8, 0),  # gain only
        (0, 400),  # pitch only
        (-6, -400),  # both gain and pitch
    ]
    wav_path = get_all_test_audio()[0]  # a single audio file is enough here

    for gain, pitch in gain_pitch_tuples:
        # reference: the un-augmented audio
        original, samp_rate = array_from_wave(wav_path)
        # degenerate (x, x) ranges pin each augmentation value exactly
        augmented, samp_rate = tempo_gain_pitch_perturb(
            wav_path,
            sample_rate=samp_rate,
            tempo_range=(tempo, tempo),
            gain_range=(gain, gain),
            pitch_range=(pitch, pitch))

        assert original.size == augmented.size, "data size is not the same"
示例#6
0
def extend_audio(audio_dir:str, target_duration:int) -> None:
    """
        Stacks each audio file in audio_dir on itself until it is equal in
        length to the target_duration (in seconds), and writes the result to
        "<audio_dir>/extended/<name>_extended.wav".

        Files whose duration is already >= target_duration are skipped: they
        need no extension. (Previously such files either crashed the function
        or were written out with the previous file's data.)

        Arguments:
            audio_dir (str): directory of audio files
            target_duration (int): length in seconds the audio files will be extended to
    """
    assert os.path.exists(audio_dir), "audio directory does not exist"

    pattern = os.path.join(audio_dir, "*.wav")

    for audio_fn in glob.glob(pattern):
        audio_duration = wav_duration(audio_fn)
        if audio_duration >= target_duration:
            continue

        data, samp_rate = array_from_wave(audio_fn)
        # whole_dup: number of full copies that fit; frac_dup: leftover seconds
        whole_dup, frac_dup = divmod(target_duration, audio_duration)
        fraction_index = int(frac_dup * samp_rate)
        # a single concatenate avoids re-copying the growing array on every repeat
        pieces = [data] * int(whole_dup) + [data[:fraction_index]]
        output_data = np.concatenate(pieces, axis=0)

        stem, _ = os.path.splitext(os.path.basename(audio_fn))
        extended_dir = os.path.join(os.path.dirname(audio_fn), "extended")
        os.makedirs(extended_dir, exist_ok=True)
        ext_audio_path = os.path.join(extended_dir, stem + "_extended.wav")

        write(ext_audio_path, samp_rate, output_data)
示例#7
0
def compare_log_spec_from_file(audio_file_1: str,
                               audio_file_2: str,
                               plot=False):
    """
    Computes the spectrograms of two audio files and returns their difference
    (spectrogram of file 1 minus spectrogram of file 2).

    Only the first channel of multi-channel audio is used. When `plot` is
    true, the differences of the frequency axes, time axes and spectrograms
    are passed to plot_spectrogram.
    """
    window_size = 20
    step_size = 10

    def _first_channel(samples):
        # multi-dimensional audio: keep only the first channel
        return samples[:, 0] if len(samples.shape) > 1 else samples

    def _hann_spectrogram(samples, rate):
        # segment sizes are derived from the sizes above, scaled by rate / 1e3
        return scipy.signal.spectrogram(samples,
                                        fs=rate,
                                        window='hann',
                                        nperseg=int(window_size * rate / 1e3),
                                        noverlap=int(step_size * rate / 1e3),
                                        detrend=False)

    audio_1, sr_1 = array_from_wave(audio_file_1)
    audio_2, sr_2 = array_from_wave(audio_file_2)
    audio_1 = _first_channel(audio_1)
    audio_2 = _first_channel(audio_2)

    freq_1, time_1, spec_1 = _hann_spectrogram(audio_1, sr_1)
    freq_2, time_2, spec_2 = _hann_spectrogram(audio_2, sr_2)

    spec_diff = spec_1 - spec_2
    freq_diff = freq_1 - freq_2
    time_diff = time_1 - time_2

    if plot:
        plot_spectrogram(freq_diff, time_diff, spec_diff)

    return spec_diff
def test_datatype():
    """
    Checks that synthetic_gaussian_noise_inject returns int16 data for every
    test audio file when the SNR is fixed at 30.
    """
    wav_paths = get_all_test_audio()
    snr_level = 30
    fixed_snr = (snr_level, snr_level)  # degenerate range pins the SNR value
    for wav_path in wav_paths:
        samples, _ = array_from_wave(wav_path)
        noisy = synthetic_gaussian_noise_inject(samples, snr_range=fixed_snr)
        assert noisy.dtype == "int16"
示例#9
0
def audio_with_sox(path: str,
                   sample_rate: int,
                   start_time: float,
                   end_time: float,
                   logger=None) -> np.ndarray:
    """
    Crops and resamples the recording with sox and loads the result.

    If the output file cannot be found, an array of zeros of the desired
    length ((end_time - start_time) * sample_rate samples) is returned.

    Arguments:
        path - str: path of the audio file to crop
        sample_rate - int: sample rate of the output
        start_time - float: start of the crop, passed to sox `trim`
        end_time - float: absolute end position of the crop (sox `=end` syntax)
        logger: optional logger; when None nothing is logged
    Returns:
        np.ndarray: the cropped audio data
    """
    use_log = (logger is not None)
    with NamedTemporaryFile(suffix=".wav") as tar_file:
        tar_filename = tar_file.name
        sox_cmd = [
            'sox',
            '-V3',  # verbosity level=3
            path,  # noise filename
            '-r',
            f'{sample_rate}',  # sample rate
            '-c',
            '1',  # output is single-channel audio
            '-b',
            '16',  # 16 bits per sample
            '-e',
            'si',  # encoding = signed-integer
            '-t',
            'wav',  # the output file is wav type
            tar_filename,  # output temp-filename
            'trim',
            f'{start_time}',
            f'={end_time}'
        ]  # trim to start and end time
        sox_result = subprocess.run(sox_cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

        if use_log:
            logger.info(
                f"noise_inj: sox: sox stdout: {sox_result.stdout.decode('utf-8')}"
            )
            stderr_message = sox_result.stderr.decode('utf-8')
            if 'FAIL' in stderr_message:
                logger.error(f"noise_inj: sox: sox stderr: {stderr_message}")
                print(f"ERROR: noise_inj: sox: sox stderr: {stderr_message}")
            else:
                logger.info(f"noise_inj: sox: sox stderr: {stderr_message}")

        # NOTE(review): NamedTemporaryFile creates the file up front, so this
        # branch is normally taken even if sox failed -- confirm whether an
        # empty-output check is wanted here as well.
        if os.path.exists(tar_filename):
            noise_data, _ = array_from_wave(tar_filename)
        else:
            # seconds * samples-per-second = number of samples
            noise_len = max(0, round((end_time - start_time) * sample_rate))
            noise_data = np.zeros((noise_len, ))
            if use_log:
                logger.error(
                    f"noise_inj: sox: tmp_file doesnt exist, using zeros of len {noise_len}"
                )
            print(
                f"ERROR: noise_inj: sox: sox stderr: tmp_file doesnt exist, using zeros of len {noise_len}"
            )

        assert isinstance(noise_data, np.ndarray), "not numpy array returned"
        return noise_data
示例#10
0
def augment_audio_with_sox(path: str,
                           sample_rate: int,
                           tempo: float,
                           gain: float,
                           pitch: float,
                           logger=None) -> Tuple[np.ndarray, int]:
    """
    Applies sox's tempo, gain (volume) and pitch effects to the recording,
    writes the result to a temporary wav file and loads it.

    Arguments:
        path - str: file to augment
        sample_rate - int: sample rate requested for the output
        tempo - float: value for the sox `tempo` effect (formatted to 3 decimals)
        gain - float: value for the sox `gain` effect, in dB (3 decimals)
        pitch - float: value for the sox `pitch` effect (formatted with no decimals)
        logger: optional logger; sox stdout/stderr are logged when given
    Returns:
        tuple(np.ndarray, int): the augmented audio data and its sample rate
    """
    # output format: given rate, mono, 16-bit signed-integer wav
    output_format = [
        '-r', f'{sample_rate}',
        '-c', '1',
        '-b', '16',
        '-e', 'si',
        '-t', 'wav',
    ]
    effects = [
        'tempo', f'{tempo:.3f}',
        'gain', f'{gain:.3f}',
        'pitch', f'{pitch:.0f}',
    ]

    with NamedTemporaryFile(suffix=".wav") as tmp_wav:
        out_path = tmp_wav.name
        # '-V3' sets sox verbosity level 3
        command = ['sox', '-V3', path, *output_format, out_path, *effects]
        completed = subprocess.run(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

        if logger is not None:
            logger.info(
                f"sox_pertrub: aug_audio_sox: tmpfile exists: {os.path.exists(out_path)}"
            )
            logger.info(
                f"sox_pertrub: aug_audio_sox: sox stdout: {completed.stdout.decode('utf-8')}"
            )
            sox_stderr = completed.stderr.decode('utf-8')
            # stderr containing 'FAIL' is reported as an error, otherwise as info
            report = logger.error if 'FAIL' in sox_stderr else logger.info
            report(f"sox_pertrub: aug_audio_sox: sox stderr: {sox_stderr}")

        data, samp_rate = array_from_wave(out_path)
        return data, samp_rate
示例#11
0
def load_audio(audio_path, preproc):
    """
    Loads an audio file, computes its log spectrogram (window_size=32,
    step_size=16) and normalizes it with `preproc.mean` and `preproc.std`.

    Returns:
        tuple: the normalized features and the sample rate of the recording
    """
    samples, samp_rate = wave.array_from_wave(audio_path)
    spectrogram = loader.log_specgram_from_data(samples,
                                                samp_rate,
                                                window_size=32,
                                                step_size=16)
    normalized = (spectrogram - preproc.mean) / preproc.std
    return normalized, samp_rate
示例#12
0
def tempo_gain_pitch_perturb(audio_path: str,
                             sample_rate: int = 16000,
                             tempo_range: AugmentRange = (0.85, 1.15),
                             gain_range: AugmentRange = (-6.0, 8.0),
                             pitch_range: AugmentRange = (-400, 400),
                             augment_from_normal: bool = False,
                             logger=None) -> Tuple[np.ndarray, int]:
    """
    Draws a tempo, gain and pitch value from the given ranges and applies them
    to the utterance through augment_audio_with_sox.

    Arguments:
        audio_path - str: path to the audio file
        sample_rate - int: sample rate passed on to sox
        tempo_range, gain_range, pitch_range - AugmentRange: (low, high) ranges
        augment_from_normal - bool: if true, each value is drawn with
            get_value_from_truncnorm centered on the mean of its range;
            otherwise it is drawn uniformly from the range
        logger: optional logger
    Returns:
        tuple(np.ndarray, int) - the augmented audio data and the sample_rate.
        If sox augmentation raises RuntimeError, the un-augmented audio is
        loaded and returned instead.
    """
    use_log = (logger is not None)

    def _draw(value_range):
        # one random draw for a single augmentation parameter
        if augment_from_normal:
            center = np.mean(value_range)
            return get_value_from_truncnorm(center,
                                            value_range,
                                            bounds=value_range)
        return np.random.uniform(*value_range)

    # draw order (tempo, gain, pitch) is kept so seeded runs stay reproducible
    tempo_value = _draw(tempo_range)
    gain_value = _draw(gain_range)
    pitch_value = _draw(pitch_range)

    if use_log:
        logger.info(f"tempo_gain_pitch_perturb: audio_file: {audio_path}")
        logger.info(f"tempo_gain_pitch_perturb: tempo_value: {tempo_value}")
        logger.info(f"tempo_gain_pitch_perturb: gain_value: {gain_value}")
        logger.info(f"tempo_gain_pitch_perturb: pitch_value: {pitch_value}")

    try:
        result = augment_audio_with_sox(audio_path,
                                        sample_rate,
                                        tempo_value,
                                        gain_value,
                                        pitch_value,
                                        logger=logger)
        audio_data, samp_rate = result
    except RuntimeError as rterr:
        if use_log:
            logger.error(f"tempo_gain_pitch_perturb: RuntimeError: {rterr}")
        # fall back to the un-augmented recording
        audio_data, samp_rate = array_from_wave(audio_path)

    return audio_data, samp_rate
示例#13
0
def resample_with_sox(path, sample_rate):
    """
    Resamples the recording with sox and loads the result.

    The audio is converted to single-channel, 16-bit signed-integer samples at
    `sample_rate` in a temporary wav file, which is then read with
    array_from_wave.

    Arguments:
        path: path of the audio file to resample
        sample_rate: sample rate of the output
    Returns:
        the resampled audio data as returned by array_from_wave
    """
    # local imports keep this block self-contained
    import subprocess
    from tempfile import NamedTemporaryFile

    # The previous version referenced undefined tar_filename / start_time /
    # end_time (a NameError on every call) and built a shell string; the
    # argument list below avoids the shell entirely and drops the trim step,
    # for which this function receives no arguments.
    with NamedTemporaryFile(suffix=".wav") as tar_file:
        tar_filename = tar_file.name
        sox_cmd = [
            'sox',
            path,
            '-r', f'{sample_rate}',  # sample rate
            '-c', '1',  # single-channel audio
            '-b', '16',  # 16 bits per sample
            '-e', 'si',  # encoding = signed-integer
            tar_filename,
        ]
        # sox output is discarded, matching the old ">/dev/null 2>&1"
        subprocess.run(sox_cmd,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
        noise_data, _ = array_from_wave(tar_filename)
    return noise_data
示例#14
0
    def signal_augmentations(self, wave_file: str) -> tuple:
        """
        Loads the audio sample and applies the enabled raw-signal augmentations.
        The audio data is in pcm16 format.

        Every augmentation requires its enable flag and `self.train_status` to
        be true, and is then applied with the probability configured for it.
        Order: tempo/gain/pitch perturbation, synthetic gaussian noise,
        background-noise injection.

        Arguments:
            wave_file - str: the path to the audio sample
        Returns:
            audio_data - np.ndarray: augmented np-array
            samp_rate - int: sample rate of the audio recording
        """
        if self.use_log:
            self.logger.info(f"preproc: audio_data read: {wave_file}")

        audio_data, samp_rate = array_from_wave(wave_file)

        # sox-based tempo, gain, pitch augmentations
        # (`and` short-circuits: the random draw only happens when enabled)
        if (self.tempo_gain_pitch_perturb and self.train_status
                and np.random.binomial(1, self.tempo_gain_pitch_prob)):
            audio_data, samp_rate = tempo_gain_pitch_perturb(
                wave_file,
                samp_rate,
                self.tempo_range,
                self.gain_range,
                self.pitch_range,
                self.augment_from_normal,
                logger=self.logger)
            if self.use_log:
                self.logger.info("preproc: tempo_gain_pitch applied")

        # synthetic gaussian noise
        if (self.synthetic_gaussian_noise and self.train_status
                and np.random.binomial(1, self.gauss_noise_prob)):
            audio_data = synthetic_gaussian_noise_inject(
                audio_data,
                self.gauss_snr_db_range,
                self.augment_from_normal,
                logger=self.logger)
            if self.use_log:
                self.logger.info("preproc: synth_gauss_noise applied")

        # noise injection
        if (self.background_noise and self.train_status
                and np.random.binomial(1, self.background_noise_prob)):
            audio_data = inject_noise(audio_data, samp_rate,
                                      self.noise_dir,
                                      self.background_noise_range,
                                      self.augment_from_normal,
                                      self.logger)
            if self.use_log:
                self.logger.info("preproc: noise injected")

        return audio_data, samp_rate
示例#15
0
def test_apply_spec_augment_call(logger: Logger = None):
    """
    Smoke test: checks that apply_spec_augment can be called on every test
    audio file without raising.
    Arguments:
        logger - Logger: optional, lets the logger path be tested as well
    """
    for wav_path in get_all_test_audio():
        samples, rate = array_from_wave(wav_path)
        spectrogram = log_spectrogram_from_data(samples,
                                                rate,
                                                window_size=32,
                                                step_size=16)
        apply_spec_augment(spectrogram, logger)
def test_high_snr_value():
    """
    Checks that with a very high SNR (100) the data returned by
    synthetic_gaussian_noise_inject is numerically close to the input.
    """
    wav_paths = get_all_test_audio()
    snr_level = 100
    fixed_snr = (snr_level, snr_level)
    # absolute tolerance is 1e-5 of the range of values in pcm16 format (2**16)
    atol = 2**16 * 1e-5
    for wav_path in wav_paths:
        clean, _ = array_from_wave(wav_path)
        noisy = synthetic_gaussian_noise_inject(clean, snr_range=fixed_snr)
        np.testing.assert_allclose(clean, noisy, rtol=1e-03, atol=atol)
def test_regression_equal_pickle():
    """
    Regression test against stored output. The pickle data is the output from
    using the Speak-out.wav file with an snr_level = 30 and a random seed of zero.
    """
    pickle_path = "../test_pickle/sythentic-gaussian-noise-inject_Speak-out_snr-30.pickle"
    with open(pickle_path, 'rb') as fid:
        expected = pickle.load(fid)

    snr_level = 30
    samples, _ = array_from_wave("../test_audio/Speak-out.wav")

    # seed right before the call so the noise matches the stored output
    np.random.seed(0)
    actual = synthetic_gaussian_noise_inject(samples,
                                             snr_range=(snr_level, snr_level))

    # every element must match: the count of equal entries equals the array size
    matching = (actual == expected).sum()
    assert matching == actual.size, "regression test fails"
def test_no_augment():
    """
    Tests that input audio and augmented data are identical with no
    augmentation: tempo=1.0, gain=0, pitch=0.
    """

    tempo = 1.0
    gain = 0.0
    pitch = 0.0
    audio_paths = get_all_test_audio()
    for audio_path in audio_paths:
        # un-augmented audio_data
        audio_data, samp_rate = array_from_wave(audio_path)
        aug_data, samp_rate = tempo_gain_pitch_perturb(audio_path,
                                                       sample_rate=samp_rate,
                                                       tempo_range=(tempo,
                                                                    tempo),
                                                       gain_range=(gain, gain),
                                                       pitch_range=(pitch,
                                                                    pitch))

        # np.array_equal returns False on a shape mismatch and handles
        # multi-dimensional data, where `all(a == b)` would raise instead of
        # producing a clean assertion failure
        assert np.array_equal(audio_data, aug_data), "data is not the same"
def test_gain_increase_amplitude():
    """
    Tests that the rms of the gain-augmented audio, divided by the expected
    amplitude ratio, matches the rms of the original audio:
    1) a 6 dB increase in gain corresponds to a ratio of 1.995, and
    2) a 6 dB decrease corresponds to a ratio of 0.501.
    Ratio is computed as: ratio = 10**(gain/20)
    """
    tempo = 1.0
    pitch = 0.0
    accuracy = -1  # same up to 10^(-accuracy)
    gain_ratio_tuples = [
        (0, 1.0),  # no augmentation
        (6, 1.995),
        (-6, 0.501)
    ]
    # (10, 3.162) and (-10, 0.3162) are left out: those cases fail, reason unknown.

    for wav_path in get_all_test_audio():  # runs over every test audio file
        print(f"audio_path: {wav_path}")
        for gain, amp_ratio in gain_ratio_tuples:
            # un-augmented reference
            audio_data, samp_rate = array_from_wave(wav_path)
            aug_data, samp_rate = tempo_gain_pitch_perturb(
                wav_path,
                sample_rate=samp_rate,
                tempo_range=(tempo, tempo),
                gain_range=(gain, gain),
                pitch_range=(pitch, pitch))
            # width 2 = two bytes per sample
            audio_rms = audioop.rms(audio_data, 2)
            scaled_aug_rms = audioop.rms(aug_data, 2) / amp_ratio
            print(
                f"audio rms: {audio_rms}, scaled_aug rms: {scaled_aug_rms}, ratio:{amp_ratio}, accuracy:{10**(-accuracy)}"
            )
            np.testing.assert_almost_equal(audio_rms,
                                           scaled_aug_rms,
                                           decimal=accuracy)
def test_tempo_augment():
    """
    Verifies, for every tested tempo, that the size of the augmented data
    scaled by the tempo equals the size of the un-augmented data (within a
    relative tolerance of 10%).
    """

    audio_paths = get_all_test_audio()
    # 0 is excluded: it is not a valid tempo factor and the size relation
    # checked below cannot hold for it
    tempos = [0.5, 0.85, 1, 1.15, 2]
    for audio_path in audio_paths:
        # un-augmented audio_data
        audio_data, samp_rate = array_from_wave(audio_path)
        for tempo in tempos:
            aug_data, _ = tempo_gain_pitch_perturb(
                audio_path,
                sample_rate=samp_rate,
                tempo_range=(tempo, tempo),
                gain_range=(0, 0),
                pitch_range=(0, 0))

            # the check lives inside the tempo loop so every tempo is verified,
            # not only the last one
            print(
                f"audio_data size: {audio_data.size}, aug_data: {aug_data.size}, tempo: {tempo}"
            )
            assert audio_data.size == pytest.approx(aug_data.size * tempo,
                                                    rel=1e-1)
示例#21
0
def compute_mean_std(
        audio_files: List[str], preprocessor: str, window_size: int,
        step_size: int,
        use_feature_normalize: bool) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the per-bin mean and standard deviation of the features of all
    given audio files (frequency bins for the log_spec preprocessor). Each
    file's features are first normalized when use_feature_normalize is true.
    Args:
        audio_files - List[str]: a list of shuffled audio files. len = max_samples
        preprocessor (str): name of preprocessor
        window_size - int: window_size of preprocessor
        step_size - int: step_size of preprocessor
        use_feature_normalize - bool: whether or not the features themselves are normalized
    Returns:
        mean - np.ndarray: the mean of the feature bins - shape = (# feature bins,)
        std  - np.ndarray: the std deviation of the feature bins - shape = (# bins,)
    """
    assert len(audio_files) > 0, "input list of audio_files is empty"

    def _features_for(audio_file):
        # load one file and turn it into a (time x feature bin) array
        audio_data, samp_rate = array_from_wave(audio_file)
        features = process_audio(audio_data, samp_rate, window_size,
                                 step_size, preprocessor)
        return feature_normalize(features) if use_feature_normalize else features

    # stack all files along the time axis: shape = (all_time, feature bin)
    stacked = np.vstack([_features_for(path) for path in audio_files])

    # statistics along the time axis: shape = (feature bin,)
    mean = np.mean(stacked, axis=0, dtype='float32')
    std = np.std(stacked, axis=0, dtype='float32')
    return mean, std
示例#22
0
def process_audio(audio,
                  samp_rate: int,
                  window_size=32,
                  step_size=16,
                  processing='log_spectrogram'):
    """Processes audio through the provided processing function.

    Args:
        audio (str or np.ndarray): path to audio or audio array
        samp_rate (int): sample rate of audio; replaced by the file's own
            sample rate when `audio` is a path
        window_size (int): size of window in processing function
        step_size (int): step in processing function
        processing (str): name of processing function.
            'log_spectrogram', 'mfcc', and 'log_mel' are acceptable.
    Returns:
        np.ndarray: processed array of dimensions: time x processor_bins
    Raises:
        AssertionError: if `audio` is neither a str nor an np.ndarray
        ValueError: if `processing` is not one of the accepted names
    """
    assert isinstance(audio, (str, np.ndarray)), \
        f"audio must be type str or np.ndarray, not {type(audio)}"

    # process audio from audio path
    if isinstance(audio, str):
        # fixed: this previously read an undefined `audio_path` (NameError)
        audio, samp_rate = array_from_wave(audio)

    audio = average_channels(audio)

    if processing == 'log_spectrogram':
        output = log_spectrogram(audio, samp_rate, window_size, step_size)
    elif processing == 'mfcc':
        output = mfcc(audio, samp_rate, window_size, step_size)
    elif processing == 'log_mel':
        output = log_mel_filterbank(audio, samp_rate, window_size, step_size)
    else:
        raise ValueError(f"processing value: {processing} is unacceptable")

    return output
示例#23
0
def check_length(audio_path:str, noise_path:str, noise_level:float=0.5):
    """
    Loads `audio_path` and runs inject_noise_sample on it with the noise file
    at `noise_path` and the given `noise_level`.

    The result of the injection is discarded and nothing is returned.
    NOTE(review): despite the name, no length check is visible here -- the
    function may be incomplete; confirm against the original source.
    """
    samples, rate = array_from_wave(audio_path)
    inject_noise_sample(samples, rate, noise_path,
                        noise_level=noise_level, logger=None)
示例#24
0
def log_specgram_from_file(audio_file):
    """Loads `audio_file` and returns log_specgram of its data and sample rate."""
    samples, sample_rate = wave.array_from_wave(audio_file)
    spectrogram = log_specgram(samples, sample_rate)
    return spectrogram
示例#25
0
def log_specgram_from_file(audio_file, plot=False):
    """
    Loads `audio_file` and returns log_specgram of its data and sample rate;
    `plot` is forwarded to log_specgram unchanged.
    """
    samples, sample_rate = wave.array_from_wave(audio_file)
    spectrogram = log_specgram(samples, sample_rate, plot=plot)
    return spectrogram
示例#26
0
文件: wave_test.py 项目: zhly0/speech
def test_load():
    """Checks that test0.wav loads at a 16 kHz sample rate as int16 samples."""
    samples, sample_rate = wave.array_from_wave("test0.wav")

    expected_rate = 16000
    assert sample_rate == expected_rate
    assert samples.dtype == np.int16