Example #1
def test_kaldi_audio(wav_file, audio, dtype):
    # make sure we have the same results when loading a wav file with
    # shennong.Audio and with the Kaldi code.
    with tempfile.NamedTemporaryFile('w+') as tfile:
        tfile.write('test {}\n'.format(wav_file))
        tfile.seek(0)
        with SequentialWaveReader('scp,t:' + tfile.name) as reader:
            for key, wave in reader:
                audio_kaldi = Audio(wave.data().numpy().reshape(
                    audio.data.shape),
                                    audio.sample_rate,
                                    validate=False)

    audio = audio.astype(dtype)
    assert audio.duration == audio_kaldi.duration
    assert audio.dtype == dtype
    assert audio.is_valid()
    assert audio_kaldi.dtype == np.float32
    assert not audio_kaldi.is_valid()  # not in [-1, 1] but [-2**15, 2**15-1]

    mfcc = MfccProcessor().process(audio)
    mfcc_kaldi = MfccProcessor().process(audio_kaldi)
    assert mfcc.shape == mfcc_kaldi.shape
    assert np.array_equal(mfcc.times, mfcc_kaldi.times)
    assert mfcc.properties == mfcc_kaldi.properties
    assert mfcc.dtype == mfcc_kaldi.dtype
    assert mfcc.data == pytest.approx(mfcc_kaldi.data)
Example #2
def test_check_wavs_bad(wav_file, wav_file_8k, tmpdir, capsys):
    def fun(utts):
        c = pipeline._init_config(
            pipeline.get_default_config('mfcc', with_cmvn=False))
        u = pipeline._init_utterances(utts)
        pipeline._Manager(c, u)
        return u

    # build a stereo file and make sure it is not supported by the
    # pipeline
    audio = Audio.load(wav_file)
    stereo = Audio(np.asarray((audio.data, audio.data)).T,
                   sample_rate=audio.sample_rate)
    assert stereo.nchannels == 2
    wav_file_2 = str(tmpdir.join('stereo.wav'))
    stereo.save(wav_file_2)
    with pytest.raises(ValueError) as err:
        fun([(wav_file_2, )])
    assert 'all wav files are not mono' in str(err)

    # ensure we catch differences in sample rates
    capsys.readouterr()  # clear buffer
    w = [(wav_file, ), (wav_file_8k, )]
    out = fun(w)
    err = capsys.readouterr().err
    assert 'several sample rates found in wav files' in err
    assert sorted(out.keys()) == ['utt_1', 'utt_2']

    # make sure timestamps are ordered
    with pytest.raises(ValueError) as err:
        fun([('1', wav_file, 1, 0)])
    assert 'timestamps are not in increasing order for' in str(err)
Example #3
def test_scan_bad():
    with pytest.raises(ValueError) as err:
        Audio.scan(__file__)
    assert 'is it a wav?' in str(err)

    with pytest.raises(ValueError) as err:
        Audio.scan('/path/to/some/lost/place')
    assert 'file not found' in str(err)
Example #4
def test_equal(audio):
    assert audio == audio

    audio2 = Audio(audio.data, audio.sample_rate)
    assert audio == audio2

    audio2 = Audio(audio.data, audio.sample_rate + 1)
    assert audio != audio2

    audio2 = Audio(audio.data * 2, audio.sample_rate)
    assert audio.duration == audio2.duration
    assert audio.sample_rate == audio2.sample_rate
    assert audio != audio2
Example #5
def wavs_to_feats_df(wavs_list, feats):

    assert feats in ['mfcc', 'bnf'], "Unknown feature parameter for wavs_to_feats_df function: {}".format(feats)

    feats_list = []

    for wav_file in wavs_list:

        wav_data = Audio.load(wav_file).resample(8000)

        assert wav_data.sample_rate == 8000, "Error. Could not resample file to 8000 Hz for MFCC/BNF feature extraction."
        assert wav_data.nchannels == 1, "Unexpected non-mono file supplied: {}".format(wav_file)

        if feats == 'mfcc':
            mfcc_data = mfcc_processor.process(wav_data)
            mfcc_data = delta_processor.process(mfcc_data)
            feats_list.append(mfcc_data.data)

        elif feats == 'bnf':
            bnf_data = bnf_processor.process(wav_data)
            feats_list.append(bnf_data.data)        

    feats_df = pd.DataFrame({
        "filename" : [ os.path.splitext(os.path.basename(f))[0] for f in wavs_list ], # '.../filename.wav' => 'filename',
        "features" : feats_list
    })

    return feats_df
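
The wavs_to_feats_df function above relies on three module-level processors (mfcc_processor, delta_processor, bnf_processor) that the snippet does not define. Below is a minimal sketch of how they might be set up; the import paths and parameter values are assumptions (they vary across shennong versions) and are not part of the original code.

# Hypothetical module-level setup assumed by wavs_to_feats_df; import paths
# may differ depending on the installed shennong version.
from shennong.processor.mfcc import MfccProcessor
from shennong.processor.bottleneck import BottleneckProcessor
from shennong.postprocessor.delta import DeltaPostProcessor

# 8000 Hz matches the resampling done inside wavs_to_feats_df
mfcc_processor = MfccProcessor(sample_rate=8000)
# order=2 adds deltas and delta-deltas on top of the static MFCCs
delta_processor = DeltaPostProcessor(order=2)
# 'BabelMulti' is the multilingual weight set used elsewhere in these examples
bnf_processor = BottleneckProcessor(weights='BabelMulti')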
Example #6
def test_segment(audio):
    d = audio.duration
    assert audio.segment([(0., d)])[0] == audio
    assert audio.segment([(0., d+10)])[0] == audio

    chunks = audio.segment([(0, d/2), (d/2, d)])
    assert all(c.duration == pytest.approx(d/2, rel=1e-3) for c in chunks)
    assert sum(c.nsamples for c in chunks) == audio.nsamples
    assert Audio(
        np.concatenate([c.data for c in chunks]), audio.sample_rate) == audio

    chunks = audio.segment([(0, d/3), (d/3, 2*d/3), (2*d/3, d)])
    assert all(c.duration == pytest.approx(d/3, rel=1e-3) for c in chunks)
    assert sum(c.nsamples for c in chunks) == audio.nsamples
    assert Audio(
        np.concatenate([c.data for c in chunks]), audio.sample_rate) == audio
Example #7
    def get_features(self, y, sample_rate):
        """Feature extraction

        Parameters
        ----------
        y : (n_samples, 1) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Returns
        -------
        data : (n_frames, n_dimensions) numpy array
            Features
        """
        # scale the audio signal between -1 and 1 before creating the
        # shennong Audio object: when pyannote applies data augmentation it
        # normalizes the signal, but when loading the data without
        # augmentation it does not, so normalize here to stay consistent
        y = y / np.max((-np.min(y), np.max(y)))

        # create audio object for shennong
        audio = Audio(data=y, sample_rate=sample_rate)

        # create processor
        processor = BottleneckProcessor(weights=self.weights)

        # define parameters

        #processor.frame_length = self.duration
        #processor.frame_shift = self.step

        # extract features
        bottleneck = processor.process(audio)

        # Compute Pitch
        if self.with_pitch:
            # extract pitch
            pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)

            ## concatenate bottleneck w/ pitch - sometimes Kaldi adds one
            ## frame to pitch, so give 2 frames of tolerance
            #bottleneck = bottleneck.concatenate(pitch, 2)
            bottleneck = self.concatenate_with_pitch(bottleneck.data,
                                                     pitch.data)
            ## add 1 frame at the beginning and 1 frame at the end to ensure
            ## that we have the same length as the MFCCs etc.
            bottleneck = np.insert(bottleneck,
                                   0,
                                   np.zeros((1, bottleneck.shape[1])),
                                   axis=0)
            bottleneck = np.insert(bottleneck,
                                   bottleneck.shape[0],
                                   np.zeros((1, bottleneck.shape[1])),
                                   axis=0)
        else:
            bottleneck = bottleneck.data

        return bottleneck
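
The concatenate_with_pitch helper called above is not shown in the snippet. Here is a minimal standalone sketch of what such a helper could do, assuming its only job is to trim both arrays to a common number of frames (tolerating the one or two extra frames Kaldi sometimes produces for pitch) and stack them along the feature axis; the body is an illustration, not the original implementation.

import numpy as np

def concatenate_with_pitch(features, pitch, tolerance=2):
    # trim both arrays to the same number of frames, refusing to proceed
    # if the mismatch exceeds the tolerance, then stack them column-wise
    diff = abs(features.shape[0] - pitch.shape[0])
    if diff > tolerance:
        raise ValueError('frame count mismatch too large: {} vs {}'.format(
            features.shape[0], pitch.shape[0]))
    nframes = min(features.shape[0], pitch.shape[0])
    return np.hstack((features[:nframes], pitch[:nframes]))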
Example #8
def extract_features_shennong(audio_path, save_path):
    audio = Audio.load(audio_path)
    # 40-dim filterbank, no energy coefficient
    processor = FilterbankProcessor(sample_rate=audio.sample_rate,
                                    num_bins=40,
                                    use_energy=False)
    fbank = processor.process(audio)
    fbank = fbank.data  #(fbank.data - fbank.data.mean()) / fbank.data.std()

    # 3-dim pitch
    options = {
        'sample_rate': audio.sample_rate,
        'frame_shift': 0.01,
        'frame_length': 0.025,
        'min_f0': 20,
        'max_f0': 500
    }
    processor = PitchProcessor(**options)
    pitch = processor.process(audio)
    postprocessor = PitchPostProcessor()  # use default options
    postpitch = postprocessor.process(pitch)  # 3 dim
    postpitch = postpitch.data  #(postpitch.data - postpitch.data.mean()) / postpitch.data.std()
    #features = postpitch
    shape = min(fbank.shape[0], postpitch.shape[0])
    #zero = np.zeros((,content[i].shape[1]),dtype=np.float32)
    #content[i] = np.vstack((content[i],zero))
    features = np.concatenate((fbank[:shape, :], postpitch[:shape, :]),
                              axis=-1)

    # name = os.path.basename(audio_path).split('.')[0] + '.npy'
    # np.save(os.path.join(save_path, name), features.data)
    return features
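
A possible call of extract_features_shennong, mirroring the commented-out saving logic at the end of the function; the paths below are placeholders.

import os
import numpy as np

features = extract_features_shennong('utt001.wav', '/tmp/features')
name = os.path.splitext(os.path.basename('utt001.wav'))[0] + '.npy'
np.save(os.path.join('/tmp/features', name), features)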
Example #9
def test_save(tmpdir, audio):
    p = str(tmpdir.join('test.wav'))
    audio.save(p)

    # cannot overwrite an existing file
    with pytest.raises(ValueError) as err:
        audio.save(p)
    assert 'file already exist' in str(err)

    audio2 = Audio.load(p)
    assert audio == audio2

    # test with float32 wav
    signal = np.zeros((1000,), dtype=np.float32)
    signal[10] = 1.0
    signal[20] = -1.0
    p = str(tmpdir.join('test2.wav'))
    audio = Audio(signal, 1000)
    audio.save(p)
    meta = Audio.scan(p)
    assert meta.nsamples == 1000
    assert meta.nchannels == 1

    audio2 = Audio.load(p)
    assert audio2 == audio
    assert audio2.data.min() == -1.0
    assert audio2.data.max() == 1.0
Example #10
    def get_features(self, y, sample_rate):
        """Feature extraction

        Parameters
        ----------
        y : (n_samples, 1) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Returns
        -------
        data : (n_frames, n_dimensions) numpy array
            Features
        """
        # scale the audio signal between -1 and 1 before creating the
        # shennong Audio object: when pyannote applies data augmentation it
        # normalizes the signal, but when loading the data without
        # augmentation it does not, so normalize here to stay consistent
        y = y / np.max((-np.min(y), np.max(y)))

        # create audio object for shennong
        audio = Audio(data=y, sample_rate=sample_rate)

        # create filterbank processor
        processor = FilterbankProcessor(sample_rate=sample_rate)

        # use energy ?
        processor.use_energy = self.e

        # set parameters
        processor.frame_length = self.duration
        processor.frame_shift = self.step
        processor.window_type = self.fftWindow
        processor.low_freq = self.melLowFreq
        processor.high_freq = self.melHighFreq
        processor.num_bins = self.melNbFilters
        processor.snip_edges = False

        # process audio to get filterbanks
        fbank = processor.process(audio)

        # Compute Pitch
        if self.with_pitch:
            # extract pitch
            pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)

            ## concatenate fbank w/ pitch - sometimes Kaldi adds one frame
            ## to pitch, so give 2 frames of tolerance
            #fbank = fbank.concatenate(pitch, 2)
            fbank = self.concatenate_with_pitch(fbank.data, pitch.data)
        else:
            fbank = fbank.data

        return fbank
Example #11
def test_silence():
    silence = Audio(np.zeros((100, )), 16000)

    with pytest.raises(RuntimeError) as err:
        BottleneckProcessor().process(silence)
    assert 'no voice detected in signal' in str(err.value)

    # silence VAD all false
    vad = _compute_vad(silence.data, null_logger(), bugfix=True)
    assert not vad.any()
Example #12
    def get_features(self, y, sample_rate):
        """Feature extraction

        Parameters
        ----------
        y : (n_samples, 1) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Returns
        -------
        data : (n_frames, n_dimensions) numpy array
            Features
        """
        # scale the audio signal between -1 and 1 before creating the
        # shennong Audio object: when pyannote applies data augmentation it
        # normalizes the signal, but when loading the data without
        # augmentation it does not, so normalize here to stay consistent
        y = y / np.max((-np.min(y), np.max(y)))

        # create audio object for shennong
        audio = Audio(data=y, sample_rate=sample_rate)

        # spectrogram parameters
        processor = SpectrogramProcessor(sample_rate=sample_rate)
        processor.window_type = self.window_type
        processor.dither = self.dither
        processor.preemph_coeff = self.preemph_coeff
        processor.remove_dc_offset = self.remove_dc_offset
        processor.round_to_power_of_two = self.round_to_power_of_two
        processor.blackman_coeff = self.blackman_coeff
        processor.energy_floor = self.energy_floor
        processor.raw_energy = self.raw_energy

        processor.snip_edges = False  # end with correct number of frames

        # spectrogram extraction
        spect = processor.process(audio)

        # Compute Pitch
        if self.with_pitch:
            # extract pitch
            pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)

            ## concatenate spect w/ pitch - sometimes Kaldi adds one frame
            ## to pitch, so give 2 frames of tolerance
            spect = self.concatenate_with_pitch(spect.data, pitch.data)

        else:
            spect = spect.data

        return spect
Example #13
def test_bad_signal(audio):
    signal = Audio(np.random.random((10, 2)), 50)
    proc = SpectrogramProcessor(sample_rate=signal.sample_rate)
    with pytest.raises(ValueError) as err:
        proc.process(signal)
    assert 'signal must have one dimension' in str(err)

    with pytest.raises(ValueError) as err:
        proc = SpectrogramProcessor(sample_rate=signal.sample_rate + 1)
        proc.process(audio)
    assert 'mismatch in sample rates' in str(err)
Example #14
def get_plp_dd(wav_fn, norm):
    """Return the MFCCs with deltas and delta-deltas for a audio file."""
    audio = Audio.load(wav_fn)
    processor = PlpProcessor(sample_rate=audio.sample_rate,
                             window_type="hamming",
                             frame_length=0.025,
                             frame_shift=0.01,
                             low_freq=0,
                             vtln_low=60,
                             vtln_high=7200,
                             high_freq=audio.sample_rate / 2)
    plp_static = processor.process(audio, vtln_warp=1.0)
    d_processor = DeltaPostProcessor(order=2)
    plp_deltas = d_processor.process(plp_static)
    features = np.float64(plp_deltas._to_dict()["data"])
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features, axis=0)

    return features
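
A possible call of get_plp_dd; the wav path is a placeholder. With the default 13 PLP coefficients plus order-2 deltas the result should have 39 columns, but that dimensionality is an expectation, not something shown in the snippet.

feats = get_plp_dd('utt001.wav', norm='cmvn')  # placeholder path
print(feats.shape)  # expected (n_frames, 39): 13 PLPs + deltas + delta-deltas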
Example #15
def test_shape():
    # there was a bug when audio data is shaped (n, 1): it must be reshaped
    # as (n,). The bug happens when converting audio data to a pykaldi
    # vector.
    d1 = np.random.random((100,))
    assert d1.shape == (100,)

    d2 = np.random.random((100, 1))
    assert d2.shape == (100, 1)

    for d in (d1, d2):
        a = Audio(d, 10)
        assert a.shape == (100,)
Example #16
    def get_audio(self, utterance):
        """Returns the audio data for that `utterance`"""
        utt = self.utterances[utterance]
        audio = Audio.load(utt.file)
        if utt.tstart is not None:
            assert utt.tstop > utt.tstart
            audio = audio.segment([(utt.tstart, utt.tstop)])[0]

        if self.features == 'bottleneck':
            # resample the signal here (this avoids bugs if one part of
            # the pipeline runs at 8k and the other at 16k), then update
            # the metadata for the wav to be used by the rest of the
            # pipeline
            self.log.debug(
                'resampling audio from %dHz@%db to %dHz@%db',
                audio.sample_rate, audio.dtype.itemsize * 8, 8000, 16)

            audio = audio.resample(8000).astype(np.int16)
            self._wavs_metadata[self.utterances[utterance].file] = (
                Audio._metawav(
                    audio.nchannels, audio.sample_rate,
                    audio.nsamples, audio.duration))
        return audio
Example #17
def transform_all_wavs(folder_wav, type, folder_out):  # outputs arrays of shape (time, dim)
    processor = BottleneckProcessor(weights=type)
    count = 0
    for file in os.listdir(folder_wav):
        if count % 500 == 0:
            print(count)
        count += 1
        if not file.endswith('.wav'):
            continue
        audio = Audio.load(os.path.join(folder_wav, file))

        features = processor.process(audio)
        #print(features.shape)
        #print(features)
        np.savetxt(fname=os.path.join(folder_out, file[:-4] + '.csv'), X=features.data)
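
A possible call of transform_all_wavs; the folder names are placeholders and 'BabelMulti' is one of the pretrained weight sets used with BottleneckProcessor elsewhere in these examples.

transform_all_wavs('wavs/', 'BabelMulti', 'bottleneck_csv/')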
Example #18
def test_output(audio):
    assert MfccProcessor(frame_shift=0.01).process(audio).shape == (140, 13)
    assert MfccProcessor(frame_shift=0.02).process(audio).shape == (70, 13)
    assert MfccProcessor(frame_shift=0.02,
                         frame_length=0.05).process(audio).shape == (69, 13)

    # sample rate mismatch
    with pytest.raises(ValueError):
        MfccProcessor(sample_rate=8000).process(audio)

    # only mono signals are accepted
    with pytest.raises(ValueError):
        data = np.random.random((1000, 2))
        stereo = Audio(data, sample_rate=16000)
        MfccProcessor(sample_rate=stereo.sample_rate).process(stereo)
Example #19
def test_compare_kaldi(wav_file):
    a1 = Audio.load(wav_file).data

    with tempfile.NamedTemporaryFile('w+') as tfile:
        tfile.write('test {}\n'.format(wav_file))
        tfile.seek(0)
        with SequentialWaveReader('scp,t:' + tfile.name) as reader:
            for key, wave in reader:
                a2 = wave.data().numpy()

    assert a1.max() == a2.max()
    assert a1.min() == a2.min()
    assert len(a1) == len(a2.flatten()) == 22713
    assert a1.dtype == np.int16 and a2.dtype == np.float32
    assert a1.shape == (22713,) and a2.shape == (1, 22713)
    assert a1 == pytest.approx(a2.flatten())
Example #20
    def __init__(self,
                 config,
                 utterances,
                 log=get_logger('manager', 'warning')):
        self._config = config
        self._utterances = utterances
        self._warps = {}
        self.log = log

        self._check_utterances()

        # store the metadata because we need to access the sample rate
        # for processors instantiation
        audio_files = set(utt.audio_file for utt in utterances)
        self._audio_metadata = {}
        for audio in audio_files:
            log.debug('scanning %s', audio)
            self._audio_metadata[audio] = Audio.scan(audio)

        # make sure all the audio files are compatible with the pipeline
        log.info('scanning %s utterances...', len(self._utterances))
        self._check_audio_files()

        # the features type to be extracted
        self.features = [
            k for k in self.config.keys() if k in self.valid_features
        ][0]

        # get some framing parameters constant for all processors
        # (retrieve them from a features processor instance)
        proc = self.get_features_processor(next(iter(self.utterances)))
        self.frame_length = proc.frame_length
        self.frame_shift = proc.frame_shift

        # if CMVN by speaker, instantiate a CMVN processor per speaker
        # here, else instantiate a processor per utterance
        if 'cmvn' in self.config:
            if self.config['cmvn']['by_speaker']:
                self._cmvn_processors = {
                    spk: self.get_processor_class('cmvn')(proc.ndims)
                    for spk in set(utt.speaker for utt in self.utterances)
                }
            else:
                self._cmvn_processors = {
                    utt.name: self.get_processor_class('cmvn')(proc.ndims)
                    for utt in self.utterances
                }
Example #21
def test_output_shape(audio):
    assert EnergyProcessor(frame_shift=0.01).process(audio).shape == (140, 1)
    assert EnergyProcessor(frame_shift=0.02).process(audio).shape == (70, 1)
    assert EnergyProcessor(frame_shift=0.02,
                           frame_length=0.05).process(audio).shape == (69, 1)

    # sample rate mismatch
    with pytest.raises(ValueError) as err:
        EnergyProcessor(sample_rate=8000).process(audio)
    assert 'mismatch in sample rate' in str(err)

    # only mono signals are accepted
    with pytest.raises(ValueError) as err:
        data = np.random.random((1000, 2))
        stereo = Audio(data, sample_rate=16000)
        EnergyProcessor(sample_rate=stereo.sample_rate).process(stereo)
    assert 'must have one dimension' in str(err)
Example #22
    def get_audio(self, utterance):
        """Returns the audio data for that `utterance`"""
        audio = utterance.load_audio()

        if self.features == 'bottleneck':
            # resample the signal here (this avoids bugs if one part of the
            # pipeline runs at 8k and the other at 16k), then update the metadata
            # for the audio to be used by the rest of the pipeline
            self.log.debug('resampling audio from %dHz@%db to %dHz@%db',
                           audio.sample_rate, audio.dtype.itemsize * 8, 8000,
                           16)

            audio = audio.resample(8000).astype(np.int16)
            self._audio_metadata[utterance.audio_file] = (Audio._metadata(
                audio.nchannels, audio.sample_rate, audio.nsamples,
                audio.duration))
        return audio
Example #23
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('data_dir', help='input directory with wavs')
    parser.add_argument(
        'output_dir',
        default='/tmp',
        nargs='?',
        help='output directory (created files are deleted at exit)')

    args = parser.parse_args()

    # load audio data and compute total duration
    audio_data = {
        os.path.basename(f): Audio.load(f)
        for f in list_files_with_extension(args.data_dir, '.wav')
    }
    total_duration = datetime.timedelta(
        seconds=int(sum(a.duration for a in audio_data.values())))
    print('found {} wav files, total duration of {}'.format(
        len(audio_data), str(total_duration)))

    # compute the features (default MFCC)
    print('computing MFCC features...')
    t1 = datetime.datetime.now()
    processor = MfccProcessor()
    features = FeaturesCollection(
        **{k: processor.process(v)
           for k, v in audio_data.items()})
    t2 = datetime.datetime.now()
    print('took {}'.format(t2 - t1))

    # save the features in all the supported formats
    data = {
        'duration': total_duration,
        'data': {
            ext: analyze_serializer(features, ext, args.output_dir)
            for ext in supported_extensions().keys()
        }
    }

    print_results(data)
Example #24
def get_features(sound_file, chosen_processor):
    """Compute the feature coefficients of a sound file.

    :param sound_file: path to a sound file in .wav format
    :param chosen_processor: name of the features to extract, one of
        'filterbank', 'plp', 'rastaplp' or 'bottleneck'
    :returns: the feature coefficients per frame of 25ms every 10ms
    :rtype: a pandas DataFrame
    """

    audio = Audio.load(sound_file)
    processors = {
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')
    }

    features = processors[chosen_processor].process(audio)
    features = pd.DataFrame(features.data)
    return features
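
A possible call of get_features, assuming chosen_processor is one of the keys of the processors dict; the wav path is a placeholder.

df = get_features('utt001.wav', 'plp')  # the key selects PlpProcessor
print(df.shape)  # (n_frames, n_dimensions)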
Example #25
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('wav', help='wav file to compute features on')

    # load the wav file
    wav_file = parser.parse_args().wav
    audio = Audio.load(wav_file)

    # initialize features processors
    processors = {
        'spectrogram': SpectrogramProcessor(sample_rate=audio.sample_rate),
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'mfcc': MfccProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # compute the features for all processors
    features = {k: v.process(audio) for k, v in processors.items()}

    # plot the audio signal and the resulting features
    fig, axes = plt.subplots(
        nrows=len(processors)+1,
        gridspec_kw={'top': 0.95, 'bottom': 0.05, 'hspace': 0},
        subplot_kw={'xticks': [], 'yticks': []})
    time = np.arange(0.0, audio.nsamples) / audio.sample_rate
    axes[0].plot(time, audio.astype(np.float32).data)
    axes[0].set_xlim(0.0, audio.duration)
    axes[0].text(
        0.02, 0.8, 'audio',
        bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
        transform=axes[0].transAxes)

    for n, (k, v) in enumerate(features.items(), start=1):
        axes[n].imshow(v.data.T, aspect='auto')
        axes[n].text(
            0.02, 0.8, k,
            bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
            transform=axes[n].transAxes)

    plt.show()
Example #26
    def __init__(self, config, utterances, log=get_logger()):
        self._config = config
        self._utterances = utterances
        self.log = log

        # the list of speakers
        self._speakers = set(u.speaker for u in self.utterances.values())
        if self._speakers == {None}:
            self._speakers = None
        self._check_speakers()

        # store the metadata because we need to access the sample rate
        # for processors instantiation
        wavs = set(u.file for u in utterances.values())
        self._wavs_metadata = {w: Audio.scan(w) for w in wavs}

        # make sure all the wavs are compatible with the pipeline
        log.info(f'scanning {len(self._utterances)} utterances...')
        self._check_wavs()

        # the features type to be extracted
        self.features = [
            k for k in self.config.keys() if k in self._valid_features][0]

        # get some framing parameters constant for all processors
        # (retrieve them from a features processor instance)
        p = self.get_features_processor(next(iter(self.utterances.keys())))
        self.frame_length = p.frame_length
        self.frame_shift = p.frame_shift

        # if CMVN by speaker, instantiate a CMVN processor per speaker
        # here, else instantiate a processor per utterance
        if 'cmvn' in self.config:
            if self.config['cmvn']['by_speaker']:
                self._cmvn_processors = {
                    spk: self.get_processor_class('cmvn')(p.ndims)
                    for spk in self.speakers}
            else:
                self._cmvn_processors = {
                    utt: self.get_processor_class('cmvn')(p.ndims)
                    for utt in self.utterances}
Example #27
def get_mfcc_vtln(wav_fn, f, norm, lang):
    """Return the MFCCs with deltas and delta-deltas for a audio file."""
    ref = os.path.basename(f).replace(".wav", "")
    if not os.path.isfile("warps_{}.pkl".format(lang)):
        if os.path.isfile('warps_{}.txt'.format(lang)):
            factors = {}
            with open('warps_{}.txt'.format(lang), mode='r',
                      encoding='utf-8') as opfile:
                wop = opfile.read().split('\n')
                for line in wop:
                    if len(line) > 1:
                        l_sp = line.split()
                        factors[l_sp[0]] = float(l_sp[1])
                        print(factors)
            with open('warps_{}.pkl'.format(lang), mode='wb') as opfile:
                pickle.dump(factors, opfile)
        else:
            print('no warp factors found')
            exit()
    with open("warps_{}.pkl".format(lang), mode="rb") as op:
        factors = pickle.load(op)
    warp = float(factors[ref])
    audio = Audio.load(wav_fn)
    processor = MfccProcessor(sample_rate=audio.sample_rate,
                              window_type="hamming",
                              frame_length=0.025,
                              frame_shift=0.01,
                              cepstral_lifter=26.0,
                              low_freq=0,
                              vtln_low=60,
                              vtln_high=7200,
                              high_freq=audio.sample_rate / 2)
    d_processor = DeltaPostProcessor(order=2)
    mfcc_static = processor.process(audio, vtln_warp=warp)
    mfcc_deltas = d_processor.process(mfcc_static)
    features = np.float64(mfcc_deltas._to_dict()["data"])
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features,
                                                                   axis=0)

    return features
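
get_mfcc_vtln expects a warps_<lang>.txt file with one "<wav basename> <warp factor>" pair per line, which it converts to a pickle on first use. The sketch below writes such a file; the language suffix, basenames and warp values are placeholders for illustration.

with open('warps_english.txt', 'w', encoding='utf-8') as warps:
    warps.write('speaker1_utt001 0.95\n')
    warps.write('speaker1_utt002 1.00\n')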
Example #28
def test_channels_stereo():
    data = np.random.random((1000, 2))
    audio2 = Audio(data, sample_rate=16000)
    assert audio2.nchannels == 2
    assert audio2.shape == (1000, 2)

    audio1 = audio2.channel(0)
    assert audio1.nchannels == 1
    assert audio1.shape == (1000,)
    assert all(np.equal(audio1.data, audio2.data[:, 0]))
    assert not all(np.equal(audio1.data, audio2.data[:, 1]))
    assert audio1.duration == audio2.duration

    audio1 = audio2.channel(1)
    assert audio1.nchannels == 1
    assert audio1.shape == (1000,)
    assert all(np.equal(audio1.data, audio2.data[:, 1]))
    assert not all(np.equal(audio1.data, audio2.data[:, 0]))

    with pytest.raises(ValueError):
        audio2.channel(2)
Example #29
        k = shortest_path_position[0][0]
        l = shortest_path_position[1][0]

    # divide the shortest distance by the length of the path
    average_distance = (distance_matrix[vector_1.shape[0]-1][vector_2.shape[0]-1]) \
                        / path_length
    return average_distance


all_features = {}

# get bottleneck features of all .wav files (stimuli)
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(os.path.join(root, wav_file))
            processor = BottleneckProcessor(weights='BabelMulti')
            features = processor.process(audio)
            vectors = features.data
            utterance = wav_file.split('.')[0]
            all_features[utterance] = vectors

for row in distance_list.itertuples():
    row_index = getattr(row, 'Index')
    trip_id = getattr(row, 'tripletid')
    bottle_oth = all_features[trip_id + "_OTH"]
    bottle_tgt = all_features[trip_id + "_TGT"]
    bottle_x = all_features[trip_id + "_X"]

    eucl_oth_x = \
        calculate_distances_dtw(bottle_oth,\
Example #30
def test_bad_signal():
    signal = Audio(np.random.random((10, 2)), 50)
    proc = RastaPlpProcessor(sample_rate=signal.sample_rate)
    with pytest.raises(ValueError) as err:
        proc.process(signal)
    assert 'signal must have one dimension' in str(err.value)