예제 #1
0
def test_save(tmpdir, audio):
    p = str(tmpdir.join('test.wav'))
    audio.save(p)

    # cannot overwrite an existing file
    with pytest.raises(ValueError) as err:
        audio.save(p)
    assert 'file already exist' in str(err)

    audio2 = Audio.load(p)
    assert audio == audio2

    # test with float32 wav
    signal = np.zeros((1000,), dtype=np.float32)
    signal[10] = 1.0
    signal[20] = -1.0
    p = str(tmpdir.join('test2.wav'))
    audio = Audio(signal, 1000)
    audio.save(p)
    meta = Audio.scan(p)
    assert meta.nsamples == 1000
    assert meta.nchannels == 1

    audio2 = Audio.load(p)
    assert audio2 == audio
    assert audio2.data.min() == -1.0
    assert audio2.data.max() == 1.0
예제 #2
0
def extract_features_shennong(audio_path, save_path):
    audio = Audio.load(audio_path)
    # 80-dim fbank with 1-dim energe
    processor = FilterbankProcessor(sample_rate=audio.sample_rate,
                                    num_bins=40,
                                    use_energy=False)  #80 fbank + 1 energy
    fbank = processor.process(audio)
    fbank = fbank.data  #(fbank.data - fbank.data.mean()) / fbank.data.std()

    # 3-dim pitch
    processor = PitchProcessor(frame_shift=0.01, frame_length=0.025)
    options = {
        'sample_rate': audio.sample_rate,
        'frame_shift': 0.01,
        'frame_length': 0.025,
        'min_f0': 20,
        'max_f0': 500
    }
    processor = PitchProcessor(**options)
    pitch = processor.process(audio)
    postprocessor = PitchPostProcessor()  # use default options
    postpitch = postprocessor.process(pitch)  # 3 dim
    postpitch = postpitch.data  #(postpitch.data - postpitch.data.mean()) / postpitch.data.std()
    #features = postpitch
    shape = min(fbank.shape[0], postpitch.shape[0])
    #zero = np.zeros((,content[i].shape[1]),dtype=np.float32)
    #content[i] = np.vstack((content[i],zero))
    features = np.concatenate((fbank[:shape, :], postpitch[:shape, :]),
                              axis=-1)

    # name = os.path.basename(audio_path).split('.')[0] + '.npy'
    # np.save(os.path.join(save_path, name), features.data)
    return features
def wavs_to_feats_df(wavs_list, feats):

    assert feats in ['mfcc', 'bnf'], "Unknown feature parameter for wavs_to_feats_df function: {}".format(feats)

    feats_list = []

    for wav_file in wavs_list:

        wav_data = Audio.load(wav_file).resample(8000)

        assert wav_data.sample_rate == 8000, "Error. Could not resample file to 8000 Hz for MFCC/BNF feature extraction."
        assert wav_data.nchannels == 1, "Unexpected non-mono file supplied: {}".format(filename)

        if feats == 'mfcc':
            mfcc_data = mfcc_processor.process(wav_data)
            mfcc_data = delta_processor.process(mfcc_data)
            feats_list.append(mfcc_data.data)

        elif feats == 'bnf':
            bnf_data = bnf_processor.process(wav_data)
            feats_list.append(bnf_data.data)        

    feats_df = pd.DataFrame({
        "filename" : [ os.path.splitext(os.path.basename(f))[0] for f in wavs_list ], # '.../filename.wav' => 'filename',
        "features" : feats_list
    })

    return feats_df
예제 #4
0
def test_check_wavs_bad(wav_file, wav_file_8k, tmpdir, capsys):
    def fun(utts):
        c = pipeline._init_config(
            pipeline.get_default_config('mfcc', with_cmvn=False))
        u = pipeline._init_utterances(utts)
        pipeline._Manager(c, u)
        return u

    # build a stereo file and make sure it is not supported by the
    # pipeline
    audio = Audio.load(wav_file)
    stereo = Audio(np.asarray((audio.data, audio.data)).T,
                   sample_rate=audio.sample_rate)
    assert stereo.nchannels == 2
    wav_file_2 = str(tmpdir.join('stereo.wav'))
    stereo.save(wav_file_2)
    with pytest.raises(ValueError) as err:
        fun([(wav_file_2, )])
    assert 'all wav files are not mono' in str(err)

    # ensure we catch differences in sample rates
    capsys.readouterr()  # clear buffer
    w = [(wav_file, ), (wav_file_8k, )]
    out = fun(w)
    err = capsys.readouterr().err
    assert 'several sample rates found in wav files' in err
    assert sorted(out.keys()) == ['utt_1', 'utt_2']

    # make sure timestamps are ordered
    with pytest.raises(ValueError) as err:
        fun([('1', wav_file, 1, 0)])
    assert 'timestamps are not in increasing order for' in str(err)
예제 #5
0
def get_plp_dd(wav_fn, norm):
    """Return the MFCCs with deltas and delta-deltas for a audio file."""
    audio = Audio.load(wav_fn)
    processor = PlpProcessor(sample_rate=audio.sample_rate, window_type="hamming",frame_length=0.025, frame_shift=0.01,
                              low_freq=0, vtln_low=60, vtln_high=7200, high_freq=audio.sample_rate/2)
    plp_static = processor.process(audio, vtln_warp=1.0)
    d_processor = DeltaPostProcessor(order=2)
    plp_deltas = d_processor.process(plp_static)
    features = np.float64(plp_deltas._to_dict()["data"])
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features, axis=0)

    return features
def transform_all_wavs(folder_wav, type, folder_out): # will output [timexdim}
    processor = BottleneckProcessor(weights=type)
    count = 0
    for file in os.listdir(folder_wav):
        if count % 500 == 0:
            print(count)
        count += 1
        if not file.endswith('.wav'):
            continue
        audio = Audio.load(os.path.join(folder_wav, file))

        features = processor.process(audio)
        #print(features.shape)
        #print(features)
        np.savetxt(fname = os.path.join(folder_out,file[:-4] + '.csv'), X=features._data)
예제 #7
0
def test_compare_kaldi(wav_file):
    a1 = Audio.load(wav_file).data

    with tempfile.NamedTemporaryFile('w+') as tfile:
        tfile.write('test {}\n'.format(wav_file))
        tfile.seek(0)
        with SequentialWaveReader('scp,t:' + tfile.name) as reader:
            for key, wave in reader:
                a2 = wave.data().numpy()

    assert a1.max() == a2.max()
    assert a1.min() == a2.min()
    assert len(a1) == len(a2.flatten()) == 22713
    assert a1.dtype == np.int16 and a2.dtype == np.float32
    assert a1.shape == (22713,) and a2.shape == (1, 22713)
    assert pytest.approx(a1, a2)
예제 #8
0
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('data_dir', help='input directory with wavs')
    parser.add_argument(
        'output_dir',
        default='/tmp',
        nargs='?',
        help='output directory (created files are deleted at exit)')

    args = parser.parse_args()

    # load audio data and compute total duration
    audio_data = {
        os.path.basename(f): Audio.load(f)
        for f in list_files_with_extension(args.data_dir, '.wav')
    }
    total_duration = datetime.timedelta(
        seconds=int(sum(a.duration for a in audio_data.values())))
    print('found {} wav files, total duration of {}'.format(
        len(audio_data), str(total_duration)))

    # compute the features (default MFCC)
    print('computing MFCC features...')
    t1 = datetime.datetime.now()
    processor = MfccProcessor()
    features = FeaturesCollection(
        **{k: processor.process(v)
           for k, v in audio_data.items()})
    t2 = datetime.datetime.now()
    print('took {}'.format(t2 - t1))

    # save the features in all the supported formats
    data = {
        'duration': total_duration,
        'data': {
            ext: analyze_serializer(features, ext, args.output_dir)
            for ext in supported_extensions().keys()
        }
    }

    print_results(data)
def get_features(sound_file, chosen_processor):
    # computes the feature coefficients of a sound file

    #     :param sound_file : sound file in format .wav
    #     :type amount: .wav file
    #     :returns: feature coefficients per frame of 25ms every 10ms can be 'filterbank'
    #     'plp', 'rasteplp' or 'bottleneck'
    #     :rtype: a numpy array

    audio = Audio.load(sound_file)
    processors = {
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')
    }

    features = chosen_processor.process(audio)
    features = pd.DataFrame(features)
    return (features)
예제 #10
0
def get_mfcc_vtln(wav_fn, f, norm, lang):
    """Return the MFCCs with deltas and delta-deltas for a audio file."""
    ref = os.path.basename(f).replace(".wav", "")
    if not os.path.isfile("warps_{}.pkl".format(lang)):
        if os.path.isfile('warps_{}.txt'.format(lang)):
            factors = {}
            with open('warps_{}.txt'.format(lang), mode='r',
                      encoding='utf-8') as opfile:
                wop = opfile.read().split('\n')
                for line in wop:
                    if len(line) > 1:
                        l_sp = line.split()
                        factors[l_sp[0]] = float(l_sp[1])
                        print(factors)
            with open('warps_{}.pkl'.format(lang), mode='wb') as opfile:
                pickle.dump(factors, opfile)
        else:
            print('no warp factors found')
            exit()
    with open("warps_{}.pkl".format(lang), mode="rb") as op:
        factors = pickle.load(op)
    warp = float(factors[ref])
    audio = Audio.load(wav_fn)
    processor = MfccProcessor(sample_rate=audio.sample_rate,
                              window_type="hamming",
                              frame_length=0.025,
                              frame_shift=0.01,
                              cepstral_lifter=26.0,
                              low_freq=0,
                              vtln_low=60,
                              vtln_high=7200,
                              high_freq=audio.sample_rate / 2)
    d_processor = DeltaPostProcessor(order=2)
    mfcc_static = processor.process(audio, vtln_warp=warp)
    mfcc_deltas = d_processor.process(mfcc_static)
    features = np.float64(mfcc_deltas._to_dict()["data"])
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features,
                                                                   axis=0)

    return features
예제 #11
0
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('wav', help='wav file to compute features on')

    # load the wav file
    wav_file = parser.parse_args().wav
    audio = Audio.load(wav_file)

    # initialize features processors
    processors = {
        'spectrogram': SpectrogramProcessor(sample_rate=audio.sample_rate),
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'mfcc': MfccProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # compute the features for all processors
    features = {k: v.process(audio) for k, v in processors.items()}

    # plot the audio signal and the resulting features
    fig, axes = plt.subplots(
        nrows=len(processors)+1,
        gridspec_kw={'top': 0.95, 'bottom': 0.05, 'hspace': 0},
        subplot_kw={'xticks': [], 'yticks': []})
    time = np.arange(0.0, audio.nsamples) / audio.sample_rate
    axes[0].plot(time, audio.astype(np.float32).data)
    axes[0].set_xlim(0.0, audio.duration)
    axes[0].text(
        0.02, 0.8, 'audio',
        bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
        transform=axes[0].transAxes)

    for n, (k, v) in enumerate(features.items(), start=1):
        axes[n].imshow(v.data.T, aspect='auto')
        axes[n].text(
            0.02, 0.8, k,
            bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
            transform=axes[n].transAxes)

    plt.show()
예제 #12
0
    def get_audio(self, utterance):
        """Returns the audio data for that `utterance`"""
        utt = self.utterances[utterance]
        audio = Audio.load(utt.file)
        if utt.tstart is not None:
            assert utt.tstop > utt.tstart
            audio = audio.segment([(utt.tstart, utt.tstop)])[0]

        if self.features == 'bottleneck':
            # resample here the signal (this avoid bugs if one part of
            # the pipeline on 8k and the other on 16k), then update
            # the metadata for the wav to be used by the rest of the
            # pipeline
            self.log.debug(
                'resampling audio from %dHz@%db to %dHz@%db',
                audio.sample_rate, audio.dtype.itemsize * 8, 8000, 16)

            audio = audio.resample(8000).astype(np.int16)
            self._wavs_metadata[self.utterances[utterance].file] = (
                Audio._metawav(
                    audio.nchannels, audio.sample_rate,
                    audio.nsamples, audio.duration))
        return audio
예제 #13
0
def test_load_badfile():
    with pytest.raises(ValueError) as err:
        Audio.load('/spam/spam/with/eggs')
    assert 'file not found' in str(err)
예제 #14
0
def audio_8k(wav_file_8k):
    return Audio.load(wav_file_8k)
예제 #15
0
def test_load_notwav():
    with pytest.raises(ValueError) as err:
        Audio.load(__file__)
    assert 'is it a wav?' in str(err)
예제 #16
0
        k = shortest_path_position[0][0]
        l = shortest_path_position[1][0]

    # divide the shortest distance by the length of the path
    average_distance = (distance_matrix[vector_1.shape[0]-1][vector_2.shape[0]-1]) \
                        / path_length
    return average_distance


all_features = {}

# get bottleneck features of all .wav files (stimuli)
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(root + wav_file)
            processor = BottleneckProcessor(weights='BabelMulti')
            features = processor.process(audio)
            vectors = features.data
            utterance = wav_file.split('.')[0]
            all_features[utterance] = vectors

for row in distance_list.itertuples():
    row_index = getattr(row, 'Index')
    trip_id = getattr(row, 'tripletid')
    bottle_oth = all_features[trip_id + "_OTH"]
    bottle_tgt = all_features[trip_id + "_TGT"]
    bottle_x = all_features[trip_id + "_X"]

    eucl_oth_x = \
        calculate_distances_dtw(bottle_oth,\
예제 #17
0
def audio(wav_file):
    return Audio.load(wav_file)