Example #1
    def converter(self, batch, device=-1):
        # alternative to chainer.dataset.concat_examples
        DATA_SHAPE = 40 * 3  # 40 log filterbank channels + deltas + delta-deltas

        xs = [np.load(path).astype(np.float32) for path, _ in batch]
        delta_x = [delta(x, 3) for x in xs]
        delta_delta_x = [delta(x, 3) for x in delta_x]

        Xs = [
            to_device(self.device,
                      np.concatenate((a, b, c), axis=1).astype(np.float32))
            for a, (b, c) in zip(xs, zip(delta_x, delta_delta_x))
        ]

        # Xs = [F.concat((X, self.xp.zeros(((self.stacked_frames + self.skip_size) - len(X), DATA_SHAPE), dtype=self.xp.float32)), axis=0) if len(X) < (self.stacked_frames + self.skip_size) else X for X in Xs]

        # Xs = [F.pad_sequence([X[i:i + self.stacked_frames] for i in range(0, len(X), self.skip_size)]).reshape(-1, DATA_SHAPE * self.stacked_frames) for X in Xs]

        word_label = [
            self.xp.asarray(lab[0]).astype(self.xp.int32) for _, lab in batch
        ]
        char_label = [
            self.xp.asarray(lab[1]).astype(self.xp.int32) for _, lab in batch
        ]

        label_batch = char_label

        return Xs, label_batch
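Most of these examples call python_speech_features.base.delta(feat, N), which fits a least-squares slope over a window of ±N frames around each frame, repeating the edge frames as padding. A minimal NumPy sketch of that computation, for reference (a re-derivation of the documented behaviour, not the library source):

import numpy as np

def delta_sketch(feat, N):
    """Least-squares slope of each feature over a +/-N frame window.
    feat: (num_frames, num_features) array; N: half-window size."""
    denominator = 2 * sum(n ** 2 for n in range(1, N + 1))
    padded = np.pad(feat, ((N, N), (0, 0)), mode='edge')  # repeat edge frames
    return np.array([
        np.dot(np.arange(-N, N + 1), padded[t:t + 2 * N + 1]) / denominator
        for t in range(len(feat))
    ])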
Example #2
def test():
    (fs, x) = wavfile.read('a.wav')
    mfcc_feat = mfcc_wav('a.wav', 0.02, 0.01)
    mfcc_delta = base.delta(mfcc_feat, 2)
    mfcc_delta_delta = base.delta(mfcc_delta, 2)
    mfcc_feat = np.hstack([mfcc_feat, mfcc_delta, mfcc_delta_delta])

    features = mfcc_feat


    data_dim = 39
    timesteps = 8
    num_classes = 9
    print('Reshaping data for LSTM')
    features = reshape_data(features, timesteps)


    json_file = open('model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("model.h5")
    print("Loaded model from disk")
    loaded_model.compile(loss='categorical_crossentropy',
                         optimizer='rmsprop',
                         metrics=['accuracy'])
    prediction = loaded_model.predict(features)
    print(np.shape(prediction))
    print(take_max(prediction))
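mfcc_wav, reshape_data, and take_max are helpers this snippet assumes but does not show. A plausible sketch of reshape_data, which would need to group the 39-dimensional frames into windows of timesteps frames for the LSTM (the name comes from the snippet; the drop-remainder policy is a guess, not the original code):

import numpy as np

def reshape_data(features, timesteps):
    # Hypothetical helper: group (num_frames, data_dim) features into
    # (num_windows, timesteps, data_dim), dropping any trailing remainder.
    num_frames, data_dim = features.shape
    num_windows = num_frames // timesteps
    return features[:num_windows * timesteps].reshape(
        num_windows, timesteps, data_dim)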
Example #3
def dscc(signal,
         samplerate=16000,
         winlen=0.025,
         winstep=0.01,
         numcep=13,
         nfilt=26,
         nfft=512,
         lowfreq=0,
         highfreq=None,
         preemph=0.97,
         ceplifter=22,
         appendEnergy=True,
         winfunc=lambda x: numpy.ones((x, ))):
    feats, energies = base.fbank(signal, samplerate, winlen, winstep, nfilt,
                                 nfft, lowfreq, highfreq, preemph, winfunc)
    feats = base.delta(feats, 2)  # OBTAIN DELTA
    # NOTE: scipy.stats.boxcox expects 1-D positive input and returns a
    # (transformed, lmbda) tuple when no lmbda is given, so this call likely
    # needs adjustment before it runs as written
    feats = boxcox(feats)
    feats = numpy.log(feats)
    feats = dct(feats, type=2, axis=1, norm='ortho')[:, :numcep]
    feats = base.lifter(feats, ceplifter)
    if appendEnergy:
        feats[:, 0] = numpy.log(
            energies
        )  # replace first cepstral coefficient with log of frame energy
    feats = base.delta(feats, 2)  #verify if 2 is right
    return feats
Example #4
def get_features(audio_file):
    '''Get features from a file'''
    signal, sample_rate = sf.read(tf.gfile.FastGFile(audio_file, 'rb'))
    feat, energy = fbank(signal, sample_rate, nfilt=FLAGS.nfilt)
    feat = np.log(feat)
    dfeat = delta(feat, 2)
    ddfeat = delta(dfeat, 2)
    return np.concatenate([feat, dfeat, ddfeat, np.expand_dims(energy, 1)],
                          axis=1)
Example #5
def get_features(audio_file):
    """Get features from a file"""
    signal, sample_rate = sf.read(audio_file)
    feat, energy = fbank(signal, sample_rate, nfilt=FLAGS.nfilt)
    feat = np.log(feat)
    dfeat = delta(feat, 2)
    ddfeat = delta(dfeat, 2)
    return np.concatenate(
        [feat, dfeat, ddfeat, np.expand_dims(energy, 1)], axis=1)
Example #6
def extract_feature(wav_path):
    """Extract 39-dim mfcc feature."""
    fs, audio = wav.read(wav_path)
    mfcc = base.mfcc(audio,
                     fs,
                     winlen=0.025,
                     winstep=0.01,
                     numcep=13,
                     nfilt=26,
                     preemph=0.97,
                     appendEnergy=True)
    mfcc_d = base.delta(mfcc, N=2)
    mfcc_dd = base.delta(mfcc_d, N=2)
    feat = np.concatenate([mfcc, mfcc_d, mfcc_dd], axis=1)
    return feat
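A possible call, assuming a hypothetical file test.wav on disk; each row of the result stacks 13 MFCCs, 13 deltas, and 13 delta-deltas:

feat = extract_feature('test.wav')  # hypothetical path
print(feat.shape)                   # (num_frames, 39)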
Example #7
    def graves_2012(self, wav_path):
        """
    Alex. Graves:
    Sequence Transduction with Recurrent Neural Networks.
    CoRR abs/1211.3711 (2012)

    MFCC features
    Standard speech preprocessing was applied to transform the audio files
    into feature sequences. A 26-channel mel-frequency filter bank and a
    pre-emphasis coefficient of 0.97 were used to compute 12 mel-frequency
    cepstral coefficients plus an energy coefficient on 25ms Hamming windows
    at 10ms intervals. Delta coefficients were added to create input
    sequences of length-26 vectors.

    For CMVN,
    all coefficients were normalised to have mean zero and standard
    deviation one over the training set. ==> please set --prep-cmvn-samples
    to -1.

    I left the other options not mentioned in the paper (nfft, lowfreq,
    highfreq, ceplifter, etc.) at their defaults.

    :param wav_path: wav file path
    :return: a feature sequence
    """
        (rate, sig) = wav.read(Util.get_file_path(self.basepath, wav_path))
        # computing features
        mfcc_feat = \
          mfcc(signal=sig, samplerate=rate, numcep=12, winlen=0.025, nfilt=26,
               winstep=0.01, preemph=0.97, appendEnergy=False, winfunc=np.hamming)
        # adding energy
        energy = np.expand_dims(np.sum(np.power(mfcc_feat, 2), axis=-1), 1)
        mfcc_e_feat = np.concatenate((energy, mfcc_feat), axis=-1)
        # concatenating a delta vector
        delta_feat = delta(mfcc_e_feat, 1)
        return np.concatenate((mfcc_e_feat, delta_feat), axis=1)
Example #8
def computeDeltaDelta(nfilt=41):
    '''
    Computes the second derivative of the mel-fbanks representation. The
    function reads input data from the delta_buffer queue and puts the
    results in the delta_delta_buffer queue. The result for each frame is a
    tuple (fbanks+delta+deltaDelta, wave_form).
    Note: not used in this version of the pipeline
    '''
    num_frame = (fbanks_and_delta_window_size * 2 + 1)
    num_frame_VAD_window = (VAD_window_size * 2 + 1)
    while True:
        if delta_buffer.qsize() >= num_frame:
            last_N_frames = delta_buffer.get_last_n_frame(num_frame)
            last_N_frames = [frame[0] for frame in last_N_frames]
            last_N_frames = [d[:-nfilt] for d in last_N_frames]
            frame_delta = np.squeeze(np.asarray((delta_buffer[fbanks_and_delta_window_size])[0]))
            wave_form = (delta_buffer[fbanks_and_delta_window_size])[1]
            delta_buffer.pull_last_n_frame(1)
            frame_delta_delta = delta(last_N_frames, fbanks_and_delta_window_size)[fbanks_and_delta_window_size]
            delta_delta_frame = np.concatenate((frame_delta, frame_delta_delta))
            delta_delta_buffer.put((delta_delta_frame, wave_form))
            if delta_delta_buffer.isFirst:
                delta_delta_buffer.isFirst = False
                # add the first frame n times so the VAD can compute a result even for the first frame
                for _ in range(num_frame_VAD_window - 1):
                    delta_delta_buffer.put((delta_delta_frame, wave_form))
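delta_buffer and delta_delta_buffer are custom frame queues that this pipeline assumes as globals. Judging from the calls, they support put, qsize, indexing, get_last_n_frame, pull_last_n_frame, and an isFirst flag; a minimal sketch of such a buffer (reconstructed from usage, not the original class):

from collections import deque

class FrameBuffer:
    """Hypothetical frame queue matching the calls above."""
    def __init__(self):
        self.frames = deque()
        self.isFirst = True

    def put(self, frame):
        self.frames.append(frame)

    def qsize(self):
        return len(self.frames)

    def __getitem__(self, i):
        return self.frames[i]

    def get_last_n_frame(self, n):
        # the n oldest frames still in the buffer
        return list(self.frames)[:n]

    def pull_last_n_frame(self, n):
        # discard the n oldest frames
        for _ in range(n):
            self.frames.popleft()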
Example #9
def mfcc_features(
	wavarr, 
	win_len=5, 		# window length for feature extraction in secs - run_orig.m
	win_overlap=0, 	# specify the overlap between adjacent windows for feature extraction in percentage - run_orig.m
	nfft=0,
	lowfreq=5, 
	highfreq=1000,
	kDelta=False, 
	logging=False
):
	# rate, aud_data = scipy.io.wavfile.read(file)
	rate = wavarr[0]
	signal = wavarr[1]
	d_mfcc_feat = None

	if nfft == 0:
		nfft = fft.calculate_nfft(signal.size)		#FFT size as the padded next power-of-two

	mfcc_feat = base.mfcc(signal, rate,
		winlen=win_len,						#window_length*1000 in extractFeatures.m
		winstep=win_len-win_overlap,		#10ms shift; Ts = 10 in extractFeatures.m
		numcep=13,			 				#C=12; in extractFeatures.m
		nfilt=20, 							#M=20; in extractFeatures.m
		nfft=nfft,							#pad to next power-of-2
		lowfreq=lowfreq, highfreq=highfreq,	#LF=5; HF=1000; in extractFeatures.m
		preemph=0.97, ceplifter=22,			#alpha=0.97; L=22; in extractFeatures.m
		winfunc=np.hamming,					#@hamming
		appendEnergy=False					# replace first cepstral coefficient with log of frame energy
	)
	if kDelta:
		d_mfcc_feat = base.delta(mfcc_feat, 2)		#compute delta features from a feature vector
	#fbank_feat = sigproc.logfbank(signal, rate)	#compute log Mel-filterbank energy features from an audio signal

	return mfcc_feat, d_mfcc_feat
Example #10
def compute_fbanks_dataset(path="", nfilt=40):
    for filename in glob.glob(os.path.join(path, '*.wav')):
        sample_rate, audio_data = read(filename)
        # fbank returns (features, energy); logfbank returns a single array
        fbanks, energy = fbank(signal=audio_data,
                               samplerate=sample_rate,
                               nfilt=nfilt)
        fbanks = np.log(fbanks)
        fbanks = np.concatenate(
            (fbanks, np.reshape(energy, (energy.shape[0], 1))), axis=1)

        fbanks_delta = delta(feat=fbanks, N=win_size)
        fbanks_delta_delta = delta(feat=fbanks_delta, N=win_size)
        audio_features = np.concatenate(
            (fbanks, fbanks_delta, fbanks_delta_delta), axis=1)
        filename, _ = os.path.splitext(filename)
        print(filename)
        np.save(filename, audio_features)
Example #12
def computeDelta():
    '''
    Computes the first derivative of the mel-fbanks representation. The
    function reads input data from the audio_buffer queue and puts the
    results in the delta_buffer queue. The result for each frame is a tuple
    (fbanks+delta, wave_form).
    Note: not used in this version of the pipeline
    '''
    num_frame = (fbanks_and_delta_window_size * 2 + 1)
    while True:
        if audio_buffer.qsize() >= num_frame:
            last_N_frames = np.asarray(
                audio_buffer.get_last_n_frame(num_frame))
            last_N_frames = [frame[0] for frame in last_N_frames]
            fbanks = np.squeeze(
                np.asarray((audio_buffer[fbanks_and_delta_window_size])[0]))
            wave_form = (audio_buffer[fbanks_and_delta_window_size])[1]
            audio_buffer.pull_last_n_frame(1)
            frame_delta = delta(
                last_N_frames,
                fbanks_and_delta_window_size)[fbanks_and_delta_window_size]
            delta_frame = np.concatenate((fbanks, frame_delta))
            delta_buffer.put((delta_frame, wave_form))
            if delta_buffer.isFirst:
                for _ in range(fbanks_and_delta_window_size):
                    delta_buffer.put((delta_frame, wave_form))
                delta_buffer.isFirst = False
Example #13
def get_fbank_feature(wavsignal, fs):
    """
    Input: the raw wav signal and its sampling rate.
    Output: the speech FBANK features plus first- and second-order deltas.
    :param wavsignal: wav signal array
    :param fs: sampling rate
    :return: feature matrix of shape (num_frames, nfilt * 3)
    """
    feat_fbank = logfbank(wavsignal,
                          fs,
                          nfilt=40,
                          nfft=2048,
                          winstep=0.025,
                          winlen=0.05)
    feat_fbank_d = delta(feat_fbank, 2)
    feat_fbank_dd = delta(feat_fbank_d, 2)
    wav_feature = np.column_stack((feat_fbank, feat_fbank_d, feat_fbank_dd))
    return wav_feature
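With nfilt=40 the stacked output has 40 * 3 = 120 columns per frame. A possible call, assuming scipy.io.wavfile and a hypothetical file test.wav:

from scipy.io import wavfile

fs, wavsignal = wavfile.read('test.wav')  # hypothetical path
feat = get_fbank_feature(wavsignal, fs)
print(feat.shape)                         # (num_frames, 120)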
Example #14
def if_Miriam(ramkowanie, delta_memory):
    threshold = -50
    path = "waves/answer.wav"
    with open('model.bin', 'rb') as f:
        model = pickle.load(f)
    fs, data = wav.read(path)
    MFCC = mfcc(data, fs, winlen=ramkowanie,
                nfft=round(ramkowanie * fs) + 1, numcep=13)
    delta = base.delta(MFCC, delta_memory)
    delta_delta = base.delta(delta, delta_memory)
    MFCC_and_deltas = numpy.c_[MFCC, delta, delta_delta]
    score = model.score(MFCC_and_deltas)
    print(score)
    return score > threshold
Example #15
def get_mfcc_feature(wavsignal, fs):
    """
    Input: the raw wav signal and its sampling rate.
    Output: the speech MFCC features plus first- and second-order deltas.
    :param wavsignal: wav signal array
    :param fs: sampling rate
    :return: feature matrix of shape (num_frames, numcep * 3)
    """
    feat_mfcc = mfcc(wavsignal,
                     fs,
                     nfft=2048,
                     nfilt=40,
                     numcep=40,
                     winlen=0.05,
                     winstep=0.025)
    feat_mfcc_d = delta(feat_mfcc, 2)
    feat_mfcc_dd = delta(feat_mfcc_d, 2)
    wav_feature = np.column_stack((feat_mfcc, feat_mfcc_d, feat_mfcc_dd))
    return wav_feature
Example #16
def compute_mfcc(wav_path, winstep=0.01):

    (rate, sig) = wav.read(wav_path)

    mfcc_feat = mfcc(signal=sig,
                     samplerate=rate,
                     appendEnergy=True,
                     winstep=winstep)
    # Deltas
    d_mfcc_feat = delta(mfcc_feat, 2)
    # Deltas-Deltas
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    # transpose
    mfcc_feat = np.transpose(mfcc_feat)
    d_mfcc_feat = np.transpose(d_mfcc_feat)
    dd_mfcc_feat = np.transpose(dd_mfcc_feat)
    # concat above three features
    concat_mfcc_feat = np.concatenate((mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
    return concat_mfcc_feat
Example #17
def extract_features(audio_data, samplerate):
    mfcc_features = []
    hamming_window = numpy.hamming(400)
    if len(audio_data.shape) > 1:
        audio_data = audio_data[:, 0]
    for i in range(0, audio_data.shape[0] - 400, 240):
        trimmed = audio_data[i:i + 400]
        hammed = numpy.multiply(hamming_window, trimmed)
        mfcced = mfcc(hammed, samplerate, nfft=2048)
        mfcc_features.append(mfcced[0])
    delta_features = delta(mfcc_features, 1)
    return mfcc_features, delta_features
Example #18
    def graves_2013(self, wav_path):
        """
    Alex Graves, Abdel-rahman Mohamed, Geoffrey E. Hinton:
    Speech recognition with deep recurrent neural networks.
    ICASSP 2013: 6645-6649

    FBANK features : (40 fbank + 1 energy) * 3
    The audio data was encoded using a Fourier-transform-based filter-bank
    with 40 coefficients (plus energy) distributed on a mel-scale, together
    with their first and second temporal derivatives. Each input vector was
    therefore size 123.

    For CMVN,
    the data were normalised so that every element of the input vectors had
    zero mean and unit variance over the training set.

    The paper does not describe the window function, so I chose a Hanning
    window.

    I left the other options not mentioned in the paper (nfft, lowfreq,
    highfreq, ceplifter, etc.) at their defaults.

    :param wav_path: wav file path
    :return: a feature sequence
    """
        (rate, sig) = wav.read(Util.get_file_path(self.basepath, wav_path))
        # computing features
        fbank_feat, _ = \
          fbank(signal=sig, samplerate=rate, nfilt=40, winfunc=np.hanning)

        # adding energy
        energy = np.expand_dims(np.sum(np.power(fbank_feat, 2), axis=-1), 1)
        fbank_e_feat = np.concatenate((energy, fbank_feat), axis=-1)
        # concatenating delta vectors
        delta_feat = delta(fbank_e_feat, 1)
        delta_delta_feat = delta(fbank_e_feat, 2)
        return np.concatenate((fbank_e_feat, delta_feat, delta_delta_feat),
                              axis=1)
Example #19
def get_features(signal, rate, normalize):

    # extract features
    features = mfcc(signal, rate, winlen=0.020, winstep=0.010)

    # print('mfcc: ' , np.shape(features))

    delta = base.delta(features, 5)

    features = np.concatenate((features, delta), axis=-1)

    # print('delta: ', np.shape(features))

    # perform normalization if asked to
    # (note: this divides by the variance; conventional CMVN divides by the
    # standard deviation)
    if normalize:
        mean_features = np.mean(features, axis=0)
        var_features = np.var(features, axis=0)
        features = (features - mean_features) / var_features

    return features
Example #20
def extract_mfcc(wave_files, encoded_labels, files_destination, labels_destination, mfcc_type):
    labels_df = pd.DataFrame(columns=['file', 'label'])
    files_num = len(wave_files)

    for i, (wave_file, label) in enumerate(zip(wave_files, encoded_labels)):
        wave_file_name = wave_file.split('/')[-1]
        mfcc_file_path = files_destination + wave_file_name.split('.')[0] + '.npy'

        print('{}/{}\t{}'.format(i + 1, files_num, wave_file_name))
        wave_data, sample_rate = sf.read(wave_file)
        # save mfcc
        if mfcc_type == 'cnn':
            mfcc = librosa.feature.mfcc(wave_data, sr=sample_rate)
        elif mfcc_type == 'rnn':
            mfcc = base.mfcc(wave_data,
                             samplerate=sample_rate,
                             numcep=13,
                             winstep=0.01,
                             winfunc=np.hamming)
            deltas = base.delta(mfcc, 2)

            # normalize mfcc over all frames
            mfcc_mean = np.mean(mfcc, axis=0)
            mfcc_std = np.std(mfcc, axis=0)
            mfcc = (mfcc - mfcc_mean)/mfcc_std

            # normalize deltas over all frames
            delta_mean = np.mean(deltas, axis=0)
            delta_std = np.std(deltas, axis=0)
            deltas = (deltas - delta_mean)/delta_std

        if mfcc_type == 'rnn':
            # deltas are only computed in the rnn branch
            mfcc = np.concatenate((mfcc, deltas), axis=1)
        np.save(mfcc_file_path, mfcc, allow_pickle=False)

        labels_df.loc[i] = [wave_file_name, label]

    labels_df.to_csv(labels_destination,
                     sep='\t',
                     index=False)
Example #22
import os
import pickle
import numpy
import scipy.io.wavfile as wav
from python_speech_features import mfcc
from python_speech_features import base
from sklearn.mixture import GaussianMixture as GMM

recordings = 15
ramkowanie = 0.025
components = 4
delta_memory = 2
path = os.getcwd()
allMFCC = numpy.empty([1, recordings], dtype=object)
i = 0
for file in os.listdir(os.path.join(path, 'miriam')):
    print("1")
    fs, samples = wav.read(os.path.join(path, 'miriam', file))
    MFCC = mfcc(samples, fs, winlen=ramkowanie,
                nfft=round(ramkowanie * fs) + 1, numcep=13)
    delta = base.delta(MFCC, delta_memory)
    delta_delta = base.delta(delta, delta_memory)
    MFCC_and_deltas = numpy.c_[MFCC, delta, delta_delta]
    allMFCC[0, i] = MFCC_and_deltas
    i += 1
# the filename records the number of MFCC parameters the function was called with
#with open('mfcc_miriam.bin', 'wb') as f:
#    pickle.dump(allMFCC, f)
#    f.close()

model = GMM(components, covariance_type="diag")
tmp_data = numpy.zeros((0, 39))
for j in range(0, recordings):
    tmp_data = numpy.r_[tmp_data, allMFCC[0, j]]
model.fit(tmp_data)
with open('model.bin', 'wb') as f:
    pickle.dump(model, f)
Example #23
def extractMFCC(
	kNumFeatures,
	shape,				#only supports 1D features for now, will add 2D support next
	wavarr, 
	win_len, 
	win_overlap, 
	nfft, 
	cutoff=256,		#for dominant_frequency_features only 
	kDelta=False 	#frame-deltas (need more than 1 frame)
):
	rate   = wavarr[0]
	signal = wavarr[1]
	kNumScalars = 2

	#see 'extractFeaturesCodegen.m'
	mfcc_feat, _ = mymfcc.mfcc_features(wavarr, win_len=win_len, win_overlap=win_overlap, nfft=nfft, lowfreq=5, highfreq=1000)
	#print("  # mfcc frames %d, mfcc_feat[0].shape: %s" % (mfcc_feat.shape[0], str(mfcc_feat[0].shape)))
	nframes = mfcc_feat.shape[0]

	if shape=='1D':
		features = [None] * kNumScalars 	#np.zeros(shape=(kNumFeatures,), dtype=np.float)

	if shape=='2D':
		ndfeatures = np.zeros((nframes, kNumFeatures), dtype=np.float32)

	step_length = win_len - win_overlap
	offset = kNumScalars

	for f in range(nframes):	#this can be replaced by one np flattening
		mfcc = mfcc_feat[f]		#a frame

		current_start_sample = int(f * rate * step_length)	#cast to int for slicing
		current_end_sample = int(current_start_sample + win_len * rate)
		current_signal = signal[current_start_sample:current_end_sample]	#current window

		#[kurtosis, dominant_frequency_features]
		kurt = kurtosis(current_signal)
		#dom_nfft = 4096					#Matlab code is using nfft=4096
		#maxfreq, domfreq = dominant_frequency_features(current_signal[0:dom_nfft], rate, cutoff=cutoff, nfft=dom_nfft)

		if shape=='1D':		#only supports 1D features for now, will add 2D support next
			features[0] = kurt
			features[1] = 0 	#domfreq
			#features[2] = skew(current_signal)		#this led to a drop of > 4% accuracy
			features.extend(mfcc) 					#TODO: check for overflow here if we use ndarray for features
			offset += len(mfcc)

		if shape=='2D':
			fvec = [kurt, 0]
			fvec.extend(mfcc)
			ndfeatures[f] = fvec

	if kDelta and (nframes > 1):
		assert(False)		#TODO: fix this code
		d_mfcc_feat = base.delta(mfcc_feat, 2)		#compute delta features from a feature vector
		ndfeatures = np.append(features, d_mfcc_feat)

	if shape=='1D':		#only supports 1D features for now, will add 2D support next
		ndfeatures = np.zeros(kNumFeatures, dtype=np.float32)
		n = min(kNumFeatures, len(features))
		ndfeatures[0:n] = features[:n]

	return ndfeatures
Example #24
    [frames, energy] = raw_frames(prime_data, frame_shift, window_size)
    freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size)
    fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs)
    prime_features = get_mfcc(fbanks)

    prime_features = np.concatenate([energy[:, None], prime_features], 1)

    [frames, energy] = raw_frames(target_data, frame_shift, window_size)
    freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size)
    fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs)
    target_features = get_mfcc(fbanks)

    target_features = np.concatenate([energy[:, None], target_features], 1)

    single_delta = base.delta(prime_features, params['delta_n'])
    double_delta = base.delta(single_delta, params['delta_n'])
    prime_features = np.concatenate(
        [prime_features, single_delta, double_delta], 1)

    single_delta = base.delta(target_features, params['delta_n'])
    double_delta = base.delta(single_delta, params['delta_n'])
    target_features = np.concatenate(
        [target_features, single_delta, double_delta], 1)

    feature_shape = np.shape(prime_features)[1]
    prime_node = output_file.create_earray(feature_node,
                                           'prime',
                                           f_atom, (0, feature_shape),
                                           expectedrows=5000)
    prime_node.append(prime_features)
Example #25
import numpy as np
from pathlib import Path
from argparse import ArgumentParser
from python_speech_features.base import delta

parser = ArgumentParser()

parser.add_argument("--source", type=Path, required=True)
parser.add_argument("--output", type=Path)
parser.add_argument("--size", type=int, default=2)

args = parser.parse_args()

source = args.source
output = args.output or args.source.with_name(f"delta_{args.source.name}")
output.parent.mkdir(parents=True, exist_ok=True)

source_npz = np.load(source)

output_dict = {}
for key, value in source_npz.items():
    output_dict[key] = delta(value, args.size)

np.savez(output, **output_dict)
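Assuming the script above is saved as, say, make_deltas.py (a hypothetical name), running python make_deltas.py --source feats.npz --size 2 would write delta_feats.npz next to the source, containing the delta of every array in the input archive.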
Example #26
def audio_features(params, img_audio, audio_path, append_name, node_list):
    output_file = params['output_file']
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    count = 1
    # keep track of the nodes for which no features could be made, places
    # database contains some empty audio files
    invalid = []
    for node in node_list:
        print(f'processing file: {count}')
        count += 1
        # create a group for the desired feature type
        audio_node = output_file.create_group(node, params['feat'])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = img_audio[base_name][1]

        for cap in caption_files:
            # remove extension from the caption filename
            base_capt = cap.split('.')[0]
            # remove folder path from file names (Places/coco database)
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            if '-' in base_capt:
                base_capt = base_capt.replace('-', '_')
            # read audio samples
            try:
                input_data, fs = librosa.load(os.path.join(audio_path, cap),
                                              sr=None)
                # in the places database some of the audiofiles are empty
                if len(input_data) == 0:
                    break
            except:
                # try to repair broken files, some files had a wrong header.
                # In Places I found some that could not be fixed however
                try:
                    fix_wav(os.path.join(audio_path, cap))
                    #input_data = read(os.path.join(audio_path, cap))
                except:
                    # the loop will break, if no valid audio features could
                    # be made for this image, the entire node is deleted.
                    break
            # set the fft size to the power of two equal to or greater than
            # the window size.
            window_size = int(fs * params['t_window'])
            exp = 1
            while True:
                if np.power(2, exp) - window_size >= 0:
                    fft_size = np.power(2, exp)
                    break
                else:
                    exp += 1

###############################################################################
# create audio features
            if params['feat'] == 'raw':
                # calculate the needed frame shift, preemphasise and frame
                # the signal (use the preemphasised signal, not the raw input)
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                features = sigproc.framesig(input,
                                            frame_len=window_size,
                                            frame_step=frame_shift,
                                            winfunc=params['windowing'])

            elif params['feat'] == 'freq_spectrum':
                # calculate the needed frame shift, preemphasise and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                frames = sigproc.framesig(input,
                                          frame_len=window_size,
                                          frame_step=frame_shift,
                                          winfunc=params['windowing'])
                # create the power spectrum
                features = sigproc.powspec(frames, fft_size)

            elif params['feat'] == 'fbanks':
                # create mel filterbank features
                [features, energy] = base.fbank(input_data,
                                                samplerate=fs,
                                                winlen=params['t_window'],
                                                winstep=params['t_shift'],
                                                nfilt=params['nfilters'],
                                                nfft=fft_size,
                                                lowfreq=0,
                                                highfreq=None,
                                                preemph=params['alpha'],
                                                winfunc=params['windowing'])

            elif params['feat'] == 'mfcc':
                # create mfcc features
                features = base.mfcc(input_data,
                                     samplerate=fs,
                                     winlen=params['t_window'],
                                     winstep=params['t_shift'],
                                     numcep=params['ncep'],
                                     nfilt=params['nfilters'],
                                     nfft=fft_size,
                                     lowfreq=0,
                                     highfreq=None,
                                     preemph=params['alpha'],
                                     ceplifter=0,
                                     appendEnergy=params['use_energy'],
                                     winfunc=params['windowing'])

            # apply cepstral mean variance normalisation
            if params['normalise']:
                features = (features - features.mean(0)) / features.std(0)
            # optionally add the deltas and double deltas
            if params['use_deltas']:

                single_delta = base.delta(features, params['delta_n'])
                double_delta = base.delta(single_delta, params['delta_n'])
                features = np.concatenate(
                    [features, single_delta, double_delta], 1)
###############################################################################
# create new leaf node in the feature node for the current audio
# file
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom, (0, feature_shape),
                                                expectedrows=5000)

            # append new data to the tables
            f_table.append(features)
        if audio_node._f_list_nodes() == []:
            # keep track of all the invalid nodes for which no features could
            # be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no captions
            # features could be created
            output_file.remove_node(node, recursive=True)
    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')
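The params dict this function reads is not shown. Reconstructed from the keys used above, a plausible configuration for MFCC features might look like this (the key names come from the code; every value is an illustrative guess):

import tables
import numpy as np

output_file = tables.open_file('features.h5', mode='w')  # hypothetical path
params = {
    'output_file': output_file,   # open PyTables file features are written to
    'feat': 'mfcc',               # 'raw', 'freq_spectrum', 'fbanks' or 'mfcc'
    't_window': 0.025,            # window length in seconds
    't_shift': 0.010,             # window shift in seconds
    'alpha': 0.97,                # pre-emphasis coefficient
    'nfilters': 40,               # number of mel filters
    'ncep': 13,                   # number of cepstral coefficients (mfcc only)
    'use_energy': True,           # appendEnergy flag (mfcc only)
    'windowing': np.hamming,      # window function
    'normalise': True,            # per-utterance mean/variance normalisation
    'use_deltas': True,           # append deltas and double deltas
    'delta_n': 2,                 # half-window passed to base.delta
}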