Example #1
def load_data_shared(ind):
    # ind selects the feature set: 0 for mfcc, 1 for filterbank, 2 for both

    #Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')   
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    narr = np.array([13, 26, 39])  #Number of features in each frame
    i = 0
    j = 0
    
    trainfeature = np.zeros((len(timit_data_train), (datalen * 100 // fs - 1) * narr[ind]))  # // keeps the dimension an integer
    for x in timit_data_train:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            trainfeature[i,:] = mfcc_flat
        elif ind == 1:
            trainfeature[i,:] = fbank_flat
        else:
            trainfeature[i,:] = np.concatenate((mfcc_flat, fbank_flat))
        i = i+1
        
    testfeature = np.zeros((len(timit_data_test), (datalen * 100 // fs - 1) * narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            testfeature[j,:] = mfcc_flat
        elif ind == 1:
            testfeature[j,:] = fbank_flat
        else:
            testfeature[j,:] = np.concatenate((mfcc_flat, fbank_flat))
        j = j+1

    training_data = (trainfeature, timit_vwlname_train)
    test_data = (testfeature, timit_vwlname_test)

    # For now, I am using test data as validating data. Should change later.
    validation_data = test_data

    def shared(data):
        """Place the data into shared variables.  This allows Theano to copy
        the data to the GPU, if one is available.

        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")
    
    return [shared(training_data), shared(validation_data), shared(test_data)]
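A minimal consumption sketch for the loader above, following the usual Theano idiom of slicing shared variables inside a symbolic graph (the minibatch size and the symbolic index are illustrative assumptions, not part of the original project):

import theano.tensor as T

training_data, validation_data, test_data = load_data_shared(0)  # 0 selects the MFCC features
train_x, train_y = training_data

mini_batch_size = 10  # assumed value for illustration
i = T.lscalar()  # symbolic minibatch index
# Slicing the shared variables symbolically (typically inside `givens`)
# keeps each minibatch on the GPU instead of re-copying it from the host.
x_batch = train_x[i * mini_batch_size:(i + 1) * mini_batch_size]
y_batch = train_y[i * mini_batch_size:(i + 1) * mini_batch_size]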
Example #2
def svm_baseline():

    #### Change here
    ind = 0  # 0 for mfcc,  1 for filterbank,  2 for both
    narr = np.array([13, 26, 39])  # corresponding length of feature in a frame


    #Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')     
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    i = 0
    j = 0
    trainfeature = np.zeros((len(timit_data_train), (datalen * 100 // fs - 1) * narr[ind]))
    for x in timit_data_train:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            trainfeature[i,:] = mfcc_flat
        elif ind == 1:
            trainfeature[i,:] = fbank_flat
        else:
            trainfeature[i,:] = np.concatenate((mfcc_flat, fbank_flat))
        i = i+1
        
    testfeature = np.zeros((len(timit_data_test), (datalen * 100 // fs - 1) * narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            testfeature[j,:] = mfcc_flat
        elif ind == 1:
            testfeature[j,:] = fbank_flat
        else:
            testfeature[j,:] = np.concatenate((mfcc_flat, fbank_flat))
        j = j+1

    training_data = (list(trainfeature), timit_vwlname_train)
    test_data = (list(testfeature), timit_vwlname_test)


    # train
    clf = svm.SVC()
    clf.fit(training_data[0], training_data[1])
    # test
    predictions = [int(a) for a in clf.predict(test_data[0])]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Using svm_baseline classifier:"
    print "%s of %s values correct.  %s percent " % (num_correct, len(test_data[1]),
        (num_correct*100)/len(test_data[1]))
Example #3
File: main.py  Project: readml/olin-audioml
def getFeatures(signal, rate):
	"""
	Extracts Important Vocal Features

	author: chris
	"""
	if signal.shape[0] > mem_cut_off:
		mfcc,fbank = getFeatures(signal[mem_cut_off:], rate)
		return np.concatenate((fs.mfcc(signal[:mem_cut_off],rate),mfcc)), np.concatenate((fs.logfbank(signal[:mem_cut_off],rate),fbank))  # slice logfbank to the same chunk as mfcc
	else:
		return fs.mfcc(signal,rate), fs.logfbank(signal,rate)
Example #4
def getFeatures(signal, rate):
    """
	Extracts Important Vocal Features

	author: chris
	"""
    if signal.shape[0] > mem_cut_off:
        mfcc, fbank = getFeatures(signal[mem_cut_off:], rate)
        # slice logfbank to the same chunk as mfcc so the recursion lines up
        return (np.concatenate((fs.mfcc(signal[:mem_cut_off], rate), mfcc)),
                np.concatenate((fs.logfbank(signal[:mem_cut_off], rate), fbank)))
    else:
        return fs.mfcc(signal, rate), fs.logfbank(signal, rate)
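A hedged driver for the chunked extractor above; mem_cut_off, the fs module alias for python_speech_features, and the wav path are assumptions consistent with the snippet, not values from the original project:

import scipy.io.wavfile as wavfile
import python_speech_features as fs

mem_cut_off = 16000 * 60  # hypothetical: recurse one minute at a time at 16 kHz

rate, signal = wavfile.read('long_recording.wav')  # placeholder path
mfcc_feat, fbank_feat = getFeatures(signal, rate)
print(mfcc_feat.shape, fbank_feat.shape)  # (n_frames, 13) and (n_frames, 26) with the defaults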
Example #5
def cal_bic(wavfilename, sadfilename):
    sample_rate, wav = wavfile.read(wavfilename)
    mfcc_feat = features.mfcc(wav, sample_rate)
    ref = getsad_ref(sadfilename)
    an_win_mfcc = mfcc_cut_vad_an_win(mfcc_feat, ref)
    [time, bic_value] = bic(an_win_mfcc)
    return time, bic_value
Example #6
    def fill(self, class_id):
        """Fills internal structer with new training samples.
           Do not call directly.
        :param class_id: class identification
        """
        # get training samples
        for i in range(len(self.sel_files)):
            row = self.sel_files[0]
            samples = self.all_files[row].samples
            feat = mfcc(samples, 16000, winlen=0.030, appendEnergy=False, VAD=simpleVAD)
            # add two feature vectors from the middle
            self.X.append(feat[int(len(feat) / 2 - 1)])
            self.y.append(class_id)
            self.X.append(feat[int(len(feat) / 2 + 1)])
            self.y.append(class_id)
            
            # clear from the list
            del_iter = self.file_store.get_iter(Gtk.TreePath.new_from_indices([row]))
            self.file_store.remove(del_iter)
            del self.all_files[row]

        # print results
        if Classifier.new_training(self.X, self.y):
            self.status_label.set_text("{0} vzorků, {1} tříd".format(len(self.X), len(self.class_names)))
        else:
            self.status_label.set_text("Klasifikátor potřebuje víc tříd")
Example #7
def extract_mfcc_features(signal, win_len=0.0232, win_overlap=0.5, n_mel_bands=40, n_coefs=25, fs=48000, nfft=1024):
    """
    Return feature vector for a one channel signal

    Return same features as the one defined in the paper
    Salamon, J., Jacoby, C., & Bello, J. (2014). A Dataset and Taxonomy for Urban Sound Research. ACM International Conference on Multimedia, (3). doi:10.1145/2647868.2655045

    :param signal: one dimension array
    :param win_len: length of window to split the signal into
    :param win_overlap: overlap over window, 1 > win_overlap >= 0
    :param n_mel_bands: numbers of mel bands to use
    :param n_coefs: number of dct coefs to return
    :param fs: signal sampling rate
    :param nfft: FFT size
    :return: a dict of feature arrays
    """
    win_step = win_len * win_overlap  # 50% overlap with the default win_overlap
    features = {}
    res = mfcc(signal, samplerate=fs, winlen=win_len, winstep=win_step, nfilt=n_mel_bands, lowfreq=0,
               highfreq=22050, numcep=n_coefs, nfft=nfft)  ## TODO: revisit nfft.. I am not sure what it corresponds to for mel; the paper does not mention it.. above all, check that it works with nfft and fs...
    #print("fs {}, signal.shape {}".format(fs,signal.shape))
    #print(res.shape)
    features["minimum"] = np.min(res, axis=0)
    features["maximum"] = np.max(res, axis=0)
    features["median"] = np.median(res, axis=0)
    features["mean"] = np.mean(res, axis=0)
    features["variance"] = np.var(res, axis=0)
    features["skewness"] = scipy.stats.skew(res, axis=0)
    features["kurtosis"] = scipy.stats.kurtosis(res, axis=0)
    features["mean_first_diff"] = np.mean(np.diff(res, axis=0), axis=0)
    features["variance_first_diff"] = np.var(np.diff(res, axis=0), axis=0)
    features["mean_second_diff"] = np.mean(np.diff(res, axis=0, n=2), axis=0)
    features["var_second_diff"] = np.var(np.diff(res, axis=0, n=2), axis=0)
    return features
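A hedged usage sketch for extract_mfcc_features; the wav path is a placeholder, and flattening the statistics dict into a single vector is one plausible downstream step rather than part of the original code:

import numpy as np
import scipy.io.wavfile as wavfile

rate, signal = wavfile.read('urban_clip.wav')  # placeholder path
feats = extract_mfcc_features(signal, fs=rate)
# Fix an ordering over the statistic names and concatenate into one vector.
fvec = np.concatenate([feats[k] for k in sorted(feats)])
print(fvec.shape)  # 11 statistics x n_coefs coefficients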
Example #8
def compute_mfcc(sig, rate, winlen=0.025, winstep=0.01, numcep = 12, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, include_energy=True, snip_edges = True):
	
	if snip_edges:
		#snip the edges
		sig = snip(sig, rate, winlen, winstep)
	
	return mfcc(sig, rate, winlen, winstep, numcep, nfilt, nfft, lowfreq, highfreq, preemph, ceplifter, include_energy)
Example #9
def features_from_base(basepath, order=0):
    (females, males) = read_speakers(basepath)
    # list of list (sorted)
    female_utterances_list = [
        read_utterances(basepath, female) for female in females
    ]
    male_utterances_list = [read_utterances(basepath, male) for male in males]

    # utterances as Wave objects
    female_utterances_list = read_utterances_files(basepath,
                                                   female_utterances_list, 'f')
    male_utterances_list = read_utterances_files(basepath,
                                                 male_utterances_list, 'm')

    for utterances in female_utterances_list:
        for utterance in utterances:
            uttMFCCs = features.mfcc(utterance.signal,
                                     samplerate=utterance.sample_rate,
                                     numcep=19,
                                     highfreq=utterance.sample_rate / 2)
            if (order > 0):
                uttMFCCs = features.appendDeltasAllFrames(uttMFCCs, order)
            print(uttMFCCs.shape)
            print(uttMFCCs)

        print()
Example #10
def run_tests(test_files):
    # Classify input data
    for test_file in test_files:
        # Read input file
        sampling_freq, signal = wavfile.read(test_file)

        # Extract MFCC features
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            features_mfcc = mfcc(signal, sampling_freq)

        # Define variables
        max_score = -float('inf')
        predicted_label = None

        # Run the current feature vector through all the HMM
        # models and pick the one with the highest score
        for item in speech_models:
            model, label = item
            score = model.compute_score(features_mfcc)
            if score > max_score:
                max_score = score
                predicted_label = label

        # Print the predicted output 
        start_index = test_file.find('/') + 1
        end_index = test_file.rfind('/')
        original_label = test_file[start_index:end_index]
        print('\nOriginal: ', original_label) 
        print('Predicted:', predicted_label)
Example #11
def vector_quantize(myfiles, outdir, model): #given a list of files transform them to spectral vectors and compute the KMeans VQ
    for f in myfiles:
        print "Quantizing: ", f
        (rate, sig) = wav.read(f)
        #print rate, sig.shape
        #get the spectral vectors
        mfcc_feat = mfcc(sig,rate)
        #print mfcc_feat.shape
        fbank_feat = mfcc_feat #logfbank(sig,rate) #this has the spectral vectors now
        #print fbank_feat.shape
        val = model.predict(fbank_feat)
        #fcomps = os.path.split(f) #file components path, filename
        fcomps = f.split("/")
        fn = fcomps[-2]+"/"+fcomps[-1].split('.')[0] + '_vq.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        d = os.path.dirname(fn)
        if not os.path.exists(d):
                os.makedirs(d)
        #print fn

        #val = trim_background(val)
        #raw_input("enter...")
        
        out = open(fn, 'w')  # text mode; we write strings, and avoid shadowing f
        for v in val:
            out.write(str(v) + '\n')
        out.close()
        print('output vector quantized file:  ', fn, ' written')
    return
Example #12
def training():
    '''
    Takes input signal and searches current dataset for hit.
    If hit, then add to correct dataset.
    If miss, asks user for correct input and adds to dataset.
    '''

    print("please speak a word into the microphone")
    record_to_file('training.wav')
    print("done - result written to training.wav")

    (rate, sig) = wav.read("training.wav")

    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)

    recording = fbank_feat[1:3, :]

    testing = check_for_match(recording)

    verify = raw_input("did you say " + testing + " ")

    if verify is 'y':
        parse_array(recording, testing)

    if verify is 'n':
        correct_word = input("what word did you mean? ")
        print correct_word
        parse_array(recording, correct_word)
Example #13
def generate_testing_mfccs(myfiles, outdir):
    for f in myfiles:
        print("Generating MFCCs for: ", f)
        (rate, sig) = wav.read(f)
        #print rate, sig.shape
        #get the spectral vectors
        mfcc_feat = mfcc(sig, rate)
        #mfcc_feat = scaler.transform(mfcc_feat)
        #fcomps = os.path.split(f) #file components path, filename
        fcomps = f.split("/")
        fn = fcomps[-2] + "/" + fcomps[-1].split('.')[0] + '_mfcc.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        d = os.path.dirname(fn)
        if not os.path.exists(d):
            os.makedirs(d)
        out = open(fn, 'w')  # text mode; we write strings, not bytes
        final_mfccs_str = ""
        for vector in mfcc_feat:
            # one comma-separated line per frame
            str_mfcc = ",".join(str(element) for element in vector)
            final_mfccs_str += str_mfcc + "\n"
        out.write(final_mfccs_str)
        out.close()
        print('output MFCC file:  ', fn, ' written')
    return
Example #14
 def __init__(self, filename):
     self.filename = filename
     self.frequency, self.sound = wavfile.read(wavFilesPath + filename)
     self.channel1 = self.sound[:, 0]
     self.channel2 = self.sound[:, 1]
     self.duration = len(self.sound) / self.frequency
     self.mfccFeatures = mfcc(self.sound, self.frequency)
Example #15
def extractLow(signal):
    return mfcc(signal,
                samplerate=SAMPLING_RATE,
                winlen=LO_FRAME_DURATION,
                winstep=LO_FRAME_STEP,
                numcep=NUM_CEPTRUM,
                appendEnergy=True)
Example #16
def get_data(rootdir = TIMIT_main_dir):	
	inputs = []
	targets = []
	for dir_path, sub_dirs, files in os.walk(rootdir):
		for file in files:	        
			if (os.path.join(dir_path, file)).endswith('.wav'):
				wav_file_name = os.path.join(dir_path, file)
				input_data, f_s = sf.read(wav_file_name)
				# mfcc_feat = MFCC_input(mfcc(input_data,f_s))
				mfcc_feat = mfcc(input_data,f_s)
				#Delta features
				delta_feat = mfcc_feat[:-1]-mfcc_feat[1:]
				#Delta-Delta features
				deltadelta_feat = delta_feat[:-1]-delta_feat[1:]

				#Removing the first two frames
				mfcc_feat = mfcc_feat[2:]
				delta_feat = delta_feat[1:]

				#Concatenating mfcc, delta and delta-delta features
				full_input = np.concatenate((mfcc_feat,delta_feat,deltadelta_feat), axis=1)

				inputs.append(np.asarray(full_input, dtype=theano.config.floatX))#Rakeshvar wants one frame along each column but i am using Lasagne

				text_file_name = wav_file_name[:-4] + '.txt'
				target_data_file = open(text_file_name)
				target_data = str(target_data_file.read()).lower().translate(str.maketrans('', '', '!:,".;?'))
				# Python 2 version: .translate(None, '!:,".;?')
				target_data = target_data[8:-1]#No '.' in lexfree dictionary
				targets.append(target_data)
	return inputs, targets
Example #17
    def shifted_delta_cepstra(self, wav_fn, delta=1, shift=3, k_conc=3):
        """
        :param
            delta: represents the time advance and delay for the sdc
            k_conc: is the number of blocks whose delta coefficients are concd
            shift: is the time shift between consecutive blocks

        Shifted delta cepstra are feature vectors created by concatenating
        delta cepstra computed across multiple speech frames.
        See the paper
            PA Torres-Carrasquillo et al (2002)
            Approaches to language identification using
                Gaussian mixture models and Shifted delta cepstral features.
        """
        (rate, sig) = wav.read(wav_fn)
        mfcc_feats = mfcc(sig, rate)
        # mfcc_feats has 13 columns per frame (12 cepstral + 1 energy) by default
        # TODO include original cepstra as well?
        delta_feats = mfcc_feats[delta:] - mfcc_feats[:-delta]
        output_duration = delta_feats.shape[0] - shift * k_conc
        shifted = np.zeros(
            (output_duration, (k_conc + 1) * delta_feats.shape[1]))
        mfcc_dim = mfcc_feats.shape[1]
        shifted[:, 0:mfcc_dim] = mfcc_feats[:output_duration]
        for i in range(output_duration):
            shifted[i, mfcc_dim:] = delta_feats[
                i:i + k_conc * shift:shift, :].reshape((1, -1))
        logger.debug('{} --> {}'.format(mfcc_feats.shape, shifted.shape))
        return shifted
Example #18
def build_codebook(
        trgfile,
        codesize=32,
        fname=None
):  # given a training file constructs the codebook using kmeans
    #print "Codesize is ", codesize
    (rate, sig) = wav.read(trgfile)
    print(rate, sig.shape)
    #get the spectral vectors
    print("MFCC generation begins")
    mfcc_feat = mfcc(sig, rate)
    print("MFCC generation ends")
    print(mfcc_feat.shape)
    #sys.exit(0)  # debug exit; left enabled it made the rest of the function unreachable
    #print("Fbank creation begins")
    #fbank_feat = logfbank(sig,rate) #this has the spectral vectors now
    #print("Fbank creation ends")
    #print fbank_feat.shape
    print("codesize = ", codesize)
    km = KMeans(n_clusters=codesize)
    #km.fit(fbank_feat)
    km.fit(mfcc_feat)

    if fname is not None:
        pickle.dump(km, open(fname, 'wb'))
    return km
Example #19
def extractLow(signal):
        return mfcc(signal,
                    samplerate = SAMPLING_RATE,
                    winlen = LO_FRAME_DURATION,
                    winstep = LO_FRAME_STEP,
                    numcep = NUM_CEPTRUM,
                    appendEnergy = True)
Example #20
def main():
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: python ' + sys.argv[0] + ' lang_test.wav')
        sys.exit(1)
    file = sys.argv[1]

    languages = pickle.load(open('languages.dat', 'rb'))

    (rate, sig) = wav.read(file)  # returns (sample rate, numpy.ndarray of samples)
    mfcc_feat = mfcc(sig, rate)
    mfccs_deltas = recognizer_util.get_deltas(mfcc_feat, 0)
    mfccs_deltas_ddeltas = recognizer_util.get_deltas(mfccs_deltas, 13)
    test_avg = recognizer_util.col_avg(mfccs_deltas_ddeltas)

    results = {}

    for language in languages.keys():
        dist = get_distance(languages[language], test_avg)
        results[dist] = language

    # the language with the smallest distance wins
    language = results[min(results)]
    print()
    sys.stdout.write(language)
    print()
Example #21
    def shifted_delta_cepstra(self, wav_fn, delta=1, shift=3, k_conc=3):
        """
        :param
            delta: represents the time advance and delay for the sdc
            k_conc: is the number of blocks whose delta coefficients are concd
            shift: is the time shift between consecutive blocks

        Shifted delta cepstra are feature vectors created by concatenating
        delta cepstra computed across multiple speech frames.
        See the paper
            PA Torres-Carrasquillo et al (2002)
            Approaches to language identification using
                Gaussian mixture models and Shifted delta cepstral features.
        """
        (rate,sig) = wav.read(wav_fn)
        mfcc_feats = mfcc(sig,rate)
        # mfcc_feats has 13 columns per frame (12 cepstral + 1 energy) by default
        # TODO include original cepstra as well?
        delta_feats = mfcc_feats[delta:] - mfcc_feats[:-delta]
        output_duration = delta_feats.shape[0] - shift*k_conc
        shifted = np.zeros((output_duration,
                            (k_conc + 1) * delta_feats.shape[1]))
        mfcc_dim = mfcc_feats.shape[1]
        shifted[:,0:mfcc_dim] = mfcc_feats[:output_duration]
        for i in range(output_duration):
            shifted[i,mfcc_dim:] = delta_feats[i:i+k_conc*shift:shift,
                                               :].reshape((1,-1))
        logger.debug('{} --> {}'.format(mfcc_feats.shape, shifted.shape))
        return shifted
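For orientation, a hedged sketch of the shapes this produces; the extractor instance and the wav path are placeholders, and the coefficient count assumes python_speech_features' default of 13:

sdc = extractor.shifted_delta_cepstra('utterance.wav')  # `extractor` is hypothetical
# With delta=1, shift=3, k_conc=3 and 13-dimensional MFCCs, each row holds
# (k_conc + 1) * 13 = 52 values, and there are shift * k_conc = 9 fewer rows
# than in the delta matrix.
print(sdc.shape)  # (n_frames - 1 - 9, 52)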
Example #22
    def _gen_features(self, data_dir, outfile):
        """ Generates a csv file containing labeled lines for each speaker """

        with open(outfile, 'w') as ohandle:
            melwriter = csv.writer(ohandle)
            speakers = os.listdir(data_dir)

            for spkr_dir in speakers:
                for soundclip in os.listdir(os.path.join(data_dir, spkr_dir)):
                    # generate mel coefficients for the current clip
                    clip_path = os.path.abspath(
                        os.path.join(data_dir, spkr_dir, soundclip))
                    sample_rate, data = wavfile.read(clip_path)
                    ceps = mfcc(data, sample_rate)

                    # write an entry into the training file for the current speaker
                    # the vector to store in csv file contains the speaker's name at the end
                    fvec = self._mfcc_to_fvec(ceps)
                    fvec.append(spkr_dir)

                    logging.debug(
                        fvec)  # see the numbers [as if they make sense ]

                    # write one row to the csv file
                    melwriter.writerow(fvec)
Example #23
def create_mfcc(method, filename):
    """Perform standard preprocessing, as described by Alex Graves (2012)
	http://www.cs.toronto.edu/~graves/preprint.pdf
	Output consists of 12 MFCC and 1 energy, as well as the first derivative of these.
	[1 energy, 12 MFCC, 1 diff(energy), 12 diff(MFCC)]

	method is a dummy input!!"""

    (rate, sample) = wav.read(filename)

    mfcc = features.mfcc(sample,
                         rate,
                         winlen=0.025,
                         winstep=0.01,
                         numcep=13,
                         nfilt=26,
                         preemph=0.97,
                         appendEnergy=True)

    derivative = np.zeros(mfcc.shape)
    for i in range(1, mfcc.shape[0] - 1):
        derivative[i, :] = mfcc[i + 1, :] - mfcc[i - 1, :]

    out = np.concatenate((mfcc, derivative), axis=1)

    return out, out.shape[0]
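A brief usage sketch for create_mfcc; the paths are placeholders and, as the docstring says, the first argument is a dummy:

out, n_frames = create_mfcc('dummy', 'utterance.wav')  # placeholder inputs
# 13 static values per frame (1 energy + 12 MFCC) plus their 13 derivatives.
print(out.shape)  # (n_frames, 26)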
Example #24
def compute_features(filename):

    fs,audio_array = wav.read(filename)
    
    mfcc_25 = mfcc(audio_array,samplerate=fs,winlen=0.064,winstep=0.032,numcep=25,
               nfilt=40,nfft=512,lowfreq=0,highfreq=fs/2,preemph=0,
               ceplifter=0,appendEnergy=True)
    
    first = np.diff(mfcc_25, axis=0)
    second = np.diff(first, axis=0)
    
    minimum = np.amin(mfcc_25,axis=0)
    maximum = np.amax(mfcc_25,axis=0)
    median = np.median(mfcc_25,axis=0)
    mean = np.mean(mfcc_25,axis=0)
    variance = np.var(mfcc_25,axis=0)
    skewness = scipy.stats.skew(mfcc_25,axis=0)
    kurtosis = scipy.stats.kurtosis(mfcc_25,axis=0)
    first_mean = np.mean(first,axis=0)
    first_variance = np.var(first,axis=0)
    second_mean = np.mean(second,axis=0)
    second_variance = np.var(second,axis=0)
     
    features = np.concatenate((minimum, maximum, median, mean, variance, skewness, kurtosis, first_mean, first_variance, second_mean, second_variance), axis=0)
    
    return features
Example #25
def vector_quantize(
    myfiles, outdir, model
):  #given a list of files transform them to spectral vectors and compute the KMeans VQ
    for f in myfiles:
        print "Quantizing: ", f
        (rate, sig) = wav.read(f)
        #print rate, sig.shape
        #get the spectral vectors
        mfcc_feat = mfcc(sig, rate)
        #print mfcc_feat.shape
        fbank_feat = mfcc_feat  #logfbank(sig,rate) #this has the spectral vectors now
        #print fbank_feat.shape
        val = model.predict(fbank_feat)
        #fcomps = os.path.split(f) #file components path, filename
        fcomps = f.split("/")
        fn = fcomps[-2] + "/" + fcomps[-1].split('.')[0] + '_vq.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        d = os.path.dirname(fn)
        if not os.path.exists(d):
            os.makedirs(d)
        #print fn

        val = trim_background(val)
        #raw_input("enter...")

        out = open(fn, 'w')  # text mode; we write strings, and avoid shadowing f
        for v in val:
            out.write(str(v) + '\n')
        out.close()
        print('output vector quantized file:  ', fn, ' written')
    return
Example #26
 def feature_extract_mfcc(self, sound, rate):
     """
     extract every feature for training
     - frequency space: MFCC, pitch
     :return: 
     """
     reg = re.compile('(\d+)-(\d+)-(\d+).wav')
     #plotter.plot_frame(sound, show=True)
     mfcc0 = mfcc(sound.reshape(1, -1),
                  rate,
                  winlen=cfg.frame,
                  winstep=cfg.step,
                  nfft=1536,
                  winfunc=np.hamming)
     mfcc0 = mfcc0 - np.mean(mfcc0)
     mfcc1 = delta(mfcc0, 3)
     mfcc2 = delta(mfcc1, 3)
     mfcc0 = scale(mfcc0)
     '''
     if audio in ['01','00']:
         print(filename)
         plotter.plot_mfcc(mfcc0,'311')
         plotter.plot_mfcc(mfcc1,'312')
         plotter.plot_mfcc(mfcc2,'313')
         plotter.show()
     '''
     return (mfcc0, mfcc1, mfcc2), min(len(mfcc0), 200)
Example #27
def compare(control_path, exp_path):
    """
    Compares two wav files and returns a score. Uses mel frequency cepstrum coefficients as well as dynamic time warping.

    :param control_path: the 'correct' wav - what you are comparing to
    :param exp_path: the unknown wav
    """
    (rate, sig) = wavread(control_path)
    (rate2, sig2) = wavread(exp_path)

    x = mfcc(sig, rate)
    y = mfcc(sig2, rate2)

    dist, cost, acc = dtw.dtw(x, y, dist=lambda x, y: dtw.norm(x - y, ord=1))

    return dist
Example #28
def compare(control_path, exp_path):
    """
    Compares two wav files and returns a score. Uses mel frequency cepstrum coefficients as well as dynamic time warping.

    :param control_path: the 'correct' wav - what you are comparing to
    :param exp_path: the unknown wav
    """
    (rate,sig) = wavread(control_path)
    (rate2,sig2) = wavread(exp_path)

    x = mfcc(sig,rate)
    y = mfcc(sig2,rate2)

    dist, cost, acc = dtw.dtw(x, y, dist=lambda x, y: dtw.norm(x - y, ord=1))

    return dist
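A hedged usage sketch for compare; the paths are placeholders, and a smaller DTW distance means the unknown recording is closer to the reference:

score = compare('reference.wav', 'attempt.wav')  # placeholder paths
print('DTW distance: %f' % score)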
Example #29
def get_data(rootdir=TIMIT_main_dir):
    inputs = []
    targets = []
    for dir_path, sub_dirs, files in os.walk(rootdir):
        for file in files:
            if (os.path.join(dir_path, file)).endswith('.wav'):
                wav_file_name = os.path.join(dir_path, file)
                input_data, f_s = sf.read(wav_file_name)
                # mfcc_feat = MFCC_input(mfcc(input_data,f_s))
                mfcc_feat = mfcc(input_data, f_s)
                #Delta features
                delta_feat = mfcc_feat[:-1] - mfcc_feat[1:]
                #Delta-Delta features
                deltadelta_feat = delta_feat[:-1] - delta_feat[1:]

                #Removing the first two frames
                mfcc_feat = mfcc_feat[2:]
                delta_feat = delta_feat[1:]

                #Concatenating mfcc, delta and delta-delta features
                full_input = np.concatenate(
                    (mfcc_feat, delta_feat, deltadelta_feat), axis=1)

                inputs.append(
                    np.asarray(full_input, dtype=theano.config.floatX)
                )  #Rakeshvar wants one frame along each column but i am using Lasagne

                text_file_name = wav_file_name[:-4] + '.txt'
                target_data_file = open(text_file_name)
                target_data = str(target_data_file.read()).lower().translate(
                    str.maketrans('', '', '!:,".;?'))
                # Python 2 version: .translate(None, '!:,".;?')
                target_data = target_data[8:-1]  #No '.' in lexfree dictionary
                targets.append(target_data)
    return inputs, targets
Example #30
def read_data(files_amount, total_length, nc = 13, path=''):
	for i in range(files_amount):
		(rate,sig) = wav.read(path + str(i) + '.wav')
		mfcc_feat = mfcc(sig,rate, numcep = nc)
		mfcc_feat = np.reshape(mfcc_feat, (len(mfcc_feat)*nc, 1))

		if np.isnan(mfcc_feat).any() or np.isinf(mfcc_feat).any():
			ind = [x for x in range(len(mfcc_feat)) if np.isnan(mfcc_feat[x]) or np.isinf(mfcc_feat[x])]
			for x in ind:
				mfcc_feat[x] = 0

		if i == 0:
			mfcc_data = mfcc_feat

		if i != 0:
			if len(mfcc_feat) == total_length:
				mfcc_data = np.hstack((mfcc_data, mfcc_feat))
			else:
				if len(mfcc_feat) > total_length:
					mfcc_data = np.hstack((mfcc_data, mfcc_feat[:(total_length)]))
				else:
					xx = np.vstack((mfcc_feat,np.reshape(np.asarray([0] * (total_length-len(mfcc_feat))), (total_length-len(mfcc_feat),1) )))
					mfcc_data = np.hstack((mfcc_data, xx))

	return mfcc_data
Example #31
def crossover(playlist_1, playlist_2, playlist_size):
	#Crosses over playlists
	global all_playlistsl
	#print "Playlist_size: ", playlist_size
	one = all_playlists[playlist_1]
	two = all_playlists[playlist_2]

	child = Playlist(playlist_size)

	one_percentage = (one.fitness / float(one.fitness + two.fitness))
	#print "One %: ", one_percentage
	one_genes = int(floor(playlist_size * one_percentage))
	print "One genes: ", one_genes

	one_copy = copy.deepcopy(one)
	two_copy = copy.deepcopy(two)

	#Get genes from first parent
	for i in range(one_genes):
		#if len(one_copy.songs) <= 1:
		#	two_genes+=1
		#	break
		all_songs.append(file)

	woteva = mfcc(sig, rate)
	woteva = reduce_matrix(woteva, 500)
	return woteva
Example #32
def crossover(playlist_1, playlist_2, playlist_size):
    #Crosses over playlists
    global all_playlistsl
    #print "Playlist_size: ", playlist_size
    one = all_playlists[playlist_1]
    two = all_playlists[playlist_2]

    child = Playlist(playlist_size)

    one_percentage = (one.fitness / float(one.fitness + two.fitness))
    #print "One %: ", one_percentage
    one_genes = int(floor(playlist_size * one_percentage))
    print "One genes: ", one_genes

    one_copy = copy.deepcopy(one)
    two_copy = copy.deepcopy(two)

    #Get genes from first parent
    for i in range(one_genes):
        #if len(one_copy.songs) <= 1:
        #	two_genes+=1
        #	break
        all_songs.append(file)

    woteva = mfcc(sig, rate)
    woteva = reduce_matrix(woteva, 500)
    return woteva
Example #33
def make_mean_mfcc(filename):
    try:
        (rate, sig) = wav.read(filename)
        mfcc_feat = mfcc(sig, rate)
        avg_mfcc = np.mean(mfcc_feat, axis = 0)
        return avg_mfcc
    except Exception:  # unreadable or malformed wav: skip it
        pass
Example #34
 def predict(self, signal, fs = 44100):
     if len(signal.shape) > 1:
         signal = signal[:, 0]
     signal_new = remove_silence(fs, signal)
     # if len(signal_new) < len(signal) / 4:
     #     return "Silence"
     mfcc_vecs = mfcc(signal_new, fs, numcep = 15)
     return self.predict_feat(mfcc_vecs)
Example #35
def readSegFeat(start_t, end_t, signal, sr):
    try:
        sig = signal[int(sr * start_t):int(sr * end_t)]
    except:
        sig = signal[int(sr * start_t):-1]
    cleansig = remove_silence(sr, sig)
    mfcc_vecs = mfcc(cleansig, sr, numcep=15)
    return mfcc_vecs
Example #36
def extract_feats(signal, sr):
    feats = mfcc(signal, sr)
    #fbank_feat = logfbank(signal, sr, nfilt = 17)
    #feats = np.hstack((mfcc_feat, fbank_feat))
    mu = np.mean(feats, axis = 0)
    sigma = np.std(feats, axis = 0)
    feature = (feats - mu) / sigma
    return feature
Example #37
 def predict(self, signal, fs=44100):
     if len(signal.shape) > 1:
         signal = signal[:, 0]
     signal_new = remove_silence(fs, signal)
     # if len(signal_new) < len(signal) / 4:
     #     return "Silence"
     mfcc_vecs = mfcc(signal_new, fs, numcep=15)
     return self.predict_feat(mfcc_vecs)
Example #38
def readSegFeat(start_t, end_t, signal, sr):
    try:
        sig = signal[int(sr * start_t) : int(sr * end_t)] 
    except:
        sig = signal[int(sr * start_t) : -1]
    cleansig = remove_silence(sr, sig)
    mfcc_vecs = mfcc(cleansig, sr, numcep = 15) 
    return mfcc_vecs
Example #39
def addMFCC(data_dict):
    for name in data_dict:
        data_dict[name]['mfcc'] = []
        audio_path = data_dict[name]['raw']
        sr, WAV = wav.read(audio_path)
        MFCC = mfcc(WAV, sr)          
        data_dict[name]['mfcc'].append(MFCC)
    return data_dict
Example #40
def extract_mfcc():

    #find all .wav files in the audio directory
    audioDir = "audio/"
    if not os.path.exists(audioDir):
        os.makedirs(audioDir)
        print("The audio directory did not exist!")
    audioFiles = []
    for file in os.listdir(audioDir):
        if fnmatch.fnmatch(file, '*.wav'):
            audioFiles.append(file)

    #report the number of .wav files found in the audio directory
    print("")
    print("Found %d audio recordings in the %s directory!" % (len(audioFiles),
                                                              audioDir))
    print("")

    #loop over all .wav files inside the audio directory
    for x in range(0, len(audioFiles)):

        #names for the .wav recording, the .png graph, the .txt float dump of the recording, and the directory that stores everything
        filename = audioFiles[x]
        floatFile = filename.split(".")[0] + ".txt"
        directory = audioDir + filename.split(".")[0]

        #check whether the file or directories already exist
        if not os.path.isfile(audioDir + filename):
            sys.exit("File does not exist!")
        if not os.path.exists(directory):
            os.makedirs(directory)
        if os.path.exists(directory):
            shutil.rmtree(directory)
            os.makedirs(directory)

        #read the wav file and compute its MFCCs
        (rate, sig) = wav.read(audioDir + filename)
        mfcc_feat = mfcc(sig,
                         rate,
                         winlen=0.025,
                         winstep=0.01,
                         numcep=13,
                         preemph=0.99)

        #print the name of each processed .wav file
        print("MFCC features computed for %s!" % filename)

        #save the signal as floats and the MFCC features into .txt files
        np.savetxt(directory + "/" + floatFile, sig, fmt="%.4f")
        np.savetxt(directory + "/mfcc_features.txt",
                   mfcc_feat,
                   fmt="%.16f",
                   delimiter=",")

    #report the number of .wav files we extracted MFCCs from
    print("A total of %d MFCC feature sets computed!" % len(audioFiles))

    return 1
Example #41
def feature_extract(wav_name, winlen=0.025, winstep=0.01):
    """This function returns (mfcc) feature vectors extracted from wav_name"""
    rate, signal = wav.read(wav_name)
    signal = numpy.sum(signal, axis=1) / signal.shape[1]
    signal = sigproc.framesig(signal, rate * winlen, rate * winstep)
    signal = vad.vad_filter(signal)
    signal = sigproc.deframesig(signal, 0, rate * winlen, rate * winstep)
    mfcc_feat = mfcc(signal, rate)
    return mfcc_feat
Example #42
    def source_save(self):
        for key, value in self.train_file_map.items():
            wav = wave.open(key)
            rate = wav.getframerate()

            #Binary file needs to be munged because it is 2 channel
            #encoding is 24bit signed integer Pulse Code Modulation (PCM)
            #with a 44.1kHz sampling

            #The initial way I was hoping to munge the raw byte data
            #nframes = wav.getnframes()
            #buf = wav.readframes(nframes)

            #data is 24 bits in 3 bytes.  np.int24 does not exist!
            #dt = np.dtype(np.int24)
            #data is in little endian format
            #dt = dt.newbyteorder('<')
            #sig = np.frombuffer(buf,dtype=dt)

            #numpy doesn't support int24 yet so had to use this:
            #http://stackoverflow.com/questions/12080279/how-do-i-create-a-numpy-dtype-that-includes-24-bit-integers

            rawdatamap = np.memmap(key, dtype=np.dtype('u1'), mode='r')
            usablebytes = rawdatamap.shape[0] - rawdatamap.shape[0] % 12
            frames = int(usablebytes / 12)
            rawbytes = rawdatamap[:usablebytes]

            #This line is the difficult part which required stackoverflow, it makes the data into 32bit data,
            #but because it is actually 24bit data there is included redundant data in the first byte.
            realdata = as_strided(rawbytes.view(np.int32),
                                  strides=(
                                      12,
                                      3,
                                  ),
                                  shape=(frames, 2))

            #This ANDs the bits by a byte mask of the last 24bits, to get rid of the redundant data
            sig = realdata & 0x00ffffff

            #mfcc is mel frequency cepstral coefficent
            #http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
            #mfcc_feat needs to be stored in MongoDB, it is a numpy array that is 5999 in length
            #Each Audio file is a scene which is being classified, one of feature vectors
            #used to classfy the scene is
            #the mfcc_feat array

            #mfcc will return an array of that is 5999 rows by 13 columns
            #Each column is a feature vector for training the classifier for that audio sample's
            #class (i.e. tram, park)
            #The window length for analysis is 0.025 seconds,
            #the window step between windows is 0.01 seconds.
            #This is the entire array of feature vectors for each audio sample.
            #Additional feature vectors might be added later but this is good for inital tests.
            mfcc_feat = mfcc(sig, samplerate=rate)

            #Insert records into mongodb
            self.insert_mongo(self.mfcc_fv, mfcc_feat, key, value)
Example #43
def _extract_mfcc(filename):
    """Extracts mfccs from wav files"""
    savename = filename[0:len(filename) - 4] + '.mfc'
    samp_rate, X = read(filename)
    # ceps, mspec, spec = mfcc(X)
    ceps = feat.mfcc(X, samp_rate)
    num_ceps = ceps.shape[0]
    x = np.mean(ceps[int(num_ceps * 1 / 10):int(num_ceps * 9 / 10)], axis=0)
    np.save(savename, x)
Example #44
def start(seed):
    #Prepare global variables and the seed_mfcc
    (rate, sig) = wav.read(seed)
    for file in glob.glob("*.wav"):
        all_songs.append(file)

    woteva = mfcc(sig, rate)
    woteva = reduce_matrix(woteva, 500)
    return woteva
Example #45
def learn(wav_filename, old_data=None):
    rate, signal = wav.read(wav_filename, 'r')
    mfcc_feat = mfcc(signal, rate)
    if old_data is not None:
        mfcc_feat = np.concatenate((mfcc_feat, old_data))

    gmm = mixture.GMM(GMM_CLUSTERS)
    gmm.fit(mfcc_feat)
    return gmm, mfcc_feat
Example #46
def plot_bic(wavfilename, sadfilename):
    sample_rate, wav = wavfile.read(wavfilename)
    mfcc_feat = features.mfcc(wav, sample_rate)
    ref = getsad_ref(sadfilename)
    an_win_mfcc = mfcc_cut_vad_an_win(mfcc_feat, ref)
    [time, bic_value] = bic(an_win_mfcc)
    pyplot.plot(time, bic_value)
    pyplot.scatter(time, bic_value)
    pyplot.show()
Example #47
def start(seed):
	#Prepare global variables and the seed_mfcc
	(rate, sig) = wav.read(seed)
	for file in glob.glob("*.wav"):
		all_songs.append(file)

	woteva = mfcc(sig, rate)
	woteva = reduce_matrix(woteva, 500)
	return woteva
Example #48
def generate_speech(addr):
    try:
        (rate,sig) = wav.read(addr)
        #plot_graf(sig, rate)
        mfcc_feat = mfcc(sig,rate, highfreq=4000, numcep=20)
        return lbg.generate_codebook(mfcc_feat, 16)[0]
    except ValueError:
        print("ValueErroe: Not a WAV file \nExit.")
        return -1
Example #49
 def get_binned_features(self):
     binned_features = {}
     for i in range(len(self.binned_signals)):
         binned_features[i] = features.mfcc(self.binned_signals[i],
                                            self.sample_rate,
                                            winlen=self.feature_winlen,
                                            numcep=self.feature_numcep,
                                            nfilt=self.feature_nfilt)
     return binned_features
Example #50
def get_MFCC_feature(sig, rate):
    p_array = mfcc(sig, rate, winlen=0.025, winstep=0.01)  # get the mel-frequency cepstral coefficients
    col_num = p_array.shape[1]
    feature_array = []
    for ii in range(col_num):
        #  test 1: take the standard deviation of one dimension
        # feature_array.append(np.std(p_array[:,ii]))
        feature_array.append(p_array[:, ii])
    return feature_array
Example #51
def feature_extract(wav_name, winlen=0.025, winstep=0.01):
    """This function returns (mfcc) feature vectors extracted from wav_name"""
    rate, signal = wav.read(wav_name)
    signal = numpy.sum(signal, axis=1)/signal.shape[1]
    signal = sigproc.framesig(signal, rate*winlen, rate*winstep)
    signal = vad.vad_filter(signal)
    signal = sigproc.deframesig(signal, 0, rate*winlen, rate*winstep)
    mfcc_feat = mfcc(signal, rate)
    return mfcc_feat
Example #52
def MFCC(data, samp):

    mfcc_feat = mfcc(data, samp)
    mMin = mfcc_feat.min()
    mMax = mfcc_feat.max()
    mfcc_feat -= mMin
    mfcc_feat *= 255 / mfcc_feat.max()
    outImg = np.array(mfcc_feat, np.uint8)
    return outImg
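A small sketch of one way to persist the 8-bit image this returns; Pillow and the paths are assumptions, since the original snippet does not show how the image is used:

import scipy.io.wavfile as wav
from PIL import Image  # Pillow, assumed for illustration

rate, data = wav.read('clip.wav')  # placeholder path
img = Image.fromarray(MFCC(data, rate))  # grayscale image, frames x coefficients
img.save('mfcc.png')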
Example #53
    def generate(self,testsample):
        (rate,audio) = wav.read(testsample.path)

        # grab first channel
        one_channel = _extract_single_channel(audio)
        N = len(audio)
        mfcc_feat = mfcc(one_channel,rate)
        cols=mfcc_feat.shape[0]*mfcc_feat.shape[1]
        return mfcc_feat.reshape((1,cols))[0]
Example #54
def MFCC(data, samp):

    mfcc_feat = mfcc(data,samp)
    mMin = mfcc_feat.min()
    mMax = mfcc_feat.max()
    mfcc_feat -= mMin
    mfcc_feat *= 255/mfcc_feat.max()
    outImg = np.array(mfcc_feat, np.uint8)
    return outImg
Example #55
def build_models(input_folder):
    # Initialize the variable to store all the models
    speech_models = []

    # Parse the input directory
    for dirname in os.listdir(input_folder):
        # Get the name of the subfolder
        subfolder = os.path.join(input_folder, dirname)

        if not os.path.isdir(subfolder):
            continue

        # Extract the label
        label = subfolder[subfolder.rfind('/') + 1:]

        # Initialize the variable to store the training data
        X = np.array([])

        # Create a list of files to be used for training
        # We will leave one file per folder for testing
        training_files = [
            x for x in os.listdir(subfolder) if x.endswith('.wav')
        ][:-1]

        # Iterate through the training files and build the models
        for filename in training_files:
            # Extract the current file path
            filepath = os.path.join(subfolder, filename)

            # Read the audio signal from the input file
            sampling_freq, signal = wavfile.read(filepath)

            # Extract the MFCC features
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                features_mfcc = mfcc(signal, sampling_freq)

            # Append to the variable X
            if len(X) == 0:
                X = features_mfcc
            else:
                X = np.append(X, features_mfcc, axis=0)

        # Create the HMM model
        model = ModelHMM()

        # Train the HMM model
        model.train(X)

        # Save the model for the current word
        speech_models.append((model, label))

        # Reset the variable
        model = None

    return speech_models
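Examples #10 and #55 appear to be two halves of the same HMM word-recognition script; a hedged sketch of wiring them together, assuming a 'data' folder with one subfolder of wav files per word:

import os

speech_models = build_models('data')  # one trained HMM per word subfolder

# Hold out the last wav in each folder for testing, mirroring build_models' [:-1].
test_files = []
for dirname in os.listdir('data'):
    subfolder = os.path.join('data', dirname)
    if os.path.isdir(subfolder):
        wavs = [x for x in os.listdir(subfolder) if x.endswith('.wav')]
        if wavs:
            test_files.append(os.path.join(subfolder, wavs[-1]))

run_tests(test_files)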
Example #56
    def predict(self, soundclip):
        """ Recognizes the speaker in the sound clip. """

        sample_rate, data = wavfile.read(os.path.abspath(soundclip))
        ceps = mfcc(data, sample_rate)
        fvec = self._mfcc_to_fvec(ceps)

        speaker_id = self.recognizer.predict(fvec)[0]

        return self.spkr_iton[speaker_id]