def __init__(self, samplingFrequency=8000, framePeriod=25e-3, hopPeriod=10e-3, trainDir="./train_audio/", thresh=1.2):
    """Build reference MFCC templates from the training .wav files.

    Reads every ``.wav`` file in ``trainDir``, frames it, computes a
    39-dimensional MFCC matrix per file, and derives a DTW acceptance
    threshold from the mean pairwise DTW distance among the templates.

    :param samplingFrequency: expected sample rate of the audio, Hz
    :param framePeriod: analysis frame length, seconds
    :param hopPeriod: hop between successive frames, seconds
    :param trainDir: directory containing the training utterances
    :param thresh: multiplier applied to the mean pairwise distance
    """
    self.samplingFrequency = samplingFrequency
    self.framePeriod = framePeriod
    self.hopPeriod = hopPeriod
    self.trainDir = trainDir
    self.hopLength = int(samplingFrequency * hopPeriod)
    self.frameLength = int(samplingFrequency * framePeriod)
    self.referenceMFCC = []
    for file_name in os.listdir(trainDir):
        if not file_name.endswith(".wav"):
            continue
        (fs, data) = wv.read(os.path.join(trainDir, file_name))
        # NOTE(review): under Python 2, frameLength / hopLength is integer
        # division, so np.ceil is applied to an already-floored value —
        # confirm whether float division was intended before changing.
        num_frames = int(data.shape[0] / self.hopLength) - int(np.ceil(self.frameLength / self.hopLength))
        # BUG FIX: the original allocated np.empty([39, abs(num_frames)])
        # and then looped over range(num_frames); for num_frames <= 0 the
        # loop wrote nothing, so a matrix of uninitialized memory was
        # appended to the reference set. Skip too-short files instead.
        if num_frames <= 0:
            continue
        MFCC_calculator = mfcc.MFCC()
        MFCC_MATRIX = np.empty([39, num_frames])
        for k in range(num_frames):
            MFCC_MATRIX[:, k] = MFCC_calculator.compute_mfcc(
                data[k * self.hopLength: k * self.hopLength + self.frameLength])
        self.referenceMFCC.append(MFCC_MATRIX)
    DTW_calculator = dtw.DTW()
    # All ordered pairs, including each template against itself (those
    # self-distances pull the mean down; kept to preserve behavior).
    distance_list = [DTW_calculator.compute_distance(np.transpose(matrix), np.transpose(matrix2))
                     for matrix in self.referenceMFCC
                     for matrix2 in self.referenceMFCC]
    self.thresh = np.mean(np.array(distance_list)) * thresh
def distance(self, fileName):
    """Compute the DTW distance between a test utterance and every
    training template in ``self.referenceMFCC``.

    :param fileName: path to the test .wav file, or a raw byte buffer of
        float64 samples
    :type fileName: str or bytes/bytearray
    :returns: list of DTW distances (one per training template), or the
        sentinel value ``10000`` when the audio is too short to frame
        (kept as-is for caller compatibility)
    :rtype: list or int
    """
    if isinstance(fileName, (bytes, bytearray)):
        # BUG FIX: np.fromstring is deprecated (removed in modern NumPy);
        # np.frombuffer is the supported equivalent for raw byte input.
        data = np.frombuffer(fileName)
    else:
        (fs, data) = wv.read(fileName)
    DTW_calculator = dtw.DTW()
    num_frames = int(data.shape[0] / self.hopLength) - int(np.ceil(self.frameLength / self.hopLength))
    if num_frames <= 0:
        return 10000
    MFCC_calculator = mfcc.MFCC()
    # FIX: the original wrapped np.empty in abs(), taking the absolute
    # value of uninitialized memory — pointless, since every column is
    # overwritten in the loop below.
    MFCC_MATRIX = np.empty([39, num_frames])
    for k in range(num_frames):
        MFCC_MATRIX[:, k] = MFCC_calculator.compute_mfcc(
            data[k * self.hopLength: k * self.hopLength + self.frameLength])
    test_frames = np.transpose(MFCC_MATRIX)
    return [DTW_calculator.compute_distance(np.transpose(matrix), test_frames)
            for matrix in self.referenceMFCC]
def main(args):
    """Extract MFCC+delta features for each word segment listed in
    ``args.wordlist`` and store them in an HDF5 file (``args.mfccs``),
    writing the word-to-index key to ``args.wordkey``.

    Each wordlist line is: ``word sph_path start_sample end_sample``.
    Python 2 only (print statements, dict.has_key).
    """
    words = []
    wordstoindex = {}  # word -> integer class id, in first-seen order
    mfcc_convert = mfcc.MFCC()
    mfcc_savepath_prefix = os.path.splitext(args.mfccs)[0]  # NOTE(review): unused below
    num_frames = 0
    h5file = h5py.File(args.mfccs, "w")
    for line_id, line in enumerate(open(args.wordlist, "r").readlines()):
        word, path, start, end = line.strip().split()
        # Assign a new class id the first time a word is seen.
        if not wordstoindex.has_key(word):
            wordstoindex[word] = len(words)
            words.append(word)
        class_id = wordstoindex[word]
        start, end = int(start), int(end)
        # Convert the SPHERE file to a 16 kHz WAV via sox.
        # NOTE(review): shell=True with an interpolated path is unsafe if
        # paths are untrusted; a list argv with shell=False would be safer.
        command = ("sox -t sph {0} -r 16000 -t wav {1}"
                   ).format(path, "tmp.wav")
        subprocess.call(command, shell=True)
        sample_rate, samples = wavfile.read("tmp.wav")
        samples = samples.astype(float)
        # Cepstral mean/variance normalization statistics from the whole file.
        full_mfccs = mfcc_convert.sig2s2mfc(samples)
        # NOTE(review): `cutoff` is the whole sorted C0 column, so the mask
        # below compares each frame's C0 against a *different* sorted value
        # elementwise — this looks like it was meant to be a scalar
        # percentile threshold; confirm intent before relying on it.
        cutoff = numpy.sort(full_mfccs[:, 0])
        speech_areas = full_mfccs[full_mfccs[:, 0] > cutoff]
        cepstral_mean = speech_areas.mean(axis=0)
        cepstral_variance = speech_areas.std(axis=0)
        # Pad the word segment by 480 samples on each side
        # (presumably 30 ms of context at 16 kHz — confirm).
        word_samples = samples[start - 480: end + 480]
        word_mfccs = ((mfcc_convert.sig2s2mfc(word_samples) - cepstral_mean)
                      / cepstral_variance)
        n_frames, n_dims = word_mfccs.shape
        # Layout: [static | delta | delta-delta] -> n_dims * 3 columns (39).
        full_mfccs = numpy.empty((n_frames, n_dims * 3), dtype=numpy.float32)
        full_mfccs[:, :n_dims] = word_mfccs
        mfcc.deltas(word_mfccs, output_frames=full_mfccs[:, n_dims:2*n_dims])
        mfcc.deltas(full_mfccs[:, n_dims:2*n_dims], output_frames=full_mfccs[:, 2*n_dims:])
        if line_id == 0:
            # Create resizable datasets on the first line; both are grown by
            # doubling below and trimmed to their true size after the loop.
            mfcc_dset = h5file.create_dataset("mfccs", (n_frames, 39),
                                              maxshape=(None, 39),
                                              dtype=numpy.float32)
            label_dset = h5file.create_dataset("labels", (100, 3),
                                               maxshape=(None, 3),
                                               dtype=numpy.int32)
            data_idx = 0  # next row in label_dset
            cur_idx = 0   # next row in mfcc_dset
        # Amortized growth: double the dataset when the next write would overflow.
        if cur_idx + n_frames >= len(mfcc_dset):
            h5file.flush()
            mfcc_dset.resize((2*len(mfcc_dset), mfcc_dset.shape[1]))
            print "mfcc doubling", cur_idx
        if data_idx == len(label_dset):
            h5file.flush()
            label_dset.resize((2*label_dset.shape[0], label_dset.shape[1]))
        try:
            mfcc_dset[cur_idx:cur_idx+n_frames] = full_mfccs
        except:
            # NOTE(review): debugging leftover — bare except dropping into
            # pdb on any write failure; should be removed or narrowed.
            import pdb; pdb.set_trace()
        # Label row: (class id, offset of this word's frames, frame count).
        label_dset[data_idx] = class_id, cur_idx, n_frames
        cur_idx += n_frames
        data_idx += 1
    # Trim both datasets from their doubled capacity to the rows actually written.
    mfcc_dset.resize((cur_idx,
                      mfcc_dset.shape[1]))
    label_dset.resize((data_idx, label_dset.shape[1]))
    h5file.flush()
    # Write the word->index key, sorted alphabetically by word.
    words.sort()
    open(args.wordkey, "w").write("\n".join(
        "%s %d" % (w, wordstoindex[w]) for w in words))
def process_one_file(wav_base_name):
    """Compute MFCC features for one WAV file and dump them to ``featdir``.

    Reads ``wavdir/<wav_base_name>.wav``, converts the PCM frames to a
    NumPy sample array, runs the MFCC front end, pickles the feature
    matrix to ``featdir/<wav_base_name>.feat``, and verifies the dump by
    reloading it.

    :param wav_base_name: file stem (no directory, no ``.wav`` extension)
    :raises ValueError: if the WAV sample width is not 1, 2 or 4 bytes

    NOTE: relies on module-level globals ``wavdir``, ``featdir`` and ``mfcc``.
    """
    wavefilename = os.path.join(wavdir, wav_base_name + ".wav")
    print(wavefilename)
    fh = wave.open(wavefilename, "r")
    sampwidth = fh.getsampwidth()
    print(fh.getparams())
    nsamples = fh.getnframes()
    # Renamed from `bytes`, which shadowed the builtin.
    raw_frames = fh.readframes(nsamples)
    fh.close()
    # BUG FIX: the array typecode was hard-coded to "h" (16-bit) even
    # though sampwidth was queried above; pick the code matching the file.
    typecodes = {1: "b", 2: "h", 4: "i"}
    if sampwidth not in typecodes:
        raise ValueError("unsupported sample width: %d" % sampwidth)
    samples = np.array(array.array(typecodes[sampwidth], raw_frames))
    print(len(samples))
    print(samples[12000:12020])
    mfcc_processor = mfcc.MFCC()
    feats = mfcc_processor.sig2s2mfc(samples)
    print(len(feats))
    print(feats[10:12])
    featfilename = os.path.join(featdir, wav_base_name + ".feat")
    # NOTE: ndarray.dump writes a pickle — only reload trusted files.
    feats.dump(featfilename)
    # allow_pickle=True is required on NumPy >= 1.16.3, where it became
    # the default-off safety switch; the dump above is always a pickle.
    checkfeats = np.load(featfilename, allow_pickle=True)
    assert (feats == checkfeats).all()