def build_aa_dataset(in_samples, out_samples, shift, n_train=100, n_valid=10):
    aa_seqs = np.load('/data/lisa/data/timit/readable/per_phone/wav_aa.npy')

    mean = np.mean(np.hstack(aa_seqs))
    std = np.std(np.hstack(aa_seqs))
    print "mean:%f , std:%f" % (mean, std)
    aa_max, aa_min = np.max(np.hstack(aa_seqs)), np.min(np.hstack(aa_seqs))

    norm_seqs = np.asarray([(seq.astype('float32') - mean) / std
                            for seq in aa_seqs])

    train_aa_seqs = norm_seqs[:n_train]
    valid_aa_seqs = norm_seqs[n_train:n_train + n_valid]

    print 'train sequences:', train_aa_seqs.shape[0]
    print 'valid sequences:', valid_aa_seqs.shape[0]

    frame_len = in_samples + out_samples
    overlap = frame_len - shift

    train_samples = []
    valid_samples = []

    for wav_seq in train_aa_seqs:
        train_samples.append(segment_axis(wav_seq, frame_len, overlap))
    train_samples = np.vstack(train_samples)

    np.random.seed(123)
    train_samples = np.random.permutation(train_samples)

    for wav_seq in valid_aa_seqs:
        valid_samples.append(segment_axis(wav_seq, frame_len, overlap))
    valid_samples = np.vstack(valid_samples)

    print 'train examples:', train_samples.shape
    print 'valid examples:', valid_samples.shape

    train_x = train_samples[:, :in_samples]
    train_y = train_samples[:, in_samples:]
    print train_x.shape, train_y.shape

    valid_x = valid_samples[:, :in_samples]
    valid_y = valid_samples[:, in_samples:]
    print valid_x.shape, valid_y.shape

    return utils.shared_dataset(train_x), \
        utils.shared_dataset(train_y), \
        utils.shared_dataset(valid_x), \
        utils.shared_dataset(valid_y)
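A minimal usage sketch, assuming the hard-coded .npy path is reachable and that utils.shared_dataset wraps a NumPy array in a Theano shared variable; the argument values below are illustrative only:

# Hypothetical call: predict the next sample from the previous 240,
# hopping one sample between consecutive frames.
train_x, train_y, valid_x, valid_y = build_aa_dataset(
    in_samples=240, out_samples=1, shift=1, n_train=100, n_valid=10)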
def get_emph_spec(audio, nperseg=256, noverlap=96, nfft=512, fs=16000):
    # Generate the pre-emphasized magnitude spectrogram.
    prefac = 0.97
    w = hamming(nperseg, sym=0)
    extract = preemp(audio, prefac)
    framed = segment_axis(extract, nperseg, noverlap) * w
    spec = np.abs(fft(framed, nfft, axis=-1))
    return spec
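A hedged example of calling get_emph_spec, assuming a 1-D float signal at 16 kHz and that the preemp/hamming/fft helpers used above are importable; the tone is synthetic and purely illustrative:

import numpy as np

# One second of a 440 Hz tone at 16 kHz.
audio = np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000.0)
spec = get_emph_spec(audio)
print spec.shape  # (n_frames, nfft): one magnitude spectrum per frame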
def pro_signal(signal, window='hanning', frame_len=1024, overlap=512):
    if window == 'hanning':
        # w = np.hanning(frame_len)
        w = sqrt_hann(frame_len)
    else:
        w = window
    # Use end='cut' instead of padding so every frame is fully populated.
    y = segment_axis(signal, frame_len, overlap=overlap, end='cut')
    y = w * y
    return y
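A usage sketch; `signal` is a hypothetical 1-D array, and sqrt_hann is presumably a square-root Hann window, the usual choice when the frames will later be overlap-added:

# Hypothetical usage: 1024-sample frames with 50% overlap; trailing
# samples that do not fill a whole frame are cut rather than padded.
frames = pro_signal(signal, window='hanning', frame_len=1024, overlap=512)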
def frames(self, utterance, framelen, overlap):
    phtimes = self.phone_times(utterance)
    s = self.samples(utterance)[1]
    uttfr = []
    uttph = []
    for p in phtimes:
        # Only segment phones that span at least one full frame;
        # each frame inherits the label of its phone.
        if p[2] - p[1] > framelen:
            uttfr.append(segment_axis(s[p[1]:p[2]], framelen, overlap))
            uttph.append(list(itertools.repeat(p[0], uttfr[-1].shape[0])))
    return np.vstack(uttfr), list(itertools.chain(*uttph))
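A hedged usage sketch; `corpus` and the utterance identifier are hypothetical stand-ins for whatever object exposes phone_times() and samples():

# 25 ms frames (400 samples at 16 kHz) with a 240-sample overlap;
# returns one row per frame plus the phone label of each frame.
fr, labels = corpus.frames('dr1/fcjf0/sa1', framelen=400, overlap=240)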
def build_data_sets(frame_len):
    """builds data sets for training/validating/testing the models"""
    print 'loading data...'
    save_stdout = sys.stdout
    sys.stdout = open('timit.log', 'w')

    # creating wrapper object for TIMIT dataset
    dataset = TIMIT()
    dataset.load("train")
    sys.stdout = save_stdout

    overlap = frame_len - 1
    wav_seqs = dataset.train_raw_wav[0:10]
    norm_seqs = utils.normalize(wav_seqs)

    # Segment into frames
    samples = map(lambda seq: segment_axis(seq, frame_len, overlap),
                  norm_seqs)

    # stack all data in one matrix, each row is a frame
    data = np.vstack(samples)

    # shuffle the frames so we can assume data is IID
    np.random.seed(123)
    data = np.random.permutation(data)

    # take 10% for test, 10% for valid, and 80% for training
    chunk = data.shape[0] / 10

    # now split data to x and y for train, valid, and test
    train_x = data[:8 * chunk, :-1]
    train_y = data[:8 * chunk, -1]
    valid_x = data[8 * chunk:9 * chunk, :-1]
    valid_y = data[8 * chunk:9 * chunk, -1]
    test_x = data[9 * chunk:, :-1]
    test_y = data[9 * chunk:, -1]

    print 'train_x shape', train_x.shape
    print 'train_y shape', train_y.shape
    print 'Done'
    print 'There are %d training samples' % train_x.shape[0]
    print 'There are %d validation samples' % valid_x.shape[0]

    return utils.shared_dataset_xy((train_x, train_y)), \
        utils.shared_dataset_xy((valid_x, valid_y)), \
        utils.shared_dataset_xy((test_x, test_y))
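A usage sketch, assuming the TIMIT wrapper and utils module used above are importable; frame_len is illustrative:

# 240-sample frames: the first 239 samples of each frame are the input,
# the last sample is the regression target.
train, valid, test = build_data_sets(frame_len=240)
train_x, train_y = train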
def get_markov_frames(self, subset, id):
    """
    Given the subset and an id, this method returns the list
    [input_frames, input_phonemes, input_words, output_phoneme,
    output_word, spkr_info, output_frame, ending_phoneme, ending_word].
    """
    assert subset + "_intervals_seq" in self.__dict__.keys()
    assert id < self.__dict__[subset + "_intervals_seq"][-1]

    n_frames_in = self.__dict__[subset + "_n_frames_in"]
    frame_length = self.__dict__[subset + "_frame_length"]
    overlap = self.__dict__[subset + "_overlap"]
    wav_length = self.__dict__[subset + "_wav_length"]
    intervals_seq = self.__dict__[subset + "_intervals_seq"]

    # Find the acoustic samples sequence we are looking for
    seq_id = np.digitize([id], intervals_seq) - 1
    seq_id = seq_id[0]

    # Find the position in this sequence
    idx_in_seq = id - intervals_seq[seq_id] - (wav_length - frame_length
                                               + overlap)

    # Get the sequence
    wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

    # Get the phonemes
    phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
    phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
    phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
    phn_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any phoneme, so 0 is the index
    # for "NO_PHONEME" and the other indices are shifted by one
    for (phn_start, phn_end, phn) in phn_start_end:
        phn_seq[phn_start:phn_end] = phn + 1

    # Get the words
    wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
    wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
    wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
    wrd_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any word, so 0 is the index
    # for "NO_WORD" and the other indices are shifted by one
    for (wrd_start, wrd_end, wrd) in wrd_start_end:
        wrd_seq[wrd_start:wrd_end] = wrd + 1

    # Binary variables announcing the end of the word or phoneme
    end_phn = np.zeros_like(phn_seq)
    end_wrd = np.zeros_like(wrd_seq)
    for i in range(len(phn_seq) - 1):
        if phn_seq[i] != phn_seq[i + 1]:
            end_phn[i] = 1
        if wrd_seq[i] != wrd_seq[i + 1]:
            end_wrd[i] = 1
    end_phn[-1] = 1
    end_wrd[-1] = 1

    # Find the speaker id
    spkr_id = self.__dict__[subset + "_spkr"][seq_id]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    # Pick the selected segment
    padded_wav_seq = np.zeros((wav_length))
    if idx_in_seq < 0:
        padded_wav_seq[-idx_in_seq:] = wav_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_wav_seq = wav_seq[idx_in_seq:(idx_in_seq + wav_length)]

    padded_phn_seq = np.zeros((wav_length))
    if idx_in_seq < 0:
        padded_phn_seq[-idx_in_seq:] = phn_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_phn_seq = phn_seq[idx_in_seq:(idx_in_seq + wav_length)]

    padded_wrd_seq = np.zeros((wav_length))
    if idx_in_seq < 0:
        padded_wrd_seq[-idx_in_seq:] = wrd_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_wrd_seq = wrd_seq[idx_in_seq:(idx_in_seq + wav_length)]

    # Segment into frames
    wav_seq = segment_axis(padded_wav_seq, frame_length, overlap)

    # Take the most occurring phoneme in a sequence
    phn_seq = segment_axis(padded_phn_seq, frame_length, overlap)
    phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
    phn_seq = np.asarray(phn_seq, dtype='int')

    # Take the most occurring word in a sequence
    wrd_seq = segment_axis(padded_wrd_seq, frame_length, overlap)
    wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
    wrd_seq = np.asarray(wrd_seq, dtype='int')

    # Announce the end if and only if it was announced in the current frame
    end_phn = segment_axis(end_phn, frame_length, overlap)
    end_phn = end_phn.max(axis=1)
    end_wrd = segment_axis(end_wrd, frame_length, overlap)
    end_wrd = end_wrd.max(axis=1)

    # Put names on the output
    input_frames = wav_seq[:-1]
    input_phonemes = phn_seq[:-1]
    input_words = wrd_seq[:-1]
    output_phoneme = phn_seq[-1]
    output_word = wrd_seq[-1]
    output_frame = wav_seq[-1]
    ending_phoneme = end_phn[-1]
    ending_word = end_wrd[-1]

    return [input_frames, input_phonemes, input_words, output_phoneme,
            output_word, spkr_info, output_frame, ending_phoneme,
            ending_word]
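A hedged call sketch; `timit` stands for an instance whose train_* attributes (frame length, overlap, wav length, interval index) have already been populated, which this method asserts:

# Hypothetical: fetch the Markov-style example with id 0 from "train".
example = timit.get_markov_frames("train", 0)
input_frames, input_phonemes, input_words = example[0], example[1], example[2]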
def get_raw_seq(self, subset, seq_id, frame_length, overlap):
    """
    Given the id of the subset, the id of the sequence, the frame length
    and the overlap between frames, this method will return a frame
    sequence from a given set, the associated phoneme and word sequences
    (including a binary variable indicating change) and the information
    vector on the speaker.
    """
    self.check_subset_value(subset)
    self.check_subset_presence(subset)

    # Check if the id is valid
    n_seq = self.__dict__[subset + "_n_seq"]
    if seq_id >= n_seq:
        raise ValueError("This sequence does not exist.")

    # Get the sequence
    wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

    # Get the phonemes
    phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
    phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
    phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
    phn_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any phoneme, so 0 is the index
    # for "NO_PHONEME" and the other indices are shifted by one
    for (phn_start, phn_end, phn) in phn_start_end:
        phn_seq[phn_start:phn_end] = phn + 1

    # Get the words
    wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
    wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
    wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
    wrd_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any word, so 0 is the index
    # for "NO_WORD" and the other indices are shifted by one
    for (wrd_start, wrd_end, wrd) in wrd_start_end:
        wrd_seq[wrd_start:wrd_end] = wrd + 1

    # Binary variables announcing the end of the word or phoneme
    end_phn = np.zeros_like(phn_seq)
    end_wrd = np.zeros_like(wrd_seq)
    for i in range(len(phn_seq) - 1):
        if phn_seq[i] != phn_seq[i + 1]:
            end_phn[i] = 1
        if wrd_seq[i] != wrd_seq[i + 1]:
            end_wrd[i] = 1
    end_phn[-1] = 1
    end_wrd[-1] = 1

    # Find the speaker id
    spkr_id = self.__dict__[subset + "_spkr"][seq_id]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    # Segment into frames
    wav_seq = segment_axis(wav_seq, frame_length, overlap)

    # Take the most occurring phoneme in a frame
    phn_seq = segment_axis(phn_seq, frame_length, overlap)
    phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
    phn_seq = np.asarray(phn_seq, dtype='int')

    # Take the most occurring word in a frame
    wrd_seq = segment_axis(wrd_seq, frame_length, overlap)
    wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
    wrd_seq = np.asarray(wrd_seq, dtype='int')

    # Announce the end if and only if it was announced in the current frame
    end_phn = segment_axis(end_phn, frame_length, overlap)
    end_phn = end_phn.max(axis=1)
    end_wrd = segment_axis(end_wrd, frame_length, overlap)
    end_wrd = end_wrd.max(axis=1)

    return [wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info]
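A hedged call sketch with illustrative framing parameters; `timit` is again a hypothetical instance of the wrapper class:

# Hypothetical: frame training sequence 0 into 240-sample frames that
# overlap by 50 samples.
wav, phn, end_phn, wrd, end_wrd, spkr = timit.get_raw_seq("train", 0, 240, 50)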
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients
    mspec: ndarray
        Log-spectrum in the mel-domain.

    Notes
    -----
    MFCC are computed as follows:

    * Pre-processing in time-domain (pre-emphasizing)
    * Compute the spectrum amplitude by windowing with a Hamming window
    * Filter the signal in the spectral domain with a triangular
      filter-bank, whose filters are approximately linearly spaced on the
      mel scale and have equal bandwidth in the mel scale
    * Compute the DCT of the log-spectrum

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in
           continuously spoken sentences", IEEE Trans. Acoustics, Speech,
           Signal Proc. ASSP-28 (4): 357-366, August 1980."""
    # MFCC parameters: taken from auditory toolbox
    over = nwin - 160
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of
    # the radiation at the lips level)
    prefac = 0.97

    #lowfreq = 400 / 3.
    lowfreq = 133.33
    #highfreq = 6855.4976
    linsc = 200 / 3.
    logsc = 1.0711703

    nlinfil = 13
    nlogfil = 27
    nfil = nlinfil + nlogfil

    w = hamming(nwin, sym=0)

    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    #------------------
    # Compute the MFCC
    #------------------
    extract = preemp(input, prefac)
    framed = segment_axis(extract, nwin, over) * w

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))
    # Filter the spectrum through the triangle filterbank
    mspec = np.log10(np.dot(spec, fbank.T))
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps]

    return ceps, mspec, spec
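A usage sketch, assuming `signal` is a hypothetical 1-D 16 kHz array; with nwin=256 and the 160-sample hop implied by `over`, this yields one row of 13 cepstral coefficients per 10 ms:

ceps, mspec, spec = mfcc(signal, nwin=256, nfft=512, fs=16000, nceps=13)
print ceps.shape  # (n_frames, 13)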
def _build_frames_w_phn(dataset, subset, wav_seqs, seqs_to_phns,
                        in_samples, out_samples, shift, win_width, shuffle):
    norm_seqs = utils.standardize(wav_seqs)
    #norm_seqs = utils.normalize(wav_seqs)

    frame_len = in_samples + out_samples
    overlap = frame_len - shift

    samples = []
    seqs_phn_info = []
    seqs_phn_shift = []

    # CAUTION!: I am using the reduced phone set here. We could also use
    # the full set, but then we must store phn+1, because 0 would no
    # longer refer to 'h#' (no speech).
    for ind in range(len(norm_seqs)):
        wav_seq = norm_seqs[ind]
        phn_seq = seqs_to_phns[ind]
        phn_start_end = dataset.__dict__[subset + "_phn"][phn_seq[0]:phn_seq[1]]

        # Create a matrix with consecutive windows. Phones are padded by
        # h#, because each window will be shifted once the first phone's
        # samples have passed.
        phones = np.append(phn_start_end[:, 2].astype('int16'),
                           np.zeros((1,), dtype='int16'))
        phn_windows = segment_axis(phones, win_width, win_width - 1)

        # Array that holds the ending of each phone
        phn_ends = phn_start_end[:, 1]
        # Extend the last phone till the end; this is not wrong as long as
        # the last phone is the no-speech phone (h#)
        phn_ends[-1] = wav_seq.shape[0] - 1

        # Create a mapping from each sample to its phone window
        phn_win_shift = np.zeros_like(wav_seq, dtype='int16')
        phn_win_shift[phn_ends] = 1
        phn_win = phn_win_shift.cumsum(dtype='int16')
        # Minor correction!
        phn_win[-1] = phn_win[-2]

        # Segment samples into frames
        samples.append(segment_axis(wav_seq, frame_len, overlap))

        # For phones we care only about one value to mark the start of a
        # new window. The start of a phone window in a frame is when all
        # samples of the previous phone have passed, so we use 'min' to
        # choose the current phone of the frame.
        phn_frames = segment_axis(phn_win, frame_len, overlap).min(axis=1)
        # Replace the window index with the window itself
        win_frames = phn_windows[phn_frames]
        seqs_phn_info.append(win_frames)

        # Create a window shift for each frame
        shift_frames_aux = np.roll(phn_frames, 1)
        shift_frames_aux[0] = 0
        shift_frames = phn_frames - shift_frames_aux
        # To mark the ending of the sequence - countering the first
        # correction!
        shift_frames[-1] = 1
        seqs_phn_shift.append(shift_frames)

    # Stack all data in one matrix, each row is a frame
    samples_data = np.vstack(samples)
    phn_data = np.vstack(seqs_phn_info)
    shift_data = np.hstack(seqs_phn_shift)

    # Convert phone data to one-hot
    from pylearn2.format.target_format import OneHotFormatter
    fmt = OneHotFormatter(max_labels=39, dtype='float32')
    phn_data = fmt.format(phn_data)
    phn_data = phn_data.reshape(phn_data.shape[0],
                                phn_data.shape[1] * phn_data.shape[2])

    full_data = np.hstack([samples_data[:, :in_samples], phn_data,      # input
                           samples_data[:, in_samples:],                # out1
                           shift_data.reshape(shift_data.shape[0], 1)])  # out2

    if shuffle:
        np.random.seed(123)
        full_data = np.random.permutation(full_data)

    data_x = full_data[:, :in_samples + win_width * 39]
    data_y1 = full_data[:, in_samples + win_width * 39:-1]
    data_y2 = full_data[:, -1]

    print 'Done'
    print 'There are %d examples in %s set' % (data_x.shape[0], subset)
    print "--------------"
    print 'data_x.shape', data_x.shape
    print 'data_y1.shape', data_y1.shape

    return utils.shared_dataset(data_x), \
        utils.shared_dataset(data_y1), \
        utils.shared_dataset(data_y2)
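A hedged call sketch; all argument values are illustrative, and with win_width=2 each frame carries a 2-phone window, so data_x has in_samples + 2 * 39 columns:

x, y1, y2 = _build_frames_w_phn(dataset, "train", wav_seqs, seqs_to_phns,
                                in_samples=240, out_samples=1, shift=1,
                                win_width=2, shuffle=True)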
import numpy as np
from scipy.io import wavfile
from scikits.talkbox import segment_axis
from sklearn.decomposition import SparseCoder
# gammatone_matrix and erb_space are assumed to be defined elsewhere
# in this module.

resolution = 160
step = 8
b = 1.019
n_channels = 64
overlap = 80

# Compute a multiscale dictionary
D_multi = np.r_[tuple(gammatone_matrix(b, fc, resolution, step)
                      for fc in erb_space(150, 8000, n_channels))]

# Load test signal
fs, y = wavfile.read('/home/jfsantos/data/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV')
y = y / 2.0 ** 15

Y = segment_axis(y, resolution, overlap=overlap, end='pad')
Y = np.hanning(resolution) * Y  # segments should be windowed and overlap

coder = SparseCoder(dictionary=D_multi, transform_n_nonzero_coefs=None,
                    transform_alpha=1., transform_algorithm='omp')
X = coder.transform(Y)
density = len(np.flatnonzero(X))

# Overlap-add reconstruction from the coded frames
out = np.zeros(int(np.ceil(len(y) / resolution) + 1) * resolution)
for k in range(0, len(X)):
    idx = range(k * (resolution - overlap),
                k * (resolution - overlap) + resolution)
    out[idx] += np.dot(X[k], D_multi)

squared_error = np.sum((y - out[0:len(y)]) ** 2)

wavfile.write('reconst_%d_%d.wav' % (resolution, overlap), fs,
              np.asarray(out, dtype=np.float32))
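For reference, a minimal segment_axis demonstration of the hop arithmetic used above (the hop is frame length minus overlap, here 160 - 80 = 80); the toy array makes the overlapping rows visible:

import numpy as np
from scikits.talkbox import segment_axis

print segment_axis(np.arange(10), 4, overlap=2)
# [[0 1 2 3]
#  [2 3 4 5]
#  [4 5 6 7]
#  [6 7 8 9]]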
def make_frames(self, signal, fs, frame_duration, overlap=0.5):
    # frame_duration is in milliseconds; overlap is a fraction of a frame.
    nsamples_pframe = fs * frame_duration / 1000
    # Cast to int so segment_axis receives an integral overlap.
    overlapframes = int(nsamples_pframe * overlap)
    frames = segment_axis(signal, nsamples_pframe, overlapframes)
    return frames
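A worked example of the frame arithmetic with hypothetical values: at fs=16000 and frame_duration=20 (ms), nsamples_pframe is 320 and a 0.5 overlap gives 160 overlapping samples per frame; `processor` and `signal` are illustrative stand-ins:

# Hypothetical usage on a 16 kHz signal.
frames = processor.make_frames(signal, fs=16000, frame_duration=20,
                               overlap=0.5)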
    for fc in erb_space(150, 8000, n_channels)]).flatten()
centers = np.array([gammatone_matrix(b, fc, resolution, step)[2] + i * resolution
                    for i, fc in enumerate(erb_space(150, 8000, n_channels))
                    ]).flatten()

# Load test signal
filename = 'data/fsew/fsew0_001.wav'
f = Sndfile(filename, 'r')
nf = f.nframes
fs = f.samplerate
length_sound = 20000
y = f.read_frames(5000)          # reads (and skips past) the first 5000 frames
y = f.read_frames(length_sound)  # the next 20000 frames are the test signal

Y = segment_axis(y, resolution, overlap=overlap, end='pad')
Y = np.hanning(resolution) * Y

# Encoding with matching pursuit
X = np.zeros((Y.shape[0], D_multi.shape[0]))
for idx in range(Y.shape[0]):
    X[idx, :] = matching_pursuit(Y[idx, :], D_multi)

# Reconstruction of the signal by overlap-add
out = np.zeros(int((np.ceil(len(y) / resolution) + 1) * resolution))
for k in range(0, len(X)):
    idx = range(k * (resolution - overlap),
                k * (resolution - overlap) + resolution)
    out[idx] += np.dot(X[k], D_multi)

squared_error = np.sum((y - out[0:len(y)]) ** 2)
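A hedged follow-up that turns the squared error above into a signal-to-reconstruction-error ratio; this is plain arithmetic added for illustration, not part of the original script:

srr = 10 * np.log10(np.sum(y ** 2) / squared_error)
print 'SRR: %.2f dB' % srr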