def read_pm(self, fname):
    """Read pitch-mark values from a text file with an ``EST_Header_End`` header.

    Every non-blank line after the first line starting with
    ``EST_Header_End`` contributes its first whitespace-separated field,
    parsed as a float.

    If the values are not monotonically increasing a warning is printed, the
    final value is dropped and the check is repeated (warning again if the
    sequence is still not monotonic).  The values are returned either way.

    NOTE(review): indentation was lost in the copy this was restored from;
    the "drop last value and re-check" step is assumed to run only after the
    first check fails -- confirm against the original file.

    Parameters
    ----------
    fname : str
        Path of the pitch-mark file.

    Returns
    -------
    numpy.ndarray
        1-D float array of the parsed values.

    Raises
    ------
    ValueError
        If no ``EST_Header_End`` line is present, or a field is not a float.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(fname, 'r') as f:
        lines = f.readlines()

    start = None
    for (i, line) in enumerate(lines):
        if line.startswith('EST_Header_End'):
            start = i + 1
            break
    if start is None:
        # Previously this fell through to an unbound local variable.
        raise ValueError('No EST_Header_End line found in %s' % (fname))

    # str.split() tolerates leading whitespace; blank lines are skipped.
    values = [float(line.split()[0]) for line in lines[start:] if line.strip()]
    values = np.array(values)

    ## debug: make sure monotonic increase
    if np.any(np.diff(values) < 0.0):
        print('WARNING: pitch marks not monotonically increasing in %s' % (fname))
        values = values[:-1]
        if np.any(np.diff(values) < 0.0):
            print('WARNING: pitch marks not monotonically increasing in %s' % (fname))
    return values
def get_file_data_from_one_file(self):
    '''
    Build the (inputs, targets) matrices for the file at ``self.file_index``.

    Here is most of the database specific stuff.  This one should be
    provided by subclasses to manipulate data appropriately.

    The entry ``self.filelist[self.file_index]`` is unpacked as
    ``(wave_file, pitch_mark_file)``.  Pitch marks are multiplied by a
    hard-coded 48000 (presumably the sample rate; the rate returned by
    ``read_wave`` is not used -- TODO confirm), grouped into overlapping
    triples, and one waveform fragment is extracted per triple with
    ``get_pm_wavefrag``.  The stacked fragments are shifted and scaled
    assuming 16-bit signed audio.

    Returns a 2-tuple holding the same array twice, or two empty ``(0, 0)``
    arrays when ``read_pm`` yields a single-element result.
    '''
    wave_file, pitch_mark_file = self.filelist[self.file_index]

    pitchmarks = read_pm(pitch_mark_file) * 48000
    if pitchmarks.size == 1:
        return (np.zeros((0, 0)), np.zeros((0, 0)))
    pitchmarks = np.array(pitchmarks, dtype='int')

    # Sliding window of 3 marks advancing one mark at a time.
    pitchmark_triples = segment_axis(pitchmarks, 3, 2, axis=0)
    wave, sample_rate = read_wave(wave_file)

    pad_length = 1000
    fragment_rows = []
    for pm_trip in pitchmark_triples:
        fragment_rows.append(get_pm_wavefrag(pm_trip, wave, pad_length))
    frags = np.vstack(fragment_rows)

    ### do 0-1 range normalisation, assume 16bit signed audio:
    data_range = math.pow(2, 16)
    half_range = data_range / 2
    frags += half_range
    frags /= data_range
    return (frags, frags)
def window_signal(self):
    """
    Return the signal cut into windows of three frames each.

    One frame is ``get_frame_size() * get_fs()`` samples long; each window
    spans three frames and consecutive windows share one frame's worth of
    samples.  ``segment_axis`` is called with ``end="pad"`` for a signal
    that does not divide evenly.  The result is whatever ``segment_axis``
    returns; the stored signal is not reassigned here.
    """
    frame_samples = self.get_frame_size() * self.get_fs()
    return segment_axis(self.get_signal(),
                        frame_samples * 3,
                        frame_samples,
                        end="pad")
def cache(data):
    """Store a windowed-median table on ``data.table``.

    Uses ``self``, ``progression_dim``, ``window_size`` and ``overlap`` from
    the enclosing scope: rows of ``self.data`` are ordered by the
    ``progression_dim`` column, cut into windows along axis 0, and the median
    of each window is wrapped in a ``DataTable`` with ``self.dims``.
    """
    # Order rows by the requested dimension.
    column = self.dims.index(progression_dim)
    row_order = self.data[:, column].argsort()
    ordered_rows = self.data[row_order, ]

    # Cut into windows and reduce each one to its median.
    from segmentaxis import segment_axis
    windows = segment_axis(ordered_rows, window_size, overlap, axis=0)
    data.table = DataTable(np.median(windows, axis=1), self.dims)
def __init__(self, which_set='train', seq_len=3010, frame_size=320,
             hop_size=32, axes=('b', 0, 1, 'c'), noutput_frames=1,
             ninput_frames=9,
             overlap=False, # not implemented yet
             preprocessing=True):
    """Load the 'aa' waveform array, convert it to spectral frames and split it.

    Sequences longer than ``seq_len`` are truncated to ``seq_len`` (shorter
    or equal-length ones are dropped), optionally standardised, passed
    through ``stft``, windowed with ``segment_axis`` and cut into
    train/valid/test portions (first 80% / next 10% / last 10%).  Features
    and targets are then built from shifted slices of the windowed array and
    handed to the parent constructor.

    Parameters
    ----------
    which_set : str
        'train', 'valid' or 'test'.  Any other value leaves the data unsplit.
    seq_len : int
        Length every kept sequence is cut down to.
    frame_size, hop_size : int
        Passed to ``stft``; ``frame_size`` also sizes the windows.
    axes : tuple
        Passed to ``DefaultViewConverter``.
    noutput_frames, ninput_frames : int
        Number of frames per target / per input window.
    overlap : bool
        Stored but otherwise unused (marked "not implemented yet").
    preprocessing : bool
        If true, subtract the global mean and divide by the global std.
    """
    # Copy every constructor argument onto the instance.  This must stay the
    # first statement so locals() holds only the parameters.
    self.__dict__.update(locals())
    del self.self
    # cut sequences down to seq_len
    dat = np.load('/data/lisa_ubi/speech/onomatopoeia/dataset/per_phone_timit/wav_aa.npy')
    lengths = [len(i) for i in dat]
    # `A` is defined elsewhere in the file (presumably an array constructor
    # alias -- TODO confirm).
    daat = A([A(dat[i][:seq_len]) for i in range(len(dat)) if lengths[i] > seq_len])
    if preprocessing:
        # Statistics are kept on the instance; std is taken after centring.
        self.mean = np.mean(daat)
        daat -= self.mean
        self.std = np.std(daat)
        daat /= self.std
    # Convert to Spectral
    daat = A([np.abs(stft(ex,1,frame_size,hop_size)).flatten() for ex in daat])
    print daat.shape
    # Stride and flatten: windows of ninput_frames frames that advance by
    # one frame (overlap of ninput_frames-1 frames) along axis 1.
    daat = segment_axis(daat, ninput_frames*frame_size, (ninput_frames-1)*frame_size, axis=1)
    print daat.shape
    # Split along the first axis.
    if which_set == 'train':
        daat = daat[:int(.8*len(daat))]
    if which_set == 'valid':
        daat = daat[int(.8*len(daat)):int(.9*len(daat))]
    if which_set == 'test':
        daat = daat[int(.9*len(daat)):]
    # Inputs are all windows but the last noutput_frames; targets are the
    # trailing frame_size*noutput_frames values of the windows shifted by
    # noutput_frames.
    features = daat[:,:-noutput_frames,:].reshape(-1,frame_size*ninput_frames)
    targets = daat[:,noutput_frames:,-frame_size*noutput_frames:].reshape(-1,frame_size*noutput_frames)
    # FIXME BELOW HERE for CNNs
    #--------------------------
    IMAGES_SHAPE = features.shape + (1,)
    print targets.shape
    print features.shape
    X, y = features, targets
    view_converter = DefaultViewConverter(shape=IMAGES_SHAPE, axes=axes)
    super(AA, self).__init__(X=X, y=y, view_converter=view_converter)
def test_merge_frames(self):
    """
    Check that ``Synthesizer._merge_frames`` rebuilds one flat array from
    overlapping frames.

    The frames are ``arange(16)`` cut into windows of 6 with an overlap of 2
    (padded at the end); the expected values are the hand-computed merged
    signal.
    """
    frames = segment_axis(np.arange(0, 16), 6, 2, end="pad")
    synthesizer = Synthesizer(LPCFrameArray(1, 2, frames))

    merged = synthesizer._merge_frames(frames)

    expected = np.array([0, 1, 2, 3,
                         3.4, 4.25, 6, 7,
                         6.8, 7.65, 10, 11,
                         10.2, 11.05, 14, 15,
                         0, 0])
    assert_array_almost_equal(expected, merged)
def frame_signal(input, nwin=256, over=None):
    """Pre-emphasise ``input`` and cut it into Hamming-windowed frames.

    Parameters
    ----------
    input : array-like
        Signal passed to ``preemp``.
    nwin : int
        Frame length in samples.
    over : int or None
        Overlap between consecutive frames; ``None`` selects ``nwin - 160``
        (MFCC parameters taken from the auditory toolbox).

    Returns
    -------
    The frames produced by ``segment_axis`` multiplied by the window.
    """
    overlap = nwin - 160 if over is None else over

    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of
    # the radiation at the lips level)
    emphasised = preemp(input, 0.97)
    window = hamming(nwin, sym=0)
    return segment_axis(emphasised, nwin, overlap) * window
def window_agg(self, progression_dim, window_size=1000, overlap=500,
               agg_method='median'):
    """
    Creates a sliding window that moves over the specified dimension and
    aggregates all values per window.

    Parameters
    ----------
    progression_dim :
        Entry of ``self.dims`` whose column orders the rows.
    window_size : int
        Number of rows per window (coerced with ``int``).
    overlap : int
        Number of rows shared by consecutive windows (coerced with ``int``).
    agg_method : str
        ``'median'`` or ``'average'``.

    Returns
    -------
    DataTable
        One aggregated row per window.

    Raises
    ------
    Exception
        If ``agg_method`` is not recognised.
    """
    # First try using bottleneck, ignoring overlap
    try:
        return self.window_agg_using_bottleneck(progression_dim, window_size,
                                                overlap, agg_method)
    except ImportError:
        pass

    if agg_method not in ('median', 'average'):
        raise Exception('Unknown agg method')

    window_size = int(window_size)
    overlap = int(overlap)

    # first sort by the given dim:
    xdim_index = self.dims.index(progression_dim)
    sorted_data = self.data[self.data[:, xdim_index].argsort(), ]

    # create windows:
    from segmentaxis import segment_axis
    with Timer('segment_axis'):
        seg_data = segment_axis(sorted_data, window_size, overlap, axis=0)

    # Aggregate in chunks of windows so that sorting the strided view does
    # not exhaust memory.  The chunk size must be an integer of at least 1:
    # a float cannot be used as a slice bound and 0 would never advance.
    values_per_chunk = 50 * 1000000
    values_per_window = max(1, seg_data.shape[1] * seg_data.shape[2])
    windows_per_iteration = max(1, values_per_chunk // values_per_window)

    chunks = []
    for start_win in range(0, seg_data.shape[0], windows_per_iteration):
        # Only this slice is aggregated; reducing all of seg_data on every
        # pass repeated the full result once per chunk.
        iteration_data = seg_data[start_win:start_win + windows_per_iteration]
        if agg_method == 'median':
            with Timer('Median over %d windows' % windows_per_iteration):
                chunks.append(np.median(iteration_data, axis=1))
        else:
            chunks.append(np.average(iteration_data, axis=1))
        # Progress output kept from the original implementation.
        print(start_win + windows_per_iteration)

    agg_data = np.concatenate(chunks) if chunks else None
    return DataTable(agg_data, self.dims, self.legends, self.tags.copy())
def get_multiple_trees_for_greedy_search(self):
    '''
    Partition data in hard way with k-means, build 1 KD-tree per partition.

    Populates ``self.prev_join_rep``, ``self.current_join_rep``,
    ``self.joint_trees`` (one ``cKDTree`` per cluster, built over the stacked
    join + target features) and ``self.node_maps`` (for each tree, the unit
    indexes of its rows).  When the ``multiepoch`` config value exceeds 1,
    the target features are first stacked over ``multiepoch`` consecutive
    rows and the join representations are trimmed to match.
    '''
    self.prev_join_rep = self.unit_start_data    ## !osw
    self.current_join_rep = self.unit_end_data   ## !osw

    start_time = self.start_clock('build multiple joint KD trees')

    ## Needs to be stored synthesis options specified (due to weights
    ## applied before tree construction).  The name is built but not used
    ## further in this method.
    treefile = get_data_dump_name(self.config) + '_' + make_synthesis_condition_name(self.config) + '_joint_tree.pkl'

    multiepoch = self.config.get('multiepoch', 1)
    overlap = 0
    if multiepoch > 1:
        overlap = multiepoch - 1

        ### reshape target rep: each row becomes `multiepoch` stacked rows
        n_rows, n_cols = self.train_unit_features.shape
        self.train_unit_features = segment_axis(self.train_unit_features, multiepoch, overlap=overlap, axis=0)
        self.train_unit_features = self.train_unit_features.reshape(n_rows - overlap, n_cols * multiepoch)

        if self.config.get('last_frame_as_target', False):
            print('test -- take last frame only as target...')  ## TODO99
            # Keep only the first and the last of the stacked rows.
            first_part = self.train_unit_features[:, :n_cols]
            last_part = self.train_unit_features[:, -n_cols:]
            self.train_unit_features = np.hstack([first_part, last_part])

        ### alter join reps: -- first tried taking first vs. last
        self.current_join_rep = self.current_join_rep[overlap:, :]
        self.prev_join_rep = self.prev_join_rep[:-overlap, :]

    #### ---- cluster self.train_unit_features
    n_trees = self.config.get('multiple_search_trees', 1)
    self.cluster(n_trees, limit=self.config.get('cluster_data_on_npoints', 10000))
    ### ^--- populates self.cluster_ixx

    combined_rep = np.hstack([self.prev_join_rep, self.train_unit_features])
    unit_indexes = np.arange(self.number_of_units - overlap)

    self.joint_trees = []
    self.node_maps = []
    for cluster_number in range(n_trees):
        t = self.start_clock('make joint join + target tree for cluster %s...'%(cluster_number))
        in_cluster = (self.cluster_ixx == cluster_number)
        tree = scipy.spatial.cKDTree(combined_rep[in_cluster, :], leafsize=100, balanced_tree=False, compact_nodes=False)
        self.joint_trees.append(tree)
        self.node_maps.append(unit_indexes[in_cluster])
        self.stop_clock(t)
def window_agg(self, progression_dim, window_size=1000, overlap=500,
               agg_method='median'):
    """
    Creates a sliding window that moves over the specified dimension and
    aggregates all values per window.

    Parameters
    ----------
    progression_dim :
        Entry of ``self.dims`` whose column orders the rows.
    window_size : int
        Number of rows per window (coerced with ``int``).
    overlap : int
        Number of rows shared by consecutive windows (coerced with ``int``).
    agg_method : str
        ``'median'`` or ``'average'``.

    Returns
    -------
    DataTable
        One aggregated row per window.

    Raises
    ------
    Exception
        If ``agg_method`` is not recognised.
    """
    # First try using bottleneck, ignoring overlap
    try:
        return self.window_agg_using_bottleneck(progression_dim, window_size,
                                                overlap, agg_method)
    except ImportError:
        pass

    if agg_method not in ('median', 'average'):
        raise Exception('Unknown agg method')

    window_size = int(window_size)
    overlap = int(overlap)

    # first sort by the given dim:
    xdim_index = self.dims.index(progression_dim)
    sorted_data = self.data[self.data[:, xdim_index].argsort(), ]

    # create windows:
    from segmentaxis import segment_axis
    with Timer('segment_axis'):
        seg_data = segment_axis(sorted_data, window_size, overlap, axis=0)

    # Aggregate in chunks of windows so that sorting the strided view does
    # not exhaust memory.  The chunk size must be an integer of at least 1:
    # a float cannot be used as a slice bound and 0 would never advance.
    values_per_chunk = 50 * 1000000
    values_per_window = max(1, seg_data.shape[1] * seg_data.shape[2])
    windows_per_iteration = max(1, values_per_chunk // values_per_window)

    chunks = []
    for start_win in range(0, seg_data.shape[0], windows_per_iteration):
        # Only this slice is aggregated; reducing all of seg_data on every
        # pass repeated the full result once per chunk.
        iteration_data = seg_data[start_win:start_win + windows_per_iteration]
        if agg_method == 'median':
            with Timer('Median over %d windows' % windows_per_iteration):
                chunks.append(np.median(iteration_data, axis=1))
        else:
            chunks.append(np.average(iteration_data, axis=1))
        # Progress output kept from the original implementation.
        print(start_win + windows_per_iteration)

    agg_data = np.concatenate(chunks) if chunks else None
    return DataTable(agg_data, self.dims, self.legends, self.tags.copy())
def match(p): x = np.clip(p, 1e-6, 1 - 1e-6) * scales + mins #print(' '.join(str(s) for s in x), end=' ') model = trm.TubeModel(trm.Parameters( file_format=0, sample_rate_hz=22050.0, control_rate_hz=25.0, volume_db=60.0, channels=1, balance=0.0, waveform=0, pulse_rise=x[0], pulse_fall_min=x[1], pulse_fall_max=x[2], breathiness=x[3], length_cm=x[4], temperature_degc=25.0, loss_factor=x[5], aperture_scale_cm=x[6], mouth_coeff_hz=x[7], nose_coeff_hz=x[8], nose_radii_cm=x[9:14], throat_lowpass_cutoff_hz=1500.0, throat_volume_db=6.0, modulation=1, noise_crossmix_offset_db=54.0)) if not model.is_ok(): return 1e200 T = int(np.ceil(seconds * 28)) frames = np.zeros((T, 16), float) timet = x[14] for i, (level0, levelt, levelT) in enumerate(np.split(x[15:], 16)): frames[:, i] = scipy.interpolate.interp1d( [0, timet, 1], [level0, levelt, levelT])(np.linspace(0, 1, T)) #print(x[:14]) #[print(' '.join(str(x) for x in f)) for f in frames] with timeout(1): x = np.array(model.synthesize(frames)) y = scipy.interpolate.interp1d( np.linspace(0, 1, len(x)), x / abs(x).max())( np.linspace(0, 1, int(len(x) * fs / 22050))) z = segmentaxis.segment_axis(y, width, int(width * overlap)) spec = abs(scipy.fftpack.fft(z * env)) err = np.linalg.norm(target - spec[:len(target), :1 + width // 2]) #print(err) return err
def greedy_joint_search(self, unit_features, start_state=-1, holdout=[]):
    """Greedily pick, for each target vector, the nearest unit in the joint tree.

    Each query concatenates the join vector of the previously chosen unit
    with the current target vector and looks up its nearest neighbour in
    ``self.joint_tree``.

    Parameters
    ----------
    unit_features : numpy.ndarray
        One target vector per row.  When the ``multiepoch`` config value
        exceeds 1, consecutive non-overlapping groups of ``multiepoch`` rows
        are stacked into one row first.
    start_state : int
        Row of ``self.prev_join_rep`` used as the initial join vector; a
        negative value starts from an all-zero vector.
    holdout : sequence
        Only its length is used, to widen the query to ``1 + len(holdout)``
        neighbours.  The nearest neighbour is still taken even if it is in
        ``holdout`` (the exclusion filter is disabled).  Never mutated.

    Returns
    -------
    list
        Index of the selected unit for every target vector.
    """
    assert self.config['target_representation'] == 'epoch'

    start_time = self.start_clock('Greedy search')
    path = []

    m, n = self.current_join_rep.shape
    if start_state < 0:
        prev_join_vector = np.zeros((n,))
    else:
        prev_join_vector = self.prev_join_rep[start_state, :]

    multiepoch = self.config.get('multiepoch', 1)
    if multiepoch > 1:
        ### reshape target rep:
        m, n = unit_features.shape
        unit_features = segment_axis(unit_features, multiepoch, overlap=0, axis=0)
        # Floor division: a float dimension is rejected by reshape.
        unit_features = unit_features.reshape(m // multiepoch, n * multiepoch)

        if self.config.get('last_frame_as_target', False):
            print('test -- take last frame only as target...')  ## TODO99
            unit_features = np.hstack([unit_features[:, :n], unit_features[:, -n:]])

    n_neighbours = 1 + len(holdout)
    search_epsilon = self.config.get('search_epsilon', 0.0)
    for target_vector in unit_features:
        both = np.concatenate([prev_join_vector, target_vector]).reshape((1, -1))
        dists, indexes = self.joint_tree.query(both, k=n_neighbours, eps=search_epsilon)
        # Results come back nearest first; take the best one directly rather
        # than subscripting zip(), which is an iterator on Python 3.
        ix = np.ravel(indexes)[0]
        path.append(ix)
        prev_join_vector = self.current_join_rep[ix, :]

    self.stop_clock(start_time)
    return path
def multi_index_patches(self, patch_length):
    """Index every run of ``patch_length`` consecutive cluster labels.

    For each column of ``self.cluster_ixx`` a dict is built that maps a
    tuple of ``patch_length`` consecutive labels to the list of row
    positions at which that run starts.  The dicts are stored, in column
    order, in ``self.patches``; ``self.patch_length`` records the length.

    Parameters
    ----------
    patch_length : int
        Number of consecutive labels per patch; must be at least 2.
    """
    assert patch_length >= 2
    self.patches = []
    self.patch_length = patch_length

    _, n_resolutions = self.cluster_ixx.shape
    # range() instead of xrange(): identical here and valid on Python 3.
    for i in range(n_resolutions):
        res_patches = {}
        # Overlap of patch_length - 1 advances the window one row at a time.
        data = segment_axis(self.cluster_ixx[:, i].flatten(), patch_length,
                            overlap=patch_length - 1, axis=0)
        for (ix, vals) in enumerate(data):
            res_patches.setdefault(tuple(vals.tolist()), []).append(ix)
        self.patches.append(res_patches)
def spectrum(pcm, nwin=512, nfft=512, fs=16000, stepr=0.5):
    """Compute the log-magnitude spectrum of a given audio snippet.

    Parameters
    ----------
    pcm : array-like
        The mono audio input.
    nwin : int
        Frame length in samples.
    nfft : int
        FFT size.
    fs : int
        Accepted for interface compatibility; not used in the computation.
    stepr : float
        The step ratio for audio segmentation: consecutive frames overlap by
        ``nwin * (1 - stepr)`` samples (truncated to an integer).

    Returns
    -------
    numpy.ndarray
        Array of shape (frequency bin, temporal bin) holding the first
        ``nfft // 2 + 1`` bins, with the global mean removed.
    """
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of
    # the radiation at the lips level)
    prefac = 0.98

    # The overlap is a sample count and must be an integer.
    overlap = int(nwin * (1 - stepr))
    window = hamming(nwin, sym=0)
    extract = _preemp(pcm, prefac)
    framed = segment_axis(extract, nwin, overlap) * window

    # Compute the spectrum magnitude
    spec = np.log10(np.abs(fft(framed, nfft, axis=-1)))
    # Floor division keeps the slice bound an integer on Python 3.
    spec = spec.T[0:nfft // 2 + 1]

    # Make it zero-mean, so the start-up transients for the filter are
    # minimized
    spec = spec - np.ma.mean(spec)
    return spec
def main(width, overlap, samplerate, root, *audio):
    """Window each mono WAV file and save the frames as a float32 ``.npy`` file.

    Parameters
    ----------
    width : int
        Frame length in samples; a falsy value selects 512.
    overlap : float
        Fraction of a frame shared with the next one; a falsy value
        selects 0.75.
    samplerate : int
        Required sample rate of every input; a falsy value selects 16000.
    root : str
        Directory the output files are written to.
    *audio : str
        Paths of the WAV files to process.  Each output is named after the
        input's basename with ``.wav`` replaced by ``-wave.npy``.

    Raises
    ------
    AssertionError
        If a file has a different sample rate or is not one-dimensional.
    """
    if not samplerate:
        samplerate = 16000
    if not width:
        width = 512
    if not overlap:
        overlap = 0.75

    env = np.hanning(width)[None, :].astype('f')
    overlap_samples = int(width * overlap)
    for f in audio:
        rate, samples = scipy.io.wavfile.read(f)
        logging.info('%s: %d samples at %.1fkHz',
                     os.path.basename(f), len(samples), rate / 1000)
        assert rate == samplerate
        assert len(samples.shape) == 1
        # Subtract out of place: integer PCM data cannot hold the float
        # result of an in-place mean removal.
        samples = samples - samples.mean()
        X = segmentaxis.segment_axis(samples, width, overlap_samples) * env
        s = os.path.join(root, os.path.basename(f).replace('.wav', '-wave.npy'))
        logging.info('saving %s: %s', s, X.shape)
        np.save(s, X.astype('f'))
def index_patches(self, max_patch_length):
    """Index every run of 1..``max_patch_length`` consecutive cluster labels.

    ``self.patches`` is rebuilt as a single dict mapping a tuple of
    consecutive labels from ``self.cluster_ixx`` to the list of positions at
    which that run starts.  Runs of length 1 are keyed by a 1-tuple of the
    label itself; longer runs come from ``segment_axis`` with a step of one.
    """
    self.patches = {}
    for length in range(1, max_patch_length + 1):
        if length > 1:
            windows = segment_axis(self.cluster_ixx, length,
                                   overlap=length - 1, axis=0)
            keyed = ((tuple(vals.tolist()), ix)
                     for (ix, vals) in enumerate(windows))
        else:
            keyed = (((val, ), ix)
                     for (ix, val) in enumerate(self.cluster_ixx))
        for key, ix in keyed:
            self.patches.setdefault(key, []).append(ix)
def get_tree_for_greedy_search(self):
    """Build ``self.joint_tree``, a single KD-tree over join + target features.

    ``self.prev_join_rep`` / ``self.current_join_rep`` are set from the unit
    start / end data.  When the ``multiepoch`` config value exceeds 1, the
    target features are stacked over ``multiepoch`` consecutive rows and the
    join representations are trimmed to the same number of rows.  The tree
    is rebuilt from scratch on every call.
    """
    self.prev_join_rep = self.unit_start_data    ## !osw
    self.current_join_rep = self.unit_end_data   ## !osw

    multiepoch = self.config.get('multiepoch', 1)
    if multiepoch > 1:
        timer = self.start_clock('reshape data for multiepoch...')
        overlap = multiepoch - 1

        ### reshape target rep:
        n_rows, n_cols = self.train_unit_features.shape
        self.train_unit_features = segment_axis(self.train_unit_features, multiepoch, overlap=overlap, axis=0)
        self.train_unit_features = self.train_unit_features.reshape(n_rows - overlap, n_cols * multiepoch)

        if self.config.get('last_frame_as_target', False):  ### !TODO: keep this option?
            print('test -- take last frame only as target...')
            # Keep only the first and the last of the stacked rows.
            first_part = self.train_unit_features[:, :n_cols]
            last_part = self.train_unit_features[:, -n_cols:]
            self.train_unit_features = np.hstack([first_part, last_part])

        ### alter join reps: -- first tried taking first vs. last
        ### !TODO: revisit this?  (alternative: compare whole stacked reps)
        self.current_join_rep = self.current_join_rep[overlap:, :]
        self.prev_join_rep = self.prev_join_rep[:-overlap, :]
        self.stop_clock(timer)

    timer = self.start_clock('stack data to train joint tree...')
    combined_rep = np.hstack([self.prev_join_rep, self.train_unit_features])
    self.stop_clock(timer)

    ### For now, build each time from scratch -- compare resurrection time
    ### with rebuild time.
    timer = self.start_clock('make joint join + target tree...')
    self.joint_tree = scipy.spatial.cKDTree(combined_rep, leafsize=100, balanced_tree=False)
    self.stop_clock(timer)
def get_phone_seq(self, subset, frame_length, overlap, id, shuffling = True):
    """
    Return the framed waveform of one phone occurrence with its label and
    speaker information.

    ``id`` is a global phone index; it is located within a sequence through
    ``self.phone_to_seq_intervals`` and then mapped to a waveform span via
    the subset's ``seq_to_phones``, ``phones_intervals`` and ``intervals``
    tables.

    Returns ``[wav, phone, spkr_info]`` where ``wav`` has been cut into
    frames of ``frame_length`` samples with the given ``overlap``.
    """
    self.init_phones_iter(subset, shuffling)
    assert id < self.phone_to_seq_intervals[-1]

    # Locate the sequence holding this phone and the phone's rank in it.
    seq_id = np.digitize([id], self.phone_to_seq_intervals)[0] - 1
    id_in_seq = id - self.phone_to_seq_intervals[seq_id]
    if self.shuffle_seq:
        seq_id = self.invert_shuffling[seq_id]

    tables = self.__dict__[subset]
    phone_row = tables["seq_to_phones"][seq_id, 0] + id_in_seq

    # Phone boundaries are stored relative to the start of their sequence.
    wav_start = tables["intervals"][seq_id] + tables["phones_intervals"][phone_row, 0]
    wav_end = tables["intervals"][seq_id] + tables["phones_intervals"][phone_row, 1]
    wav = tables["wav"][wav_start:wav_end]

    phone = tables["phones_intervals"][phone_row, 2]
    spkr_info = self.spkrinfo[tables["speaker_id"][seq_id]]

    # Segment into frames
    return [segment_axis(wav, frame_length, overlap), phone, spkr_info]
def multi_patch_over(self, cb_path):
    """Translate a codebook path into database indexes, patch by patch.

    ``cb_path`` is cut into non-overlapping chunks of ``self.patch_length``
    rows.  For each chunk the columns are tried in order; the first column
    whose label run is a key of ``self.patches[column]`` contributes the
    ``patch_length`` consecutive indexes starting at the first recorded
    position.  The process exits via ``sys.exit`` if no column of a chunk
    matches.
    """
    ## gives: npatch x patchlength + nres
    chunks = segment_axis(cb_path, self.patch_length, overlap=0, axis=0)

    db_path = []
    for chunk in chunks:
        for (res, res_patch) in enumerate(chunk.transpose()):
            key = tuple(res_patch.tolist())
            if key not in self.patches[res]:
                continue
            start = self.patches[res][key][0]  # take first!
            db_path.extend(range(start, start + self.patch_length))
            print('res: %s' % (res))
            break
        else:
            # No column matched this chunk.
            sys.exit('need back off strategy!')
    return db_path
def get_epoch_position_features(pms, rate, nsamples, seconds2samples=True, zero_uv_GCP=False):
    """Compute per-sample position features within pitch-mark delimited epochs.

    Parameters
    ----------
    pms : numpy.ndarray
        Pitch marks, in seconds if ``seconds2samples`` is true, otherwise
        already in samples.
    rate : number
        Multiplier applied to ``pms`` when converting to samples.
    nsamples : int
        Length of the waveform; the outputs have this many rows.
    seconds2samples : bool
        Whether to convert ``pms`` to rounded integer sample numbers.
    zero_uv_GCP : bool
        Not implemented; a true value terminates via ``sys.exit``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(forwards, backwards, norm_forwards)``, each of shape
        ``(nsamples, 1)``: samples since the epoch start, samples until the
        epoch end, and the forward count divided by the epoch length.
    """
    if seconds2samples:
        ## Convert seconds -> waveform sample numbers:-
        pms = np.asarray(np.round(pms * rate), dtype=int)

    ## make sure length compatible with the waveform:--
    last = len(pms) - 1
    while pms[last] > nsamples:
        last -= 1
    # NOTE(review): this slice also excludes the mark at index `last` --
    # confirm that dropping it is intended.
    pms = pms[:last]
    if nsamples > pms[-1]:
        pms = np.concatenate([pms, np.array([nsamples])])

    ## add first 0
    pms = np.concatenate([np.array([0]), pms])

    # Epoch lengths are the gaps between consecutive boundaries.
    lengths = np.diff(pms)
    ramps = [np.arange(length) for length in lengths]

    forwards = np.concatenate(ramps).reshape((nsamples, 1))
    backwards = np.concatenate([np.flipud(ramp) for ramp in ramps]).reshape((nsamples, 1))
    norm_forwards = np.concatenate(
        [ramp / float(length) for (ramp, length) in zip(ramps, lengths)]
    ).reshape((nsamples, 1))

    if zero_uv_GCP:
        sys.exit('not implemented : zero_uv_GCP')
    return (forwards, backwards, norm_forwards)
def replace_with_frames(self):
    """Replace every entry of ``self.raw_wav`` by its framed version.

    Each entry is cut with ``segment_axis`` using ``self.frame_length`` and
    ``self.overlap`` and written back at the same position.
    """
    for position, waveform in enumerate(self.raw_wav):
        self.raw_wav[position] = segment_axis(waveform,
                                              length=self.frame_length,
                                              overlap=self.overlap)
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             rng=_default_seed):
    """
    Load a subset, standardize and frame every sequence, and build the
    data specs and per-source lookup functions.

    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive frames.
        Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`, meaning
        sequences are selected all the way to the end of the array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary information.
        Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices into the
        design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    # Number of samples by which consecutive frames are shifted.
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only

    # RNG initialization: use `rng` directly if it exposes random_integers,
    # otherwise treat it as a seed.
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)

    # Standardize data with the class-level mean and standard deviation
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    # Slice data: keep sequences [start:stop] (or [start:] without a stop);
    # the label arrays are sliced identically so they stay aligned.
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]

    # Leading 0 so that the cumulative sum below gives each sequence's
    # first example index.
    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation (same framing as the audio)
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(phones_sequence,
                                                     frame_length,
                                                     overlap)
            self.phones[sequence_id] = phones_segmented_sequence
            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                       frame_length,
                                                       overlap)
            self.phonemes[sequence_id] = phonemes_segmented_sequence
            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(words_sequence,
                                                    frame_length,
                                                    overlap)
            self.words[sequence_id] = words_segmented_sequence

        # TODO: look at this, does it force copying the data?
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  overlap)
        self.raw_wav[sequence_id] = samples_segmented_sequence

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map.
        # An example uses frames_per_example input frames plus the frame
        # that follows them as target, hence the subtraction.
        num_frames = samples_segmented_sequence.shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.frame_length * self.frames_per_example
    )
    features_source = 'features'
    features_dtype = self.samples_sequences[0].dtype

    def features_map_fn(indexes):
        # frames_per_example consecutive frames, flattened to one vector.
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][example_index:example_index
                + self.frames_per_example].ravel())
        return rval

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'
    targets_dtype = self.samples_sequences[0].dtype

    def targets_map_fn(indexes):
        # The single frame right after the input frames.
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][example_index
                + self.frames_per_example].ravel())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    dtypes_components = [features_dtype, targets_dtype]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSpace(max_labels=61, dim=1)
        phones_source = 'phones'
        phones_dtype = self.phones_sequences[0].dtype

        def phones_map_fn(indexes):
            # Most frequent label within the target frame.
            # NOTE(review): the phonemes/words maps below return the whole
            # raveled frame instead of its mode -- confirm which is intended.
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(scipy.stats.mode(self.phones_sequences[sequence_index][example_index
                    + self.frames_per_example])[0])
            return rval

        phonemes_space = IndexSpace(max_labels=31, dim=1)
        phonemes_source = 'phonemes'
        phonemes_dtype = self.phonemes_sequences[0].dtype

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.phonemes_sequences[sequence_index][example_index
                    + self.frames_per_example].ravel())
            return rval

        # NOTE(review): max_labels matches the phonemes space (31) --
        # confirm this is the right vocabulary size for words.
        words_space = IndexSpace(max_labels=31, dim=1)
        words_source = 'words'
        words_dtype = self.words_sequences[0].dtype

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.words_sequences[sequence_index][example_index
                    + self.frames_per_example].ravel())
            return rval

        space_components.extend([phones_space, phonemes_space,
                                 words_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source])
        dtypes_components.extend([phones_dtype, phonemes_dtype,
                                  words_dtype])
        map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                  words_map_fn])
        batch_components.extend([None, None, None])

    # The component lists are parallel: entry k of each list describes
    # source k.
    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.dtypes = tuple(dtypes_components)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators: features and targets only.
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
def get_word_seq(self, subset, frame_length, overlap, id, shuffling = True):
    """
    Return the framed waveform of one word occurrence with its frame-level
    labels.

    ``id`` is a global word index; it is located within a sequence through
    ``self.word_to_seq_intervals`` and then mapped to a waveform span via the
    subset's ``seq_to_words``, ``words_intervals`` and ``intervals`` tables.

    Returns ``[wav, phones, phonemes, end_phn, word, spkr_info]``: ``wav`` is
    cut into frames of ``frame_length`` samples with the given ``overlap``,
    ``phones`` / ``phonemes`` hold the most frequent label of each frame, and
    ``end_phn`` is 1 on the last frame of every run of equal phones.
    """
    self.init_words_iter(subset, shuffling)
    assert id < self.word_to_seq_intervals[-1]

    # Locate the sequence holding this word and the word's rank in it.
    seq_id = np.digitize([id], self.word_to_seq_intervals)[0] - 1
    id_in_seq = id - self.word_to_seq_intervals[seq_id]
    if self.shuffle_seq:
        seq_id = self.invert_shuffling[seq_id]

    tables = self.__dict__[subset]
    word_row = tables["seq_to_words"][seq_id, 0] + id_in_seq

    # Word boundaries are stored relative to the start of their sequence.
    wav_start = tables["intervals"][seq_id] + tables["words_intervals"][word_row, 0]
    wav_end = tables["intervals"][seq_id] + tables["words_intervals"][word_row, 1]

    wav = tables["wav"][wav_start:wav_end]
    phones = tables["phones"][wav_start:wav_end]
    phonemes = tables["phonemes"][wav_start:wav_end]
    word = tables["words_intervals"][word_row, 2]
    spkr_info = self.spkrinfo[tables["speaker_id"][seq_id]]

    # Segment into frames
    wav = segment_axis(wav, frame_length, overlap)

    # Take the most occurring phone in each frame
    phones = segment_axis(phones, frame_length, overlap)
    phones = np.asarray(scipy.stats.mode(phones, axis=1)[0].flatten(), dtype='int')

    # Take the most occurring phoneme in each frame
    phonemes = segment_axis(phonemes, frame_length, overlap)
    phonemes = np.asarray(scipy.stats.mode(phonemes, axis=1)[0].flatten(), dtype='int')

    # Binary variable announcing the end of a phone: set where the next
    # frame carries a different phone, and always on the final frame.
    end_phn = np.zeros_like(phones)
    end_phn[:-1] = phones[:-1] != phones[1:]
    end_phn[-1] = 1

    return [wav, phones, phonemes, end_phn, word, spkr_info]
def get_fixed_size_seq(self, subset, n_frames, frame_length, overlap, ids, \
                       shuffling = True, wav_only = False):
    """
    Return a minibatch of fixed-size frame sequences.

    Parameters
    ----------
    subset : str
        Key of the subset's table dict in ``self.__dict__``.
    n_frames : int
        Number of frames per returned sequence.
    frame_length : int
        Number of samples per frame.
    overlap : int
        Number of samples shared by consecutive frames.
    ids : int or sequence of int
        Start position(s) to extract; a scalar is treated as a batch of one.
    shuffling : bool
        Forwarded to ``init_frames_iter``.
    wav_only : bool
        If true, only the raw (unframed) waveform block is returned.

    Returns
    -------
    list
        ``[wav]`` when ``wav_only`` is true, where ``wav`` has shape
        ``(n_ids, self.wav_length_required)``.  Otherwise
        ``[wav, phones, phonemes, end_phn, words, end_wrd, spkr_info]``
        where ``wav`` has shape ``(n_ids, n_frames, frame_length)``, the
        label arrays hold the most frequent label of each frame, and the
        ``end_*`` arrays are 1 where the next frame's label differs.
    """
    self.init_frames_iter(subset, n_frames, frame_length, overlap, \
                          shuffling)

    # Accept a scalar or any array-like.  collections.Iterable no longer
    # exists on Python 3.10+, so normalise with numpy instead.
    ids = np.atleast_1d(np.asarray(ids))
    assert np.all(ids < self.frame_seq_intervals[-1])

    # Get the sequence
    seq_ids = np.digitize(ids, self.frame_seq_intervals) - 1
    if self.shuffle_seq:
        seq_ids = self.invert_shuffling[seq_ids]

    tables = self.__dict__[subset]
    idx_in_seq = ids - tables["intervals"][seq_ids]
    wav_start = tables["intervals"][seq_ids] + idx_in_seq

    n_ids = ids.shape[0]
    span = self.wav_length_required

    def gather(key):
        # One row of `span` consecutive values per requested start index.
        rows = np.zeros((n_ids, span))
        for i, idx in enumerate(wav_start):
            rows[i] = tables[key][idx:(idx + span)]
        return rows

    wav = gather("wav")
    if (n_ids * span > 100000):
        print("Waveforms loaded.")

    if wav_only:
        return [wav]

    def frame_mode(labels):
        # Most frequent label in every frame.
        framed = segment_axis(labels, frame_length, overlap, axis=1)
        modes = scipy.stats.mode(framed, axis=2)[0].reshape(n_ids, n_frames)
        return np.asarray(modes, dtype='int')

    # Get the phones, phonemes and words over the same sample spans as the
    # audio.  These buffers were previously left as all zeros, so every
    # returned label and end marker was 0.
    phones = frame_mode(gather("phones"))
    phonemes = frame_mode(gather("phonemes"))
    words = frame_mode(gather("words"))

    # Find the speaker id
    spkr_id = tables["speaker_id"][seq_ids]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    # Segment into frames: shape (n_ids, n_frames, frame_length)
    wav = segment_axis(wav, frame_length, overlap, axis=1)

    # Binary variable announcing the end of the word or phoneme
    end_phn = np.zeros_like(phones)
    end_wrd = np.zeros_like(words)
    end_phn[:, :-1] = np.where(phones[:, :-1] != phones[:, 1:], 1, 0)
    end_wrd[:, :-1] = np.where(words[:, :-1] != words[:, 1:], 1, 0)

    return [wav, phones, phonemes, end_phn, words, end_wrd, spkr_info]
def get_raw_seq(self, subset, seq_id, frame_length, overlap,
                shuffling=True):
    """
    Return one complete sequence of a subset, cut into frames.

    The result is the list
    ``[wav, phones, phonemes, end_phn, words, end_wrd, spkr_info]``:
    the framed waveform, the per-frame majority phone / phoneme / word
    labels, binary markers set on the last frame before a phone or word
    change (and always on the final frame), and the speaker information
    vector.

    Raises ValueError when ``seq_id`` is not smaller than the number of
    sequences stored for ``subset``.
    """
    self.check_subset_value(subset)
    self.check_subset_presence(subset)

    data = self.__dict__[subset]

    # Check if the id is valid
    if seq_id >= data["n_seq"]:
        raise ValueError("This sequence does not exist.")

    # Map through the shuffling table when requested
    if shuffling:
        seq_id = self.shuffling[seq_id]

    # Sample range covered by the sequence
    boundaries = data["intervals"]
    span = slice(boundaries[seq_id], boundaries[seq_id + 1])

    raw_wav = data["wav"][span]
    raw_phones = data["phones"][span]
    raw_phonemes = data["phonemes"][span]
    raw_words = data["words"][span]

    # Speaker id, then the matching information vector
    spkr_info = self.spkrinfo[data["speaker_id"][seq_id]]

    def majority_per_frame(labels):
        # Frame the label track and keep the most frequent label per frame
        framed = segment_axis(labels, frame_length, overlap)
        winners = scipy.stats.mode(framed, axis=1)[0].flatten()
        return np.asarray(winners, dtype='int')

    wav = segment_axis(raw_wav, frame_length, overlap)
    phones = majority_per_frame(raw_phones)
    phonemes = majority_per_frame(raw_phonemes)
    words = majority_per_frame(raw_words)

    # Mark every frame whose label differs from the next frame's label;
    # the final frame always closes the current phone and word.
    end_phn = np.zeros_like(phones)
    end_wrd = np.zeros_like(words)
    end_phn[:-1] = phones[:-1] != phones[1:]
    end_wrd[:-1] = words[:-1] != words[1:]
    end_phn[-1] = 1
    end_wrd[-1] = 1

    return [wav, phones, phonemes, end_phn, words, end_wrd, spkr_info]
def get_path_information(self, target_features, path, waveform):
    """
    Debugging aid: print diagnostics about a selected unit path.

    Prints the shapes of the intermediate arrays, the nearest-neighbour
    distances returned by ``self.joint_tree.query``, the contribution of
    the waveform history and of every target stream to the squared
    distance, and a 0/1 vector of join breaks along ``path``.

    NOTE(review): the unconditional ``sys.exit('sesrbsfrb')`` after the
    join-break printout terminates the interpreter, so the density
    computation and the plots that follow it (the only place ``waveform``
    is used) never run. It looks like a leftover debugging stop -- confirm
    before removing it.
    """
    history_len = self.config['wave_context_length']
    generated = self.nextsample[path, :]
    print(generated.shape)
    print('====')

    # Prepend `history_len` zero samples so the first generated sample
    # also gets a full-length history
    padded = np.concatenate([np.zeros((history_len, 1)), generated])
    print(padded.shape)

    # Windows of history_len + 1 samples: history followed by next sample
    wavefrags = segment_axis(padded.flatten(), history_len + 1,
                             overlap=history_len, axis=0)
    join_features = wavefrags[:, :-1]
    nextsamples = wavefrags[:, -1]
    print(join_features.shape)
    print(target_features.shape)

    combined_features = np.hstack([join_features, target_features])
    dists, samples = self.joint_tree.query(combined_features, k=1, eps=2)
    print(samples.shape)
    print(dists.shape)
    print(dists)

    selected = self.train_unit_features[samples, :]
    residual = combined_features - selected
    dists2 = np.sqrt((residual ** 2).sum(axis=1))

    ### stream contributions...
    raw_dists = (combined_features - selected) ** 2
    print('history')
    print(np.sqrt(raw_dists[:, :history_len].sum()))

    # Target streams follow the history columns, one after another
    offset = history_len
    for stream in self.stream_list_target:
        stop = offset + self.datadims_target[stream]
        stream_contrib = np.sqrt(raw_dists[:, offset:stop].sum())
        print(stream)
        print(stream_contrib)
        offset = stop

    ### natural joins:
    # For each pair (a, b) of path entries, b - (a + 1) is zero exactly
    # when b directly follows a; presumably segment_axis(path, 2, 1)
    # yields consecutive entries -- verify against segment_axis.
    pairs = copy.copy(segment_axis(path, 2, 1, axis=0))
    pairs[:, 0] += 1
    pairs[:, 0] *= -1
    jump = pairs.sum(axis=1)
    breaks = np.array(jump != 0, dtype=int)
    # The first unit has no predecessor and is counted as a break
    breaks = np.concatenate([np.ones(1), breaks])
    print(breaks)

    sys.exit('sesrbsfrb')

    # ---- unreachable while the sys.exit above is in place ----
    # pylab.plot(dists)
    # pylab.plot(dists2)
    # pylab.show()

    ### density:
    distance_thresh = dists.mean() * 2

    ## 1) how many points within twice average distance from targets?
    near_targets = self.joint_tree.query_ball_point(
        combined_features, distance_thresh, eps=2)
    n_neighbours_target = [len(thing) for thing in near_targets]

    ## 2) how many points within twice average distance from selected things?
    near_selected = self.joint_tree.query_ball_point(
        selected, distance_thresh, eps=2)
    n_neighbours_selected = [len(thing) for thing in near_selected]

    #print path
    pylab.subplot(411)
    pylab.plot(dists)

    pylab.subplot(412)
    pylab.plot(n_neighbours_target)
    pylab.plot(n_neighbours_selected)

    pylab.subplot(413)
    pylab.plot(breaks)

    pylab.subplot(414)
    pylab.plot(waveform)

    pylab.show()
def replace_with_frames(self):
    """
    Replace every entry of ``self.raw_wav``, in place, by the result of
    ``segment_axis`` applied to it with ``self.frame_length`` and
    ``self.overlap``.
    """
    for index, signal in enumerate(self.raw_wav):
        self.raw_wav[index] = segment_axis(signal,
                                           length=self.frame_length,
                                           overlap=self.overlap)
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13, over=170):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed
    nwin: int
        length of the analysis window (frame), in samples
    nfft: int
        FFT size used for the spectrum
    fs: int
        sampling rate, forwarded to the filter-bank construction
    nceps: int
        number of cepstral coefficients kept per frame
    over: int
        overlap between consecutive frames, in samples, forwarded to
        segment_axis. Defaults to 170, the value that used to be
        hard-coded. Presumably has to be smaller than nwin -- verify
        against segment_axis.

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients
    mspec: ndarray
        Log-spectrum in the mel-domain.
    spec: ndarray
        Magnitude spectrum of the windowed frames.

    Notes
    -----
    MFCC are computed as follows:
        * Pre-processing in time-domain (pre-emphasizing)
        * Compute the spectrum amplitude by windowing with a Hamming window
        * Filter the signal in the spectral domain with a triangular
          filter-bank, whose filters are approximately linearly spaced on
          the mel scale, and have equal bandwidth in the mel scale
        * Compute the DCT of the log-spectrum

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    # MFCC parameters: taken from auditory toolbox

    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97

    # Filter-bank layout: 13 linearly spaced filters followed by 27
    # log-spaced ones.
    #lowfreq = 400 / 3.
    lowfreq = 133.33
    #highfreq = 6855.4976
    linsc = 200/3.
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27

    w = hamming(nwin, sym=0)

    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    #------------------
    # Compute the MFCC
    #------------------
    extract = preemp(input, prefac)
    framed = segment_axis(extract, nwin, over) * w

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))
    # Filter the spectrum through the triangle filterbank; the clip keeps
    # log10 away from zero
    mspec = np.log10(np.clip(np.dot(spec, fbank.T), 1e-9, np.inf))
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps]

    return ceps, mspec, spec
def _write_chunked_dataset(h5file, name, arrays, chunksize, overlap):
    """Concatenate *arrays*, cut them into chunks and store them as *name*."""
    print('concatenate and reshape...')
    data = np.concatenate(arrays)
    data = segment_axis(data, chunksize, overlap=overlap, end='cut', axis=0)

    print('Write to HDF...')
    dset = h5file.create_dataset(name, data.shape, dtype=data.dtype,
                                 track_times=False)
    dset[:, :] = data
    print('Done')
    print('')


def main_work():
    """
    Pack waveform and excitation files into one chunked HDF5 file.

    Command line options:
        -wav            directory holding the speech *.wav files (required)
        -exc            directory holding excitation files with the same
                        base names (required)
        -testpattern    bases containing this substring are held out
        -trainpattern   if given, only bases containing it are kept
        -chunksize      chunk length in samples (default 2000)
        -overlap        overlap between chunks in samples (default 100)
        -code           also store a per-sample speaker code; the speaker is
                        the second to last '_'-separated field of the base
        -o              output directory (required, created if missing)

    The output file ``data_c<chunksize>_o<overlap>.hdf`` holds the datasets
    'wave' and 'excitation' and, with -code, 'speaker_code'.

    Raises ValueError when no file is left after filtering.
    """
    #################################################

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-wav', dest='wavdir', required=True)
    a.add_argument('-exc', dest='excdir', required=True)
    a.add_argument('-testpattern', default='')
    a.add_argument('-trainpattern', default='')
    a.add_argument('-chunksize', type=int, default=2000)
    a.add_argument('-overlap', type=int, default=100)
    a.add_argument('-code', action='store_true', default=False)
    a.add_argument('-o', dest='outdir', required=True)

    opts = a.parse_args()

    # ===============================================

    if not os.path.isdir(opts.outdir):
        os.makedirs(opts.outdir)

    ### TODO: don't have everything in memory!
    flist = sorted(glob.glob(opts.wavdir + '/*.wav'))
    bases = [get_basename(fname) for fname in flist]

    if opts.code:
        ## sort by speaker first
        codes = [base.split('_')[-2] for base in bases]
        bases = [base for (code, base) in sorted(zip(codes, bases))]
        # Speaker label -> integer index, in sorted label order
        speaker_map = dict((code, index)
                           for (index, code) in enumerate(sorted(set(codes))))

    if opts.testpattern:
        trainbases = [base for base in bases if opts.testpattern not in base]
        # Count before overwriting `bases`, otherwise the difference is
        # always zero.
        n_held_out = len(bases) - len(trainbases)
        bases = trainbases
        print('%s files matching %s held out for testing ' % (
            n_held_out, opts.testpattern))
    else:
        print('no test pattern supplied -- no files held out')

    if opts.trainpattern:
        bases = [base for base in bases if opts.trainpattern in base]

    # Keep only files that also have an excitation counterpart
    bases = [
        base for base in bases
        if os.path.isfile(os.path.join(opts.excdir, base + '.wav'))
    ]

    if not bases:
        # Fail before creating the output file; np.concatenate would
        # otherwise raise a less helpful ValueError later on.
        raise ValueError('no wave files left to process in %s' % (opts.wavdir))

    sample_rate = None  ## will be set when first wave is opened, and others checked for consistency

    condition_name = 'data_c%s_o%s.hdf' % (opts.chunksize, opts.overlap)
    outfile = os.path.join(opts.outdir, condition_name)

    todo_list = [(opts.wavdir, 'wave'), (opts.excdir, 'excitation')]

    # The context manager closes the file even when reading or writing fails
    with h5py.File(outfile, 'w') as f:

        for (datadir, name) in todo_list:
            wavedata = []
            print('Reading from %s...' % (datadir))
            for base in tqdm(bases):
                fname = os.path.join(datadir, base + '.wav')
                wave, fs = soundfile.read(
                    fname,
                    dtype='int16')  ## TODO: check wave read/load @343948
                if sample_rate is None:
                    sample_rate = fs
                else:
                    assert fs == sample_rate, \
                        'sample rate of %s is %s, expected %s' % (
                            fname, fs, sample_rate)
                wavedata.append(wave)

            _write_chunked_dataset(f, name, wavedata,
                                   opts.chunksize, opts.overlap)

        if opts.code:
            wavedata = []
            print('Adding codes...')
            datadir = opts.wavdir
            for base in tqdm(bases):
                fname = os.path.join(datadir, base + '.wav')
                # The wave is re-read only for its shape and dtype
                wave, fs = soundfile.read(
                    fname,
                    dtype='int16')  ## TODO: check wave read/load @343948
                speaker_id = base.split('_')[-2]
                wavedata.append(np.full(wave.shape, speaker_map[speaker_id],
                                        dtype=wave.dtype))

            _write_chunked_dataset(f, 'speaker_code', wavedata,
                                   opts.chunksize, opts.overlap)

            print(speaker_map)
            print('')

    print('Wrote ' + outfile)