Example #1
    def read_pm(self, fname):

        with open(fname, 'r') as f:
            lines = f.readlines()

        for (i,line) in enumerate(lines):
            if line.startswith('EST_Header_End'):
                start = i+1
                break
        lines = lines[start:]
        lines = [float(re.split(r'\s+', line)[0]) for line in lines]

        lines = np.array(lines)

        ## debug: make sure monotonic increase
        start_end = segment_axis(lines, 2, overlap=1)
        diffs = start_end[:,1] - start_end[:,0]
        neg_diffs = (diffs < 0.0)

        if sum(neg_diffs) > 0:
            print ('WARNING: pitch marks not monotonically increasing in %s'%(fname))
            # return np.ones((1,1))
            lines = lines[:-1]

            start_end = segment_axis(lines, 2, overlap=1)
            diffs = start_end[:,1] - start_end[:,0]
            neg_diffs = (diffs < 0.0)

            if sum(neg_diffs) > 0:
                print ('WARNING: pitch marks not monotonically increasing in %s'%(fname))

        return lines
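As used above, segment_axis(x, 2, overlap=1) turns a 1-D array into consecutive (current, next) pairs, so a negative pair difference flags an out-of-order pitch mark. A minimal sketch, assuming the usual segment_axis(a, length, overlap=0, axis=-1, end='cut') signature and the segmentaxis module imported in Example #4:

import numpy as np
from segmentaxis import segment_axis  # module path as in Example #4

pm = np.array([0.010, 0.020, 0.035, 0.030, 0.050])  # one mark out of order
pairs = segment_axis(pm, 2, overlap=1)
# pairs: [[0.010 0.020] [0.020 0.035] [0.035 0.030] [0.030 0.050]]
diffs = pairs[:, 1] - pairs[:, 0]
print((diffs < 0.0).sum())  # 1 -> pitch marks not monotonically increasing

np.diff(pm) would give the same differences directly; the segment_axis form simply matches the framing idiom used throughout these examples.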
Example #2
    def get_file_data_from_one_file(self):
        '''
        Here is most of the database specific stuff.
        This one should be provided by subclasses to manipulate data appropriately.
        '''
        #print '----> get_file_data_from_one_file (%s)'%(self.operation)
        (wave_file, pitch_mark_file) = self.filelist[self.file_index]
        pitchmarks = read_pm(pitch_mark_file) * 48000
        if pitchmarks.size == 1:
            return (np.zeros((0, 0)), np.zeros((0, 0)))
        pitchmarks = np.array(pitchmarks, dtype='int')
        pitchmark_triples = segment_axis(pitchmarks, 3, 2, axis=0)
        wave, sample_rate = read_wave(wave_file)

        pad_length = 1000
        frags = [
            get_pm_wavefrag(pm_trip, wave, pad_length)
            for pm_trip in pitchmark_triples
        ]
        frags = np.vstack(frags)

        ### do 0-1 range normalisation, assume 16-bit signed audio:
        data_range = math.pow(2, 16)
        frags -= (data_range / 2) * -1.0  ## i.e. shift up by 32768 into [0, data_range)
        frags /= data_range

        return (frags, frags)
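The two in-place operations above shift signed 16-bit samples from [-32768, 32767] into roughly [0, 1): subtracting (data_range / 2) * -1.0 adds 32768, and dividing by 2**16 rescales. A small worked check with illustrative values:

import numpy as np

data_range = 2.0 ** 16
frags = np.array([-32768.0, 0.0, 32767.0])
frags -= (data_range / 2) * -1.0   # i.e. frags += 32768
frags /= data_range
print(frags)   # [0.  0.5  0.99998474]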
Example #3
 def window_signal(self):
     """
     Takes the signal field, divides it into frames and creates a window with the frame and two adjacent frames to create a window.  
     Pads the end with zeros if the array size is not properly divisible.
     Does NOT modify original signal.
     """
     samples_per_frame = self.get_frame_size() * self.get_fs()
     window_length = samples_per_frame * 3
     return segment_axis(self.get_signal(), window_length, samples_per_frame, end="pad") 
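Note that the call passes samples_per_frame as the overlap, so consecutive three-frame windows share one frame and advance by two. A toy sketch with two samples per frame, assuming the usual segment_axis semantics:

import numpy as np
from segmentaxis import segment_axis  # module path assumed

signal = np.arange(10)
samples_per_frame = 2
window_length = samples_per_frame * 3  # three frames per window
print(segment_axis(signal, window_length, samples_per_frame, end="pad"))
# [[0 1 2 3 4 5]
#  [4 5 6 7 8 9]
#  [8 9 0 0 0 0]]   <- last window zero-padded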
Example #4
 def cache(data):
   # NB: apparently a closure -- self, progression_dim, window_size and
   # overlap come from the enclosing scope (compare window_agg in Example #8).
   # first sort by the given dim:
   xdim_index = self.dims.index(progression_dim)
   sorted_data = self.data[self.data[:,xdim_index].argsort(),]
   # create windows:
   from segmentaxis import segment_axis
   seg_data = segment_axis(sorted_data, window_size, overlap, axis=0)
   med_data = np.median(seg_data, axis=1)   
   data.table = DataTable(med_data, self.dims)
Example #5
    def __init__(self, 
                 which_set='train',
                 seq_len=3010,
                 frame_size=320,
                 hop_size=32,
                 axes=('b', 0, 1, 'c'),
                 noutput_frames=1,
                 ninput_frames=9,
                 overlap=False, # not implemented yet
                 preprocessing=True):

        self.__dict__.update(locals())
        del self.self

        # cut sequences down to seq_len
        dat = np.load('/data/lisa_ubi/speech/onomatopoeia/dataset/per_phone_timit/wav_aa.npy')
        lengths = [len(i) for i in dat]
        daat = A([A(dat[i][:seq_len]) for i in range(len(dat)) if lengths[i] > seq_len])  # A: presumably an alias for np.array
        if preprocessing:
            self.mean = np.mean(daat)
            daat -= self.mean
            self.std = np.std(daat)
            daat /= self.std

        # Convert to Spectral
        daat = A([np.abs(stft(ex,1,frame_size,hop_size)).flatten() for ex in daat])
        #print daat.shape
        #aat = A([arr.reshape(len(arr),-1) for arr in daat])
        print(daat.shape)
        # Stride and flatten
        #print ninput_frames*frame_size
        daat = segment_axis(daat, ninput_frames*frame_size, (ninput_frames-1)*frame_size, axis=1)
        print(daat.shape)

        if which_set == 'train':
            daat = daat[:int(.8*len(daat))]

        if which_set == 'valid':
            daat = daat[int(.8*len(daat)):int(.9*len(daat))]

        if which_set == 'test':
            daat = daat[int(.9*len(daat)):]

        features = daat[:,:-noutput_frames,:].reshape(-1,frame_size*ninput_frames)
        targets = daat[:,noutput_frames:,-frame_size*noutput_frames:].reshape(-1,frame_size*noutput_frames)

        # FIXME BELOW HERE for CNNs
        #--------------------------

        IMAGES_SHAPE = features.shape + (1,)
        print(targets.shape)
        print(features.shape)

        X, y = features, targets
        view_converter = DefaultViewConverter(shape=IMAGES_SHAPE, axes=axes)
        super(AA, self).__init__(X=X, y=y, view_converter=view_converter)
Example #6
 def test_merge_frames(self):
     """
     Merges frames back into a single array representing a reconstructed audio signal.
     """
     expected_array = np.array([0, 1, 2, 3, 3.4, 4.25, 6, 7, 6.8, 7.65, 10, 11, 10.2, 11.05, 14, 15, 0, 0])
     input_array = segment_axis(np.arange(0, 16), 6, 2, end="pad")
     fake_lpc_frame_array = LPCFrameArray(1, 2, input_array)
     fake_synthesizer = Synthesizer(fake_lpc_frame_array)
     result_array = fake_synthesizer._merge_frames(input_array)
     assert_array_almost_equal(expected_array, result_array)
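For reference, the framed input built in this test (hop = 6 - 2 = 4, last frame zero-padded) looks like this, assuming the usual segment_axis semantics:

import numpy as np
from segmentaxis import segment_axis  # module path assumed

print(segment_axis(np.arange(0, 16), 6, 2, end="pad"))
# [[ 0  1  2  3  4  5]
#  [ 4  5  6  7  8  9]
#  [ 8  9 10 11 12 13]
#  [12 13 14 15  0  0]]

_merge_frames then recombines these four 6-sample frames into the expected 18-sample signal.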
Example #7
def frame_signal(input, nwin=256, over=None):
    # MFCC parameters: taken from auditory toolbox
    if over is None:
        over = nwin - 160
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97
    extract = preemp(input, prefac)
    w = hamming(nwin, sym=0)
    framed = segment_axis(extract, nwin, over) * w
    return framed
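With the defaults, over = nwin - 160, so the frame hop is nwin - over = 160 samples: a 16 ms window every 10 ms at 16 kHz. A usage sketch (frame_signal as defined above; preemp and hamming are assumed importable from the surrounding module):

import numpy as np

fs = 16000
x = np.random.randn(fs)   # one second of noise as a stand-in signal
framed = frame_signal(x)  # nwin=256, hop=160
print(framed.shape)       # (99, 256)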
Example #8
    def window_agg(self,
                   progression_dim,
                   window_size=1000,
                   overlap=500,
                   agg_method='median'):
        """ Creates a sliding window that moves over the specified
    dimension and aggregates all values per window
    """
        # First try using bottleneck, ignoring overlap
        try:
            return self.window_agg_using_bottleneck(progression_dim,
                                                    window_size, overlap,
                                                    agg_method)
        except ImportError:
            pass

        window_size = int(window_size)
        overlap = int(overlap)
        # first sort by the given dim:
        xdim_index = self.dims.index(progression_dim)
        sorted_data = self.data[self.data[:, xdim_index].argsort(), ]
        # create windows:
        from segmentaxis import segment_axis

        with Timer('segment_axis'):
            seg_data = segment_axis(sorted_data, window_size, overlap, axis=0)

        # we want to calculate the median in iterations, so that we don't run
        # out of memory when sorting the strides. Maybe this is unnecessary
        # for average, but we do it for it as well.
        min_values_per_iteration = 50 * 1000000.
        values_per_window = seg_data.shape[1] * seg_data.shape[2]
        windows_per_iteration = int(np.floor(min_values_per_iteration /
                                             values_per_window))
        start_win = 0
        agg_data = None
        while start_win < seg_data.shape[0]:
            iteration_data = seg_data[start_win:start_win +
                                      windows_per_iteration]
            if agg_method == 'median':
                with Timer('Median over %d windows' % windows_per_iteration):
                    iteration_agg_data = np.median(iteration_data, axis=1)
            elif agg_method == 'average':
                iteration_agg_data = np.average(iteration_data, axis=1)
            else:
                raise Exception('Unknown agg method')
            if agg_data is None:
                agg_data = iteration_agg_data
            else:
                agg_data = np.concatenate((agg_data, iteration_agg_data))
            start_win += windows_per_iteration
            print(start_win)

        return DataTable(agg_data, self.dims, self.legends, self.tags.copy())
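The chunking only bounds peak memory: each iteration aggregates its own slice (with the fix above), and the concatenated result matches a single np.median over axis 1. A toy check, assuming the usual segment_axis semantics:

import numpy as np
from segmentaxis import segment_axis  # module path assumed

data = np.arange(20, dtype=float).reshape(10, 2)
seg = segment_axis(data, 4, 2, axis=0)  # windows of 4 rows, overlapping by 2
print(seg.shape)               # (4, 4, 2)
print(np.median(seg, axis=1))  # one aggregated row per window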
Example #9
    def get_multiple_trees_for_greedy_search(self):
        '''
        Partition data in hard way with k-means, build 1 KD-tree per partition
        '''

        #! m,n = self.unit_start_data.shape

        self.prev_join_rep = self.unit_start_data    ## !osw
        self.current_join_rep = self.unit_end_data   ## !osw

        start_time = self.start_clock('build multiple joint KD trees')
        ## Needs to be stored synthesis options specified (due to weights applied before tree construction):
        treefile = get_data_dump_name(self.config) + '_' + make_synthesis_condition_name(self.config) + '_joint_tree.pkl' 

        multiepoch = self.config.get('multiepoch', 1)
        overlap = 0
        if multiepoch > 1:
            overlap = multiepoch-1
            ### reshape target rep:
            m,n = self.train_unit_features.shape
            self.train_unit_features = segment_axis(self.train_unit_features, multiepoch, overlap=overlap, axis=0)
            self.train_unit_features = self.train_unit_features.reshape(m-overlap,n*multiepoch)

            if self.config.get('last_frame_as_target', False):
                print('test -- take last frame only as target...')  ## TODO99
                # self.train_unit_features = self.train_unit_features[:,-n:]
                self.train_unit_features = np.hstack([self.train_unit_features[:,:n], self.train_unit_features[:,-n:]])

            ### alter join reps: -- first tried taking first vs. last
            m,n = self.current_join_rep.shape
            self.current_join_rep = self.current_join_rep[overlap:,:]
            self.prev_join_rep = self.prev_join_rep[:-overlap, :]

        #### ---- cluster self.train_unit_features
        self.cluster(self.config.get('multiple_search_trees', 1), limit=self.config.get('cluster_data_on_npoints', 10000))
        ### ^--- populates self.cluster_ixx

        combined_rep = np.hstack([self.prev_join_rep, self.train_unit_features])

        self.joint_trees = []
        self.node_maps = []

        for cluster_number in range(self.config.get('multiple_search_trees', 1)):
            t = self.start_clock('make joint join + target tree for cluster %s...'%(cluster_number))
            self.joint_trees.append(scipy.spatial.cKDTree(combined_rep[self.cluster_ixx==cluster_number, :], leafsize=100, balanced_tree=False, compact_nodes=False))
            self.node_maps.append(np.arange(self.number_of_units-overlap)[self.cluster_ixx==cluster_number])
            self.stop_clock(t)
Example #10
 def match(p):
     x = np.clip(p, 1e-6, 1 - 1e-6) * scales + mins
     #print(' '.join(str(s) for s in x), end=' ')
     model = trm.TubeModel(trm.Parameters(
         file_format=0,
         sample_rate_hz=22050.0,
         control_rate_hz=25.0,
         volume_db=60.0,
         channels=1,
         balance=0.0,
         waveform=0,
         pulse_rise=x[0],
         pulse_fall_min=x[1],
         pulse_fall_max=x[2],
         breathiness=x[3],
         length_cm=x[4],
         temperature_degc=25.0,
         loss_factor=x[5],
         aperture_scale_cm=x[6],
         mouth_coeff_hz=x[7],
         nose_coeff_hz=x[8],
         nose_radii_cm=x[9:14],
         throat_lowpass_cutoff_hz=1500.0,
         throat_volume_db=6.0,
         modulation=1,
         noise_crossmix_offset_db=54.0))
     if not model.is_ok():
         return 1e200
     T = int(np.ceil(seconds * 28))
     frames = np.zeros((T, 16), float)
     timet = x[14]
     for i, (level0, levelt, levelT) in enumerate(np.split(x[15:], 16)):
         frames[:, i] = scipy.interpolate.interp1d(
             [0, timet, 1], [level0, levelt, levelT])(np.linspace(0, 1, T))
     #print(x[:14])
     #[print(' '.join(str(x) for x in f)) for f in frames]
     with timeout(1):
         x = np.array(model.synthesize(frames))
     y = scipy.interpolate.interp1d(
         np.linspace(0, 1, len(x)), x / abs(x).max())(
             np.linspace(0, 1, int(len(x) * fs / 22050)))
     z = segmentaxis.segment_axis(y, width, int(width * overlap))
     spec = abs(scipy.fftpack.fft(z * env))
     err = np.linalg.norm(target - spec[:len(target), :1 + width // 2])
     #print(err)
     return err
Example #11
    def greedy_joint_search(self, unit_features, start_state=-1, holdout=[]):

        assert self.config['target_representation'] == 'epoch'

        start_time = self.start_clock('Greedy search')
        path = []
        m,n = self.current_join_rep.shape
        #m,n = self.join_contexts_unweighted.shape

        if start_state < 0:
            prev_join_vector = np.zeros((n,))
        else:
            prev_join_vector = self.prev_join_rep[start_state, :]


        multiepoch = self.config.get('multiepoch', 1)
        if multiepoch > 1:
            ### reshape target rep:
            m,n = unit_features.shape
            unit_features = segment_axis(unit_features, multiepoch, overlap=0, axis=0)
            unit_features = unit_features.reshape(m // multiepoch, n * multiepoch)

            if self.config.get('last_frame_as_target', False):
                print('test -- take last frame only as target...')  ## TODO99
                # unit_features = unit_features[:,-n:]
                unit_features = np.hstack([unit_features[:,:n], unit_features[:,-n:]])

        ix = -1 
        final_dists = []   ### for debugging
        for target_vector in unit_features:
            both = np.concatenate([prev_join_vector, target_vector]).reshape((1,-1))
            # dists, indexes = self.joint_tree.query(both, k=7 + len(holdout)) # , n_jobs=4)
            dists, indexes = self.joint_tree.query(both, k=1+len(holdout), eps=self.config.get('search_epsilon', 0.0)) # , n_jobs=4)
            
            dindexes = list(zip(dists.flatten(), indexes.flatten()))
            # if ix > -1:
            #     ## TODO: forbid regression -- configurable
            #     dindexes = [(d,i) for (d,i) in dindexes if i not in range(ix-5, ix+1)]
            #     dindexes = [(d,i) for (d,i) in dindexes if i not in holdout]
            
            (d, ix) = dindexes[0]
            path.append(ix)
            final_dists.append(d)
            prev_join_vector = self.current_join_rep[ix,:]
        self.stop_clock(start_time)
        return path
Example #12
    def multi_index_patches(self, patch_length):

        assert patch_length >= 2
        self.patches = []
        self.patch_length = patch_length
        m, n_resolutions = self.cluster_ixx.shape
        for i in range(n_resolutions):
            res_patches = {}
            data = segment_axis(self.cluster_ixx[:, i].flatten(),
                                patch_length,
                                overlap=patch_length - 1,
                                axis=0)
            for (ix, vals) in enumerate(data):
                key = tuple(vals.tolist())
                if key not in res_patches:
                    res_patches[key] = []
                res_patches[key].append(ix)
            self.patches.append(res_patches)
Example #13
def spectrum(pcm, nwin=512, nfft=512, fs=16000, stepr=0.5):
    """Compute spectrum for give audio snippet.
    pcm: the mono form audio input. 
    stepr: the step ratio for audio segmentation (aka. overlap).
    The shape of the returned spec is (frequency bin, temporal bin,)
    """
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.98
    overlap = int(nwin*(1-stepr))
    window = hamming(nwin, sym=0)
    extract = _preemp(pcm, prefac)
    framed = segment_axis(extract, nwin, overlap) * window
    # Compute the spectrum magnitude
    spec = np.log10(np.abs(fft(framed, nfft, axis=-1)))
    spec = spec.T[0:nfft//2+1]
    # Make it zero-mean, so the start-up transients for the filter are minimized
    spec = spec - np.ma.mean(spec)
    return spec
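A usage sketch: with the defaults the hop is nwin * stepr = 256 samples, and the output keeps the nfft // 2 + 1 non-redundant frequency bins (spectrum as defined above; _preemp and hamming assumed importable from the surrounding module):

import numpy as np

pcm = np.random.randn(16000)  # one second of noise as a stand-in
spec = spectrum(pcm)
print(spec.shape)             # (257, 61)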
Example #14
def main(width, overlap, samplerate, root, *audio):
    if not samplerate:
        samplerate = 16000
    if not width:
        width = 512
    if not overlap:
        overlap = 0.75
    env = np.hanning(width)[None, :].astype('f')
    for f in audio:
        rate, samples = scipy.io.wavfile.read(f)
        logging.info('%s: %d samples at %.1fkHz',
                     os.path.basename(f), len(samples), rate / 1000)
        assert rate == samplerate
        assert len(samples.shape) == 1
        samples = samples - samples.mean()  # out-of-place: wav samples are integer-typed
        X = segmentaxis.segment_axis(samples, width, int(width * overlap)) * env
        s = os.path.join(root, os.path.basename(f).replace('.wav', '-wave.npy'))
        logging.info('saving %s: %s', s, X.shape)
        np.save(s, X.astype('f'))
Example #15
    def index_patches(self, max_patch_length):

        self.patches = {}
        for i in range(max_patch_length):
            length = i + 1
            if length == 1:
                for (ix, val) in enumerate(self.cluster_ixx):
                    if (val, ) not in self.patches:
                        self.patches[(val, )] = []
                    self.patches[(val, )].append(ix)
            else:
                data = segment_axis(self.cluster_ixx,
                                    length,
                                    overlap=length - 1,
                                    axis=0)
                for (ix, vals) in enumerate(data):
                    key = tuple(vals.tolist())
                    if key not in self.patches:
                        self.patches[key] = []
                    self.patches[key].append(ix)
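A toy illustration of the patch index, assuming cluster_ixx is a 1-D label sequence: with length 2 and overlap 1, segment_axis yields every consecutive label pair, and the dictionary maps each distinct pair to the positions where it occurs:

import numpy as np
from segmentaxis import segment_axis  # module path assumed

labels = np.array([3, 3, 7, 3, 3, 7])
pairs = segment_axis(labels, 2, overlap=1, axis=0)
print(pairs.tolist())  # [[3, 3], [3, 7], [7, 3], [3, 3], [3, 7]]
# index_patches would record, e.g., self.patches[(3, 7)] == [1, 4]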
Example #16
    def get_tree_for_greedy_search(self):

        #! m,n = self.unit_start_data.shape

        self.prev_join_rep = self.unit_start_data    ## !osw
        self.current_join_rep = self.unit_end_data   ## !osw

        multiepoch = self.config.get('multiepoch', 1)
        if multiepoch > 1:
            t = self.start_clock('reshape data for multiepoch...')
            overlap = multiepoch-1
            ### reshape target rep:
            m,n = self.train_unit_features.shape
            self.train_unit_features = segment_axis(self.train_unit_features, multiepoch, overlap=overlap, axis=0)
            self.train_unit_features = self.train_unit_features.reshape(m-overlap,n*multiepoch)

            if self.config.get('last_frame_as_target', False):   ### !TODO: keep this option?
                print('test -- take last frame only as target...')
                # self.train_unit_features = self.train_unit_features[:,-n:]
                self.train_unit_features = np.hstack([self.train_unit_features[:,:n], self.train_unit_features[:,-n:]])

            ### alter join reps: -- first tried taking first vs. last
            m,n = self.current_join_rep.shape
            self.current_join_rep = self.current_join_rep[overlap:,:]
            self.prev_join_rep = self.prev_join_rep[:-overlap, :]

            ### !TODO: revisit this?
            ### then, whole comparison for join:
            # m,n = self.current_join_rep.shape
            # self.current_join_rep = segment_axis(self.current_join_rep, multiepoch, overlap=overlap, axis=0).reshape(m-overlap,n*multiepoch)
            # self.prev_join_rep = segment_axis(self.prev_join_rep, multiepoch, overlap=overlap, axis=0).reshape(m-overlap,n*multiepoch)
            self.stop_clock(t)

        t = self.start_clock('stack data to train joint tree...')
        combined_rep = np.hstack([self.prev_join_rep, self.train_unit_features])
        self.stop_clock(t)
        
        t = self.start_clock('make joint join + target tree...')
        ### For now, build each time from scratch -- compare resurrection time with rebuild time.
        self.joint_tree = scipy.spatial.cKDTree(combined_rep, leafsize=100, balanced_tree=False) # , compact_nodes=False)
        self.stop_clock(t)
Example #17
 def get_phone_seq(self, subset, frame_length, overlap, id, shuffling = True):
     """
     Given the subset id, the number of frames wanted, the frame length, 
     the overlap and the id, return the associated waveform sequence. 
     
     """
     self.init_phones_iter(subset, shuffling)
     assert id < self.phone_to_seq_intervals[-1]
     
     # Get the sequence
     seq_id = np.digitize([id], self.phone_to_seq_intervals)[0] - 1
     id_in_seq = id - self.phone_to_seq_intervals[seq_id]
     if self.shuffle_seq:
         seq_id = self.invert_shuffling[seq_id]
     
     id_plus_seq = self.__dict__[subset]["seq_to_phones"][seq_id,0] \
                     + id_in_seq
     wav_start_in_seq = \
             self.__dict__[subset]["phones_intervals"][id_plus_seq, 0]
     wav_end_in_seq = \
             self.__dict__[subset]["phones_intervals"][id_plus_seq, 1]
     wav_start = self.__dict__[subset]["intervals"][seq_id] \
                 + wav_start_in_seq
     wav_end = self.__dict__[subset]["intervals"][seq_id] \
                 + wav_end_in_seq
     
     wav = self.__dict__[subset]["wav"][wav_start:wav_end]
     
     # Get the phone
     phone = self.__dict__[subset]["phones_intervals"][id_plus_seq,2]
     
     # Find the speaker id
     spkr_id = self.__dict__[subset]["speaker_id"][seq_id]
     # Find the speaker info
     spkr_info = self.spkrinfo[spkr_id]
     
     # Segment into frames
     wav = segment_axis(wav, frame_length, overlap)
     
     return [wav, phone, spkr_info]
Example #18
    def multi_patch_over(self, cb_path):
        cb_path = segment_axis(cb_path, self.patch_length, overlap=0, axis=0)
        ## gives: npatch x patch_length x nres

        db_path = []
        for chunk in cb_path:

            matched = False
            for (res, res_patch) in enumerate(chunk.transpose()):

                key = tuple(res_patch.tolist())
                if key in self.patches[res]:
                    start = self.patches[res][key][0]  # take first!
                    end = start + self.patch_length
                    db_path.extend(range(start, end))
                    matched = True
                    print('res: %s' % (res))
                    break

            if not matched:
                sys.exit('need back off strategy!')

        return db_path
Example #19
def get_epoch_position_features(pms, rate, nsamples, seconds2samples=True, zero_uv_GCP=False):

    if seconds2samples:
        ## Convert seconds -> waveform sample numbers:-
        pms = np.asarray(np.round(pms * rate), dtype=int)
  
    ## make sure length compatible with the waveform:--
    last = len(pms)-1
    while pms[last] > nsamples:
        last -= 1
    pms = pms[:last]
    if nsamples > pms[-1]:
        pms = np.concatenate([pms, np.array([nsamples])])
    ## add first 0
    pms = np.concatenate([np.array([0]), pms])
    
    start_end = segment_axis(pms, 2, overlap=1)
    lengths = start_end[:,1] - start_end[:,0]
    
    forwards = []
    backwards = []
    norm_forwards = []
    for length in lengths:
        forward = np.arange(length)
        backward = np.flipud(forward)
        norm_forward = forward / float(length)
        forwards.append( forward )
        backwards.append( backward )
        norm_forwards.append(  norm_forward )
    forwards = np.concatenate(forwards).reshape((nsamples,1))
    backwards = np.concatenate(backwards).reshape((nsamples,1))
    norm_forwards = np.concatenate(norm_forwards).reshape((nsamples,1))

    if zero_uv_GCP:
        #forwards[] = 0.0
        sys.exit('not implemented : zero_uv_GCP')
    return (forwards, backwards, norm_forwards)
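The loop builds three sample-level position features per pitch period: a forward ramp, a backward ramp, and a normalised forward ramp. A minimal sketch of what the concatenation produces for two periods of lengths 3 and 2:

import numpy as np

lengths = [3, 2]
forward = np.concatenate([np.arange(n) for n in lengths])
backward = np.concatenate([np.flipud(np.arange(n)) for n in lengths])
norm_forward = np.concatenate([np.arange(n) / float(n) for n in lengths])
print(forward)       # [0 1 2 0 1]
print(backward)      # [2 1 0 1 0]
print(norm_forward)  # [0.     0.3333 0.6667 0.     0.5   ]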
Example #20
 def replace_with_frames(self):
     for i in range(len(self.raw_wav)):
         frames = segment_axis(self.raw_wav[i],
                               length=self.frame_length,
                               overlap=self.overlap)
         self.raw_wav[i] = frames
Example #21
    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None, audio_only=False,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(phones_sequence,
                                                         frame_length,
                                                         overlap)
                self.phones[sequence_id] = phones_segmented_sequence
                # phones_segmented_sequence = scipy.stats.mode(
                #     phones_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phones_segmented_sequence = numpy.asarray(
                #     phones_segmented_sequence,
                #     dtype='int'
                # )
                # phones_sequence_list.append(phones_segmented_sequence)
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                           frame_length,
                                                           overlap)
                self.phonemes[sequence_id] = phonemes_segmented_sequence
                # phonemes_segmented_sequence = scipy.stats.mode(
                #     phonemes_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phonemes_segmented_sequence = numpy.asarray(
                #     phonemes_segmented_sequence,
                #     dtype='int'
                # )
                # phonemes_sequence_list.append(phonemes_segmented_sequence)
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(words_sequence,
                                                        frame_length,
                                                        overlap)
                self.words[sequence_id] = words_segmented_sequence
                # words_segmented_sequence = scipy.stats.mode(
                #     words_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # words_segmented_sequence = numpy.asarray(words_segmented_sequence,
                #                                          dtype='int')
                # words_sequence_list.append(words_segmented_sequence)

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      overlap)
            self.raw_wav[sequence_id] = samples_segmented_sequence

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = samples_segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.frame_length * self.frames_per_example
        )
        features_source = 'features'
        features_dtype = self.samples_sequences[0].dtype
        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index:example_index
                    + self.frames_per_example].ravel())
            return rval

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'
        targets_dtype = self.samples_sequences[0].dtype
        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index
                    + self.frames_per_example].ravel())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        dtypes_components = [features_dtype, targets_dtype]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSpace(max_labels=61, dim=1)
            phones_source = 'phones'
            phones_dtype = self.phones_sequences[0].dtype
            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(scipy.stats.mode(self.phones_sequences[sequence_index][example_index + self.frames_per_example])[0])
                return rval

            phonemes_space = IndexSpace(max_labels=31, dim=1)
            phonemes_source = 'phonemes'
            phonemes_dtype = self.phonemes_sequences[0].dtype
            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phonemes_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            words_space = IndexSpace(max_labels=31, dim=1)
            words_source = 'words'
            words_dtype = self.words_sequences[0].dtype
            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.words_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source])
            dtypes_components.extend([phones_dtype, phonemes_dtype,
                                     words_dtype])
            map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                     words_map_fn])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.dtypes = tuple(dtypes_components)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
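Throughout this constructor the hop between consecutive frames is frame_length - overlap (stored as self.offset). A quick shape check, assuming the usual segment_axis semantics:

import numpy as np
from segmentaxis import segment_axis  # module path assumed

frame_length, overlap = 240, 80
frames = segment_axis(np.zeros(2000), frame_length, overlap)
print(frames.shape)  # (12, 240): a new frame every 160 samples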
Example #22
 def get_word_seq(self, subset, frame_length, overlap, id, shuffling = True):
     """
     Given the subset id, the number of frames wanted, the frame length, 
     the overlap and the id, return the associated waveform sequence. 
     
     """
     self.init_words_iter(subset, shuffling)
     assert id < self.word_to_seq_intervals[-1]
     
     # Get the sequence
     seq_id = np.digitize([id], self.word_to_seq_intervals)[0] - 1
     id_in_seq = id - self.word_to_seq_intervals[seq_id]
     if self.shuffle_seq:
         seq_id = self.invert_shuffling[seq_id]
     
     id_plus_seq = self.__dict__[subset]["seq_to_words"][seq_id,0] \
                     + id_in_seq
     wav_start_in_seq = \
             self.__dict__[subset]["words_intervals"][id_plus_seq, 0]
     wav_end_in_seq = \
             self.__dict__[subset]["words_intervals"][id_plus_seq, 1]
     wav_start = self.__dict__[subset]["intervals"][seq_id] \
                 + wav_start_in_seq
     wav_end = self.__dict__[subset]["intervals"][seq_id] \
                 + wav_end_in_seq
     
     wav = self.__dict__[subset]["wav"][wav_start:wav_end]
     
     # Get the phones, phonemes and words
     phones = self.__dict__[subset]["phones"][wav_start:wav_end]
     phonemes = self.__dict__[subset]["phonemes"][wav_start:wav_end]
     word = self.__dict__[subset]["words_intervals"][id_plus_seq,2]
     
     # Find the speaker id
     spkr_id = self.__dict__[subset]["speaker_id"][seq_id]
     # Find the speaker info
     spkr_info = self.spkrinfo[spkr_id]
     
     # Segment into frames
     wav = segment_axis(wav, frame_length, overlap)
     
     # Take the most occurring phone in a sequence
     phones = segment_axis(phones, frame_length, overlap)
     phones = scipy.stats.mode(phones, axis=1)[0].flatten()
     phones = np.asarray(phones, dtype='int')
     
     # Take the most occurring phoneme in a sequence
     phonemes = segment_axis(phonemes, frame_length, overlap)
     phonemes = scipy.stats.mode(phonemes, axis=1)[0].flatten()
     phonemes = np.asarray(phonemes, dtype='int')
     
     
     # Binary variable announcing the end of the phone
     end_phn = np.zeros_like(phones)
     
     for i in range(len(phones) - 1):
         if phones[i] != phones[i+1]:
             end_phn[i] = 1
     
     end_phn[-1] = 1
     
     return [wav, phones, phonemes, end_phn, word, spkr_info]
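The boundary loop above has a vectorized equivalent, used in Example #23: compare each element with its successor and mark the positions where the label changes.

import numpy as np

phones = np.array([5, 5, 9, 9, 9, 2])
end_phn = np.zeros_like(phones)
end_phn[:-1] = np.where(phones[:-1] != phones[1:], 1, 0)
end_phn[-1] = 1
print(end_phn)  # [0 1 0 0 1 1]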
Example #23
 def get_fixed_size_seq(self, subset, n_frames, frame_length, overlap, ids, \
                         shuffling = True, wav_only = False):
     """
     Given the subset id, the number of frames wanted, the frame length, 
     the overlap, and the ids, return multiple arrays corresponding to
     a minibatch of frame sequence of fixed size
     
     """
     
     self.init_frames_iter(subset, n_frames, frame_length, overlap, \
                             shuffling)
     if isinstance(ids, collections.abc.Iterable):
         ids = np.asarray(ids)
     else:
         ids = np.array([ids])
     
     assert np.all(ids < self.frame_seq_intervals[-1])
     
     # Get the sequence
     seq_ids = np.digitize(ids, self.frame_seq_intervals) - 1
     if self.shuffle_seq:
         seq_ids = self.invert_shuffling[seq_ids]
     
     idx_in_seq = ids - self.__dict__[subset]["intervals"][seq_ids]
     wav_start = self.__dict__[subset]["intervals"][seq_ids] + idx_in_seq
     # wav_start = wav_start.reshape((wav_start.shape[0],1))
     # indices = wav_start + np.arange(self.wav_length_required)
     
     wav = np.zeros((ids.shape[0], self.wav_length_required))
     for i, idx in enumerate(wav_start):
         wav[i] = self.__dict__[subset]["wav"][idx:(idx + self.wav_length_required)]
     # wav = self.__dict__[subset]["wav"][indices]
     
     if (ids.shape[0]*self.wav_length_required > 100000):
         print "Waveforms loaded."
     
     if not wav_only:
         # Get the phones, phonemes and words
         # phones = self.__dict__[subset]["phones"][indices]
         # phonemes = self.__dict__[subset]["phonemes"][indices]
         # words = self.__dict__[subset]["words"][indices]
         
         phones = np.zeros((ids.shape[0], self.wav_length_required))
         phonemes = np.zeros((ids.shape[0], self.wav_length_required))
         words = np.zeros((ids.shape[0], self.wav_length_required))
         
         # Find the speaker id
         spkr_id = self.__dict__[subset]["speaker_id"][seq_ids]
         # Find the speaker info
         spkr_info = self.spkrinfo[spkr_id]
     
         # Segment into frames
         wav = segment_axis(wav, frame_length, overlap, axis=1)
         # shape (n_ids, n_frames, frame_length)
     
         # Take the most occurring phone in a sequence
         phones = segment_axis(phones, frame_length, overlap, axis=1)
         phones = scipy.stats.mode(phones, axis=2)[0].reshape(ids.shape[0], \
                     n_frames)
         phones = np.asarray(phones, dtype='int')
     
         # Take the most occurring phone in a sequence
         phonemes = segment_axis(phonemes, frame_length, overlap, axis=1)
         phonemes = scipy.stats.mode(phonemes, axis=2)[0].reshape(ids.shape[0], \
                     n_frames)
         phonemes = np.asarray(phonemes, dtype='int')
     
         # Take the most occurring word in a sequence
         words = segment_axis(words, frame_length, overlap, axis=1)
         words = scipy.stats.mode(words, axis=2)[0].reshape(ids.shape[0], \
                     n_frames)
         words = np.asarray(words, dtype='int')
     
         # Binary variable announcing the end of the word or phoneme
         end_phn = np.zeros_like(phones)
         end_wrd = np.zeros_like(words)
     
         end_phn[:,:-1] = np.where(phones[:,:-1] != phones[:,1:], 1, 0)
         end_wrd[:,:-1] = np.where(words[:,:-1] != words[:,1:], 1, 0)
     
         return [wav, phones, phonemes, end_phn, words, end_wrd, spkr_info]
     
     else:
         return [wav]
Example #24
 def get_raw_seq(self, subset, seq_id, frame_length, overlap, \
                 shuffling = True):
     """
     Given the id of the subset, the id of the sequence, the frame length and 
     the overlap between frames, this method will return a frames sequence 
     from a given set, the associated phonemes and words sequences (including 
     a binary variable indicating change) and the information vector on the 
     speaker.
     
     """
     self.check_subset_value(subset)
     self.check_subset_presence(subset)
     
     # Check if the id is valid
     n_seq = self.__dict__[subset]["n_seq"]
     if seq_id >= n_seq:
         raise ValueError("This sequence does not exist.")
     
     # Get the sequence
     if shuffling:
         seq_id = self.shuffling[seq_id]
     
     wav_start = self.__dict__[subset]["intervals"][seq_id]
     wav_end = self.__dict__[subset]["intervals"][seq_id+1]
     wav = self.__dict__[subset]["wav"][wav_start:wav_end]
     
     # Get the phones, phonemes and words
     phones = self.__dict__[subset]["phones"][wav_start:wav_end]
     phonemes = self.__dict__[subset]["phonemes"][wav_start:wav_end]
     words = self.__dict__[subset]["words"][wav_start:wav_end]
     
     # Find the speaker id
     spkr_id = self.__dict__[subset]["speaker_id"][seq_id]
     # Find the speaker info
     spkr_info = self.spkrinfo[spkr_id]
     
     # Segment into frames
     wav = segment_axis(wav, frame_length, overlap)
     
     # Take the most occurring phone in a sequence
     phones = segment_axis(phones, frame_length, overlap)
     phones = scipy.stats.mode(phones, axis=1)[0].flatten()
     phones = np.asarray(phones, dtype='int')
     
     # Take the most occurring phoneme in a sequence
     phonemes = segment_axis(phonemes, frame_length, overlap)
     phonemes = scipy.stats.mode(phonemes, axis=1)[0].flatten()
     phonemes = np.asarray(phonemes, dtype='int')
     
     # Take the most occurring word in a sequence
     words = segment_axis(words, frame_length, overlap)
     words = scipy.stats.mode(words, axis=1)[0].flatten()
     words = np.asarray(words, dtype='int')
     
     # Binary variable announcing the end of the word or phoneme
     end_phn = np.zeros_like(phones)
     end_wrd = np.zeros_like(words)
     
     for i in range(len(words) - 1):
         if phones[i] != phones[i+1]:
             end_phn[i] = 1
         if words[i] != words[i+1]:
             end_wrd[i] = 1
     
     end_phn[-1] = 1
     end_wrd[-1] = 1
     
     return [wav, phones, phonemes, end_phn, words, end_wrd, spkr_info]
Example #25
    def get_path_information(self, target_features, path, waveform):
        context = self.config['wave_context_length']
        gen_wave = self.nextsample[path, :]
        print(gen_wave.shape)
        print('====')
        padded_wave = np.concatenate([np.zeros((context, 1)), gen_wave])
        print(padded_wave.shape)
        wavefrags = segment_axis(padded_wave.flatten(),
                                 context + 1,
                                 overlap=context,
                                 axis=0)
        join_features = wavefrags[:, :-1]
        nextsamples = wavefrags[:, -1]

        print(join_features.shape)
        print(target_features.shape)
        combined_features = np.hstack([join_features, target_features])
        dists, samples = self.joint_tree.query(combined_features, k=1, eps=2)
        print(samples.shape)
        print(dists.shape)
        print(dists)

        selected = self.train_unit_features[samples, :]
        dists2 = np.sqrt(((combined_features - selected)**2).sum(axis=1))

        ### stream contributions...
        raw_dists = (combined_features - selected)**2
        history_contrib = np.sqrt(raw_dists[:, :context].sum())
        print('history')
        print(history_contrib)
        start = context
        for stream in self.stream_list_target:
            width = self.datadims_target[stream]
            end = start + width
            stream_contrib = np.sqrt(raw_dists[:, start:end].sum())
            print(stream)
            print(stream_contrib)
            start = end

        ### natural joins:
        pairs = copy.copy(segment_axis(path, 2, 1, axis=0))
        pairs[:, 0] += 1
        pairs[:, 0] *= -1
        diff = pairs.sum(axis=1)
        breaks = (diff != 0)
        breaks = np.array(breaks, dtype=int)
        breaks = np.concatenate([np.ones(1), breaks])
        print(breaks)

        sys.exit('sesrbsfrb')
        # pylab.plot(dists)
        # pylab.plot(dists2)
        # pylab.show()

        ### density:
        distance_thresh = dists.mean() * 2

        ## 1) how many points within twice average distance from targets?
        neighbours = self.joint_tree.query_ball_point(combined_features,
                                                      distance_thresh,
                                                      eps=2)
        n_neighbours_target = [len(thing) for thing in neighbours]

        ## 2) how many points within twice average distance from selected things?
        neighbours = self.joint_tree.query_ball_point(selected,
                                                      distance_thresh,
                                                      eps=2)
        n_neighbours_selected = [len(thing) for thing in neighbours]

        #print path

        pylab.subplot(411)
        pylab.plot(dists)
        pylab.subplot(412)
        pylab.plot(n_neighbours_target)
        pylab.plot(n_neighbours_selected)
        pylab.subplot(413)
        pylab.plot(breaks)
        pylab.subplot(414)
        pylab.plot(waveform)

        pylab.show()
Example #26
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients
    mspec: ndarray
        Log-spectrum in the mel-domain.

    Notes
    -----
    MFCC are computed as follows:
        * Pre-processing in time-domain (pre-emphasizing)
        * Compute the spectrum amplitude by windowing with a Hamming window
        * Filter the signal in the spectral domain with a triangular
        filter-bank, whose filters are approximately linearly spaced on the
        mel scale and have equal bandwidth in the mel scale
        * Compute the DCT of the log-spectrum

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""

    # MFCC parameters: taken from auditory toolbox
    over = 170
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97

    #lowfreq = 400 / 3.
    lowfreq = 133.33
    #highfreq = 6855.4976
    linsc = 200/3.
    logsc = 1.0711703

    nlinfil = 13
    nlogfil = 27
    nfil = nlinfil + nlogfil

    w = hamming(nwin, sym=0)

    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    #------------------
    # Compute the MFCC
    #------------------
    extract = preemp(input, prefac)
    framed = segment_axis(extract, nwin, over) * w
    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))
    # Filter the spectrum through the triangle filterbank
    mspec = np.log10(np.clip(np.dot(spec, fbank.T), 1e-9, np.inf))
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps]

    return ceps, mspec, spec
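A usage sketch (mfcc as defined above; preemp, trfbank, hamming, fft and dct assumed importable from the surrounding module): with the defaults the hop is nwin - over = 86 samples, so one second at 16 kHz yields 184 frames of 13 cepstral coefficients:

import numpy as np

x = np.random.randn(16000)  # one second of noise as a stand-in
ceps, mspec, spec = mfcc(x)
print(ceps.shape)           # (184, 13)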
Example #27
def main_work():

    #################################################

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-wav', dest='wavdir', required=True)
    a.add_argument('-exc', dest='excdir', required=True)
    a.add_argument('-testpattern', default='')
    a.add_argument('-trainpattern', default='')
    a.add_argument('-chunksize', type=int, default=2000)
    a.add_argument('-overlap', type=int, default=100)
    a.add_argument('-code', action='store_true', default=False)
    a.add_argument('-o', dest='outdir', required=True)

    opts = a.parse_args()
    # ===============================================

    for direc in [opts.outdir]:
        if not os.path.isdir(direc):
            os.makedirs(direc)

    ### TODO: don't have everything in memory!
    flist = sorted(glob.glob(opts.wavdir + '/*.wav'))
    bases = [get_basename(fname) for fname in flist]

    if opts.code:  ## sort by speaker first
        codes = [base.split('_')[-2] for base in bases]
        bases = [base for (code, base) in sorted(zip(codes, bases))]
        speaker_map = sorted(dict(zip(codes, codes)).keys())
        speaker_map = dict(zip(speaker_map, range(len(speaker_map))))

    if opts.testpattern:
        trainbases = [base for base in bases if opts.testpattern not in base]
        print('%s files matching %s held out for testing' % (
            len(bases) - len(trainbases), opts.testpattern))
        bases = trainbases
    else:
        print('no test pattern supplied -- no files held out')

    if opts.trainpattern:
        trainbases = [base for base in bases if opts.trainpattern in base]
        bases = trainbases

    bases = [
        base for base in bases
        if os.path.isfile(os.path.join(opts.excdir, base + '.wav'))
    ]

    sample_rate = None  ## will be set when first wave is opened, and others checked for consistency

    condition_name = 'data_c%s_o%s.hdf' % (opts.chunksize, opts.overlap)
    outfile = os.path.join(opts.outdir, condition_name)

    f = h5py.File(outfile, 'w')

    todo_list = [(opts.wavdir, 'wave'), (opts.excdir, 'excitation')]

    for (datadir, name) in todo_list:

        wavedata = []
        print('Reading from %s...' % (datadir))
        for base in tqdm(bases):
            fname = os.path.join(datadir, base + '.wav')
            wave, fs = soundfile.read(
                fname, dtype='int16')  ## TODO: check wave read/load @343948

            if not sample_rate:
                sample_rate = fs
            else:
                assert fs == sample_rate

            wavedata.append(wave)

        print('concatenate and reshape...')

        wavedata = np.concatenate(wavedata)
        wavedata = segment_axis(wavedata,
                                opts.chunksize,
                                overlap=opts.overlap,
                                end='cut',
                                axis=0)

        print('Write to HDF...')
        dset = f.create_dataset(name,
                                wavedata.shape,
                                dtype=wavedata.dtype,
                                track_times=False)
        dset[:, :] = wavedata
        print('Done')
        print()

    if opts.code:
        wavedata = []
        print('Adding codes...')
        datadir = opts.wavdir
        for base in tqdm(bases):
            fname = os.path.join(datadir, base + '.wav')
            wave, fs = soundfile.read(
                fname, dtype='int16')  ## TODO: check wave read/load @343948

            speaker_id = base.split('_')[-2]
            codes = np.ones(wave.shape,
                            dtype=wave.dtype) * speaker_map[speaker_id]
            wavedata.append(codes)

        print('concatenate and reshape...')
        wavedata = np.concatenate(wavedata)
        wavedata = segment_axis(wavedata,
                                opts.chunksize,
                                overlap=opts.overlap,
                                end='cut',
                                axis=0)

        print('Write to HDF...')
        dset = f.create_dataset('speaker_code',
                                wavedata.shape,
                                dtype=wavedata.dtype,
                                track_times=False)
        dset[:, :] = wavedata
        print('Done')
        print()
        print(speaker_map)
        print()

    f.close()
    print('Wrote ' + outfile)