Example #1
def make_hdf_magphase(datadir, database_fname, fftlength):

    HALFFFTLEN = (fftlength // 2) + 1
    for stream in ['mag', 'real', 'imag', 'f0']:
        assert os.path.isdir(os.path.join(datadir, stream))

    f = h5py.File(database_fname, "w")

    for magfile in sorted(glob.glob(os.path.join(datadir, 'mag/*.mag'))):
        base = basename(magfile)
        print(base)
        skip_file = False
        for stream in ['mag', 'real', 'imag', 'f0']:
            if not os.path.isfile(
                    os.path.join(datadir, stream, base + '.' + stream)):
                skip_file = True
        if skip_file:
            print('  ---> skip!')
            continue

        utt_group = f.create_group(base)
        for stream in ['mag', 'real', 'imag']:
            speech = get_speech(
                os.path.join(datadir, stream, base + '.' + stream), HALFFFTLEN)
            utt_group.create_dataset(stream, data=speech)
        f0 = get_speech(os.path.join(datadir, 'f0', base + '.f0'), 1)
        f0_interp, vuv = speech_manip.lin_interp_f0(f0)
        utt_group.create_dataset('f0_interp', data=f0_interp)
        utt_group.create_dataset('vuv', data=vuv)

    f.close()
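A minimal usage sketch, assuming the directory layout the function checks for (mag/, real/, imag/ and f0/ subdirectories under datadir); the paths and FFT length here are illustrative assumptions:

# All paths and the FFT length are invented for illustration:
make_hdf_magphase('/data/magphase_feats', '/data/magphase_utts.h5', 1024)
# The resulting HDF5 file holds one group per utterance, containing the
# datasets 'mag', 'real', 'imag', 'f0_interp' and 'vuv'.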
Example #2
 def preload_all_magphase_utts(self):
     start_time = self.start_clock('Preload magphase utts for corpus')
     for base in np.unique(self.train_filenames):
         print(base)
         mag_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'mag', base + '.mag'), FFTHALFLEN)
         real_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'real',  base + '.real'), FFTHALFLEN)
         imag_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'imag',  base + '.imag'), FFTHALFLEN)
         f0_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'f0', base + '.f0'), 1)            
         f0_interp, vuv = speech_manip.lin_interp_f0(f0_full)
         self.waveforms[base] = (mag_full, real_full, imag_full, f0_interp, vuv)
     self.stop_clock(start_time) 
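FFTHALFLEN is assumed to be a module-level constant: for MagPhase features from an N-point FFT there are N // 2 + 1 bins per frame, e.g. 513 for a 1024-point FFT, which matches the 513-wide mp_mag datasets in Example #11.

# Assumed module-level constant (the FFT length is hypothetical):
FFTHALFLEN = 1024 // 2 + 1   # = 513 bins per frame of mag/real/imag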
Example #3
 def preload_magphase_utts(self, path):
     '''
     preload utts used for a given path
     '''
     for index in path:
         if self.train_filenames[index] in self.waveforms:  ## already cached (cf. config['hold_waves_in_memory']): waves or magphase FFT spectra
             (mag_full, real_full, imag_full, f0_interp, vuv) = self.waveforms[self.train_filenames[index]]  
         else:     
             mag_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'mag', self.train_filenames[index] + '.mag'), FFTHALFLEN)
             real_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'real',  self.train_filenames[index] + '.real'), FFTHALFLEN)
             imag_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'imag',  self.train_filenames[index] + '.imag'), FFTHALFLEN)
             f0_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'f0',  self.train_filenames[index] + '.f0'), 1)            
             f0_interp, vuv = speech_manip.lin_interp_f0(f0_full)
             self.waveforms[self.train_filenames[index]] = (mag_full, real_full, imag_full, f0_interp, vuv)
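A hypothetical call, assuming path is a sequence of unit indices into self.train_filenames; utterances already cached in self.waveforms are not re-read:

# corpus.preload_magphase_utts([0, 17, 342])   # indices are illustrative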
Example #4
def get_mean(flist, dim, exclude_uv=False):
    '''
    Take mean over each coeff, to centre their trajectories around zero.
    '''
    frame_sum = np.zeros(dim)
    frame_count = 0
    for fname in flist:
        if not os.path.isfile(fname):
            continue
        print('mean: ' + fname)

        speech = get_speech(fname, dim)
        if np.sum(np.isnan(speech)) + np.sum(np.isinf(speech)) > 0:
            print('EXCLUDE ' + fname)
            continue

        if exclude_uv:
            ## remove speech where first column is <= 0.0
            speech = speech[speech[:, 0] > 0.0, :]

        frame_sum += speech.sum(axis=0)
        m, n = np.shape(speech)
        frame_count += m

    mean_vec = frame_sum / float(frame_count)
    return mean_vec, frame_count
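A usage sketch, assuming 60-dimensional .mgc files under an invented path:

import glob
flist = sorted(glob.glob('/data/mgc/*.mgc'))
mean_vec, n_frames = get_mean(flist, 60)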
Example #5
def world_synth(cmpfile, wavefile, config, denorm=False):

    stream_names = config['stream_names']
    datadims = dict(zip(stream_names, config['datadims_list']))

    datadims['vuv'] = 1
    speech = get_speech(cmpfile, sum(datadims.values()) + 1)

    #print config
    if denorm:

        speech = destandardise(speech, config)

    streams = split_into_streams(speech, stream_names, datadims)
    #print streams

    if 'lf0' in streams:
        fzero = numpy.exp(streams['lf0'])

        vuv_thresh = 0.5
        if 'vuv' in streams:
            vuv = streams['vuv']
            fzero[vuv <= vuv_thresh] = 0.0

        #fzero *= fzero_scale

        streams['lf0'] = fzero

    streams2wav(streams, wavefile, config)
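A hedged sketch of the config this function expects, inferred from the keys it reads here; the stream names, dimensions, and whatever extra keys streams2wav() needs are assumptions:

config = {
    'stream_names': ['mgc', 'lf0', 'bap'],   # hypothetical WORLD streams
    'datadims_list': [60, 1, 5],             # one dimension per stream, same order
    # ... plus any keys streams2wav() requires (sample rate etc.)
}
world_synth('/data/cmp/utt001.cmp', '/tmp/utt001.wav', config)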
Example #6
def test():
    mfccs = glob.glob(
        '/afs/inf.ed.ac.uk/user/o/owatts/sim2/oliver/slm_data_work/fls_hybrid/feat_29/world_reaper/mfcc/*.mfcc'
    )
    for mfcc in mfccs:
        wavfile = mfcc.replace('.mfcc', '.wav').replace(
            '/mfcc/', '/tmp/'
        )  # '/afs/inf.ed.ac.uk/user/o/owatts/sim2/oliver/slm_data_work/fls_hybrid/feat_29/world_reaper/tmp/AMidsummerNightsDream_011_000.wav'
        wave, sample_rate = read_wave(wavfile)
        mf = get_speech(mfcc, 13, remove_htk_header=True)
        c = get_mfcc_frame_centres(len(wave), 48000, 0.002, 0.010)
        print(c)
        print(len(c))
        print(mf.shape)
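remove_htk_header=True implies the .mfcc files begin with a standard HTK parameter-file header. For reference, that header is 12 big-endian bytes; a sketch of reading it by hand (filename invented):

import struct
with open('utt001.mfcc', 'rb') as f:
    n_samples, samp_period, samp_size, parm_kind = struct.unpack('>iihh', f.read(12))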
Example #7
def load_data(feat_dir, lab_dir, categories):
    '''
    Load MFCC features for all utterances; where a .lab file exists, also load
    frame-level labels and truncate features and labels to the same length.
    '''

    assert len(
        glob.glob(lab_dir + '/*.lab')) > 0, 'no labels in %s' % (lab_dir)

    feats = glob.glob(feat_dir + '/*.mfcc')  # [:3]
    #labs = glob.glob(lab_dir + '/*.lab')

    feats_only = []
    unlabelled_names = []
    labelled_feats = []
    all_labels = []

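    ## NB: [:6] below limits processing to the first six utterances -- apparently a debugging leftover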
    for feat in feats[:6]:
        _, base = os.path.split(feat)
        base = base.replace('.mfcc', '')
        print('loading %s' % base)

        lab = os.path.join(lab_dir, base + '.lab')

        features = get_speech(feat, 13, remove_htk_header=True)

        feats_only.append(features)
        unlabelled_names.append(base)

        if not os.path.isfile(lab):
            continue

        print('getting label for %s' % base)

        labels = read_labels(lab, categories)

        feat_frames, n = features.shape
        lab_frames, = labels.shape

        frames = min(feat_frames, lab_frames)

        features = features[:frames, :]
        labels = labels[:frames]

        labelled_feats.append(features)
        all_labels.append(labels)

    #feats_only = np.vstack(feats_only)
    labelled_feats = np.vstack(labelled_feats)
    all_labels = np.concatenate(all_labels)

    return labelled_feats, all_labels, feats_only, unlabelled_names
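A usage sketch; the paths and category inventory are invented:

categories = ['silence', 'speech']
labelled, labels, unlabelled, names = load_data('/data/mfcc', '/data/lab', categories)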
Example #8
def get_std(flist, dim, mean_vec, exclude_uv=False):
    '''
    Unlike the mean, use a single std value over all coefficients in a stream, to
    preserve the relative differences in range between coefficients within the
    stream. The value used is the largest std across the coefficients, so that
    after normalisation the widest coefficient has std 1.0 and the others scale
    down proportionally. Replicate this single value into a vector the width of
    the stream.
    '''
    diff_sum = np.zeros(dim)
    frame_count = 0
    for fname in flist:
        if not os.path.isfile(fname):
            continue
        print('std: ' + fname)

        speech = get_speech(fname, dim)
        if np.sum(np.isnan(speech)) + np.sum(np.isinf(speech)) > 0:
            print('EXCLUDE ' + fname)
            continue

        if exclude_uv:
            ## remove speech where first column is <= 0.0
            speech = speech[speech[:, 0] > 0.0, :]

        m, n = np.shape(speech)
        #mean_mat = np.tile(mean_vec,(m,1))
        mean_vec = mean_vec.reshape((1, -1))
        sq_diffs = (speech - mean_vec)**2
        diff_sum += sq_diffs.sum(axis=0)
        frame_count += m

    max_diff_sum = diff_sum.max()
    print(mean_vec.tolist())
    print(max_diff_sum.tolist())
    std_val = (max_diff_sum / float(frame_count))**0.5
    std_vec = np.ones((1, dim)) * std_val
    return std_vec
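get_mean and get_std are designed to be used together; a sketch of standardising one 60-dimensional stream (file list as in Example #4):

mean_vec, n_frames = get_mean(flist, 60)
std_vec = get_std(flist, 60, mean_vec)
normed = (get_speech(flist[0], 60) - mean_vec.reshape((1, -1))) / std_vec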
Example #9
def compose_speech(feat_dir_dict,
                   base,
                   stream_list,
                   datadims,
                   ignore_streams=['triphone']):
    '''
    Where there is trouble (e.g. a missing stream file), signal it by returning
    a 1 x 1 matrix.
    '''

    stream_list = [
        stream for stream in stream_list if stream not in ignore_streams
    ]
    # mgc_fn = os.path.join(indir, 'mgc', base+'.mgc' )
    # f0_fn = os.path.join(indir, 'f0', base+'.f0' )
    # ap_fn = os.path.join(indir, 'ap', base+'.ap' )

    stream_data_list = []
    for stream in stream_list:
        stream_fname = os.path.join(feat_dir_dict[stream], base + '.' + stream)
        if not os.path.isfile(stream_fname):
            print(stream_fname + ' does not exist')
            return np.zeros((1, 1))
        stream_data = get_speech(stream_fname, datadims[stream])
        if stream == 'aef':
            stream_data = np.vstack([
                np.zeros((1, datadims[stream])), stream_data,
                np.zeros((1, datadims[stream]))
            ])
        ### previously:
        # if stream in vuv_stream_names:
        #     uv_ix = np.arange(stream_data.shape[0])[stream_data[:,0]<=0.0]
        #     vuv = np.ones(stream_data.shape)
        #     vuv[uv_ix, :] = 0.0
        #     ## set F0 to utterance's voiced frame mean in unvoiced frames:
        #     voiced = stream_data[stream_data>0.0]
        #     if voiced.size==0:
        #         voiced_mean = 100.0 ### TODO: fix artibrary nnumber!
        #     else:
        #         voiced_mean = voiced.mean()
        #     stream_data[stream_data<=0.0] = voiced_mean
        #     stream_data_list.append(stream_data)
        #     stream_data_list.append(vuv)

        ### Now, just set unvoiced frames to -1.0 (they will be specially weighted later):
        if stream in vuv_stream_names:
            stream_data[stream_data <= 0.0] = const.special_uv_value
            stream_data_list.append(stream_data)
        else:
            stream_data_list.append(stream_data)

    ## where data has different number of frames per stream, chop off the extra frames:
    frames = [np.shape(data)[0] for data in stream_data_list]
    nframe = min(frames)
    stream_data_list = [data[:nframe, :] for data in stream_data_list]

    speech = np.hstack(stream_data_list)

    return speech
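A usage sketch with invented directories and dimensions; the 1 x 1 return value is how the function signals a missing stream file:

feat_dir_dict = {'mgc': '/data/mgc', 'lf0': '/data/lf0'}
datadims = {'mgc': 60, 'lf0': 1}
speech = compose_speech(feat_dir_dict, 'utt001', ['mgc', 'lf0'], datadims)
if speech.size == 1:
    print('could not compose utt001')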
Example #10
def main_work():

    #################################################

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-w', dest='wavfile', required=True)
    a.add_argument('-f', dest='feature_dir', required=True)
    a.add_argument('-p', dest='pm_dir', required=True)
    a.add_argument('-o', dest='outdir', required=True)
    a.add_argument('-x', dest='feature_extension', required=True)
    a.add_argument('-d', dest='feature_dim', type=int, required=True)
    a.add_argument('-s',
                   dest='fshift_seconds',
                   type=float,
                   default=0.005,
                   required=False)
    a.add_argument('-l',
                   dest='labdir',
                   default=None,
                   help='not currently used')
    a.add_argument(
        '-win',
        dest='windowing_convention',
        default='',
        help='How to determine window locations; by default, guessed from feature_extension')
    opts = a.parse_args()

    # ===============================================

    ## temporary check not to use labels:
    assert opts.labdir is None

    if not os.path.isdir(opts.outdir):
        os.makedirs(opts.outdir)

    junk, base = os.path.split(opts.wavfile)
    base = base.replace('.wav', '')

    pm_fname = os.path.join(opts.pm_dir, base + '.pm')

    feature_fname = os.path.join(opts.feature_dir,
                                 base + '.' + opts.feature_extension)

    for fname in [opts.wavfile, pm_fname, feature_fname]:
        if not os.path.isfile(fname):
            sys.exit('File does not exist: %s' % (fname))

    ## read data from files
    wave, sample_rate = read_wave(opts.wavfile)

    if opts.feature_extension == 'mfcc':
        features = get_speech(feature_fname,
                              opts.feature_dim,
                              remove_htk_header=True)
    else:
        features = get_speech(feature_fname,
                              opts.feature_dim,
                              remove_htk_header=False)

    pms_seconds = read_pm(pm_fname)

    ## Convert seconds -> waveform sample numbers:-
    pms = np.asarray(np.round(pms_seconds * sample_rate), dtype=int)
    len_wave = len(wave)

    if opts.windowing_convention:
        windowing_convention = opts.windowing_convention
    else:
        if opts.feature_extension == 'mfcc':
            windowing_convention = 'HTK'
        elif opts.feature_extension in ['formfreq', 'formband']:
            windowing_convention = 'snack'
        else:
            windowing_convention = 'world'

    if opts.feature_extension in vuv_stream_names:
        ## then we need to handle voicing decision specially:
        features, vuv = interp_fzero(features)
        ps_features = pitch_synchronous_resample(
            len_wave,
            sample_rate,
            opts.fshift_seconds,
            pms,
            features,
            windowing_convention=windowing_convention)
        ps_vuv = pitch_synchronous_resample(
            len_wave,
            sample_rate,
            opts.fshift_seconds,
            pms,
            vuv,
            int_type='nearest',
            windowing_convention=windowing_convention)
        assert ps_features.shape == ps_vuv.shape
        ## reimpose voicing decision on resampled F0:
        ps_features[ps_vuv == 0] = 0
    else:
        ps_features = pitch_synchronous_resample(
            len_wave,
            sample_rate,
            opts.fshift_seconds,
            pms,
            features,
            windowing_convention=windowing_convention)

    # ps_mgc = pitch_synchronous_resample(len_wave, sample_rate, fshift_seconds, pms, mgc)
    # ps_ap = pitch_synchronous_resample(len_wave, sample_rate, fshift_seconds, pms, ap)

    # put_speech(ps_fz, os.path.join(opts.outdir, 'f0', base + '.f0'))
    # put_speech(ps_mgc, os.path.join(opts.outdir, 'mgc', base + '.mgc'))
    # put_speech(ps_ap, os.path.join(opts.outdir, 'ap', base + '.ap'))

    put_speech(ps_features,
               os.path.join(opts.outdir, base + '.' + opts.feature_extension))

    if opts.labdir is not None:
        labfile = os.path.join(opts.labdir, base + '.lab')
        print('TODO -- labels!')
        pms_htkunit = np.asarray(np.round(pms_seconds * 10000000), dtype=int)
        label = read_label(labfile)
        assign_pm_to_labels(pms_htkunit, label)
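A hypothetical invocation; the script name and all paths are invented for illustration:

# python ps_resample.py -w wav/utt001.wav -f feats/mfcc -p pm -o out/ps_mfcc -x mfcc -d 13 -s 0.005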
Example #11
def main_work(config, overwrite_existing_data=False):

    assert config['target_representation'] == 'epoch'

    database_fname = get_data_dump_name(config)

    if os.path.isfile(database_fname):
        if not overwrite_existing_data:
            sys.exit(
                'Data already exists at %s -- run with -X to overwrite it' %
                (database_fname))
        else:
            os.remove(database_fname)

    n_train_utts = config.get('n_train_utts', 0)  ## default (0): use all sentences

    target_feat_dirs = config['target_datadirs']
    datadims_target = config['datadims_target']
    stream_list_target = config['stream_list_target']
    ## get dict mapping e.g. 'mgc' -> '/path/to/mgc/':
    target_stream_dirs = locate_stream_directories(target_feat_dirs,
                                                   stream_list_target)

    join_feat_dirs = config['join_datadirs']
    datadims_join = config['datadims_join']
    stream_list_join = config['stream_list_join']
    ## get dict mapping e.g. 'mgc' -> '/path/to/mgc/':
    join_stream_dirs = locate_stream_directories(join_feat_dirs,
                                                 stream_list_join)

    ## First, work out initial list of training utterances based on files present in first stream subdir:
    first_stream = stream_list_target[0]  ## <-- typically mgc, but not really important
    utt_list = sorted(
        glob.glob(target_stream_dirs[first_stream] + '/*.' + first_stream))
    flist = [
        os.path.split(fname)[-1].replace('.' + first_stream, '')
        for fname in utt_list
    ]

    ## Next, limit training utterances by number or by pattern:
    if isinstance(n_train_utts, int):
        if (n_train_utts == 0 or n_train_utts > len(flist)):
            n_train_utts = len(flist)
        flist = flist[:n_train_utts]
    elif isinstance(n_train_utts, str):
        match_expression = n_train_utts
        flist = [name for name in flist if match_expression in name]
        print('Selected %s utts with pattern %s' % (len(flist), match_expression))

    ## Also filter for test material, in case they are in same directory:
    if 'test_patterns' in config:
        test_flist = []
        for fname in flist:
            for pattern in config['test_patterns']:
                if pattern in fname:
                    test_flist.append(fname)
        flist = [name for name in flist if name not in test_flist]

    ## Finally, only take utterances which occur in train_list, if it is given in config:
    if 'train_list' in config:
        assert os.path.isfile(
            config['train_list']), 'File %s does not exist' % (
                config['train_list'])
        train_list = readlist(config['train_list'])
        train_list = dict(zip(train_list, train_list))
        flist = [name for name in flist if name in train_list]

    assert len(flist) > 0

    ## 1A) First pass: get mean and std per stream for each of {target,join}
    (mean_vec_target, std_vec_target) = get_mean_std(target_stream_dirs,
                                                     stream_list_target,
                                                     datadims_target, flist)
    (mean_vec_join, std_vec_join) = get_mean_std(join_stream_dirs,
                                                 stream_list_join,
                                                 datadims_join, flist)

    ## 1B) Initialise HDF5; store mean and std in HDF5:

    f = h5py.File(database_fname, "w")

    mean_target_dset = f.create_dataset("mean_target",
                                        np.shape(mean_vec_target),
                                        dtype='f',
                                        track_times=False)
    std_target_dset = f.create_dataset("std_target",
                                       np.shape(std_vec_target),
                                       dtype='f',
                                       track_times=False)
    mean_join_dset = f.create_dataset("mean_join",
                                      np.shape(mean_vec_join),
                                      dtype='f',
                                      track_times=False)
    std_join_dset = f.create_dataset("std_join",
                                     np.shape(std_vec_join),
                                     dtype='f',
                                     track_times=False)

    mean_target_dset[:] = mean_vec_target[:]
    std_target_dset[:] = std_vec_target[:]
    mean_join_dset[:] = mean_vec_join[:]
    std_join_dset[:] = std_vec_join[:]

    ## Set some values....
    target_dim = mean_vec_target.shape[0]
    join_dim = mean_vec_join.shape[0]

    target_rep_size = target_dim * target_rep_widths[config.get(
        'target_representation', 'epoch')]

    fshift_seconds = (0.001 * config['frameshift_ms'])
    fshift = int(config['sample_rate'] * fshift_seconds)
    samples_per_frame = fshift

    print('Go through data to find number of units:- ')

    n_units = 0
    new_flist = []
    first_stream, first_streamdir = sorted(target_stream_dirs.items())[0]
    for base in flist:
        featfile = os.path.join(first_streamdir, base + '.' + first_stream)
        if not os.path.exists(featfile):
            print('skipping %s' % featfile)
            continue
        speech = get_speech(featfile, datadims_target[first_stream])
        npoint, _ = speech.shape
        n_units += npoint
        new_flist.append(base)
    flist = new_flist

    print('%s units (%s)' % (n_units, config.get('target_representation', 'epoch')))

    ## 2) Get ready to store data in HDF5:
    total_target_dim = target_rep_size

    ## maxshape makes a dataset resizable
    train_dset = f.create_dataset("train_unit_features",
                                  (n_units, total_target_dim),
                                  maxshape=(n_units, total_target_dim),
                                  dtype='f',
                                  track_times=False)

    phones_dset = f.create_dataset("train_unit_names", (n_units, ),
                                   maxshape=(n_units, ),
                                   dtype='|S50',
                                   track_times=False)
    filenames_dset = f.create_dataset("filenames", (n_units, ),
                                      maxshape=(n_units, ),
                                      dtype='|S50',
                                      track_times=False)
    unit_index_within_sentence_dset = f.create_dataset(
        "unit_index_within_sentence_dset", (n_units, ),
        maxshape=(n_units, ),
        dtype='i',
        track_times=False)
    join_contexts_dset = f.create_dataset("join_contexts",
                                          (n_units + 1, join_dim),
                                          maxshape=(n_units + 1, join_dim),
                                          dtype='f',
                                          track_times=False)

    ### TODO: use?
    if config.get('store_full_magphase', False):
        mp_mag_dset = f.create_dataset("mp_mag", (n_units, 513),
                                       maxshape=(n_units, 513),
                                       dtype='f',
                                       track_times=False)
        mp_imag_dset = f.create_dataset("mp_imag", (n_units, 513),
                                        maxshape=(n_units, 513),
                                        dtype='f',
                                        track_times=False)
        mp_real_dset = f.create_dataset("mp_real", (n_units, 513),
                                        maxshape=(n_units, 513),
                                        dtype='f',
                                        track_times=False)
        mp_fz_dset = f.create_dataset("mp_fz", (n_units, 1),
                                      maxshape=(n_units, 1),
                                      dtype='f',
                                      track_times=False)

    ## Standardise data (within streams), compose, add VUV, fill F0 gaps with utterance mean voiced value:
    start = 0

    print('Composing ....')
    print(flist)
    new_flist = []
    for base in flist:

        print(base)

        #! pm_file = os.path.join(config['pm_datadir'], base + '.pm')
        # if not(os.path.isfile(pm_file)):
        #     print 'Warning: no pm -- skip!'
        #     continue

        #! ## Get pitchmarks (to join halfphones on detected GCIs):-
        # pms_seconds = read_pm(pm_file)
        # if pms_seconds.shape == (1,1):
        #     print 'Warning: trouble reading pm file -- skip!'
        #     continue

        ### Get speech params for target cost (i.e. probably re-generated speech for consistency):
        t_speech = compose_speech(target_stream_dirs, base, stream_list_target,
                                  datadims_target)
        if t_speech.shape == (1, 1):  ## bad return value (shape is a tuple; comparing with a list was always False)
            continue

        t_speech = standardise(t_speech, mean_vec_target, std_vec_target)

        ### Get speech params for join cost (i.e. probably natural speech).
        ### These are expected to have already been resampled so that they are pitch-synchronous.
        j_speech = compose_speech(join_stream_dirs, base, stream_list_join,
                                  datadims_join)
        if j_speech.size == 1:  ## bad return value
            continue
        j_speech = standardise(j_speech, mean_vec_join, std_vec_join)

        j_frames, j_dim = j_speech.shape
        # if j_frames != len(pms_seconds):
        #     print (j_frames, len(pms_seconds))
        #     print 'Warning: number of rows in join cost features not same as number of pitchmarks:'
        #     print 'these features should be pitch synchronous. Skipping utterance!'
        #     continue

        t_frames, t_dim = t_speech.shape
        if j_frames != t_frames:
            print(j_frames, t_frames)
            print('Warning: number of rows in target cost features not same as number in join cost features:')
            print(' Skipping utterance!')
            continue

        first_sentence_in_corpus = base == flist[0]
        if config.get('REPLICATE_IS2018_EXP', False):
            unit_features = t_speech[1:-1, :]  ## Representations for target cost

            if first_sentence_in_corpus:
                context_data = j_speech[:-1, :]
            else:
                context_data = j_speech[1:-1, :]
        else:  ## this should be consistent with how hi-dim frames are selected and remove a bug
            unit_features = t_speech  ## do not trim frames

            if first_sentence_in_corpus:
                initial_history = j_speech[0, :].reshape((1, -1))  ### assume that first frame is silence
                context_data = np.vstack([initial_history, j_speech])
            else:
                context_data = j_speech

        ## TODO: reinstate this?:--
        ADD_PHONETIC_EPOCH = False
        if ADD_PHONETIC_EPOCH:
            labfile = os.path.join(config['label_datadir'],
                                   base + '.' + config['lab_extension'])
            labs = read_label(labfile, config['quinphone_regex'])
            unit_names = resample_labels.pitch_synchronous_resample_label(
                48000, 0.005, pms_samples, labs)
        else:
            unit_names = ['_'] * (t_speech.shape[0])
            unit_names = np.array(unit_names)
        #
        #
        if config.get('REPLICATE_IS2018_EXP', False):
            unit_names = unit_names[1:-1]

        m, n = unit_features.shape
        filenames = [base] * m

        unit_index_within_sentence = np.arange(m)

        ## TODO: reinstate this as hi-dim writer?:--
        CHECK_MAGPHASE_SIZES = False
        if CHECK_MAGPHASE_SIZES:  # config.get('store_full_magphase', False):
            print('CHECK_MAGPHASE_SIZES')
            for extn in ['mag', 'imag', 'real', 'f0']:
                direc = extn + '_full'
                if extn == 'f0':
                    sdim = 1
                else:
                    sdim = 513
                fname = os.path.join(config['full_magphase_dir'], direc,
                                     base + '.' + extn)
                full_stream = get_speech(fname, sdim)
                #full_stream = full_stream[1:-1,:]
                print(direc)
                print(full_stream.shape)

        ## TODO: reinstate this as hi-dim writer?:--
        if config.get('store_full_magphase', False):
            mp_data = []
            for extn in ['mag', 'imag', 'real', 'f0']:
                direc = extn + '_full'
                if extn == 'f0':
                    sdim = 1
                else:
                    sdim = 513
                fname = os.path.join(config['full_magphase_dir'], direc,
                                     base + '.' + extn)
                full_stream = get_speech(fname, sdim)
                full_stream = full_stream[1:-1, :]
                print(direc)
                print(full_stream.shape)
                mp_data.append(full_stream)

        ## Add everything to database:
        train_dset[start:start + m, :] = unit_features

        phones_dset[start:start + m] = unit_names
        filenames_dset[start:start + m] = filenames
        unit_index_within_sentence_dset[start:start + m] = unit_index_within_sentence
        #! cutpoints_dset[start:start+m,:] = cutpoints

        ### join_contexts has extra initial frame of history -- deal with it:
        if first_sentence_in_corpus:
            join_contexts_dset[start:start + m + 1, :] = context_data
        else:
            join_contexts_dset[start + 1:start + m + 1, :] = context_data

        ### TODO: use?
        if config.get('store_full_magphase', False):
            (mp_mag, mp_imag, mp_real, mp_fz) = mp_data

            mp_mag_dset[start:start + m, :] = mp_mag
            mp_imag_dset[start:start + m, :] = mp_imag
            mp_real_dset[start:start + m, :] = mp_real
            mp_fz_dset[start:start + m, :] = mp_fz

        start += m
        new_flist.append(base)

    ## Number of units was computed before without considering dropped utterances, actual number
    ## will be smaller. Resize the data:
    actual_nframes = start
    print('\n\n\nNumber of units actually written:')
    print(actual_nframes)
    print()

    train_dset.resize(actual_nframes, axis=0)

    phones_dset.resize(actual_nframes, axis=0)
    filenames_dset.resize(actual_nframes, axis=0)
    unit_index_within_sentence_dset.resize(actual_nframes, axis=0)

    join_contexts_dset.resize(actual_nframes + 1, axis=0)

    ### TODO
    if config.get('store_full_magphase', False):
        mp_mag_dset.resize(actual_nframes, axis=0)
        mp_imag_dset.resize(actual_nframes, axis=0)
        mp_real_dset.resize(actual_nframes, axis=0)
        mp_fz_dset.resize(actual_nframes, axis=0)

    print()
    print('Storing hybrid voice data:')
    for thing in f.values():
        print(thing)

    f.close()

    print('Stored training data for %s sentences to %s' % (n_train_utts, database_fname))
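A hedged sketch of the minimum config this function reads; every value is illustrative, and helpers like get_data_dump_name() and locate_stream_directories() come from the surrounding codebase:

config = {
    'target_representation': 'epoch',
    'target_datadirs': '/data/predicted',    # scanned by locate_stream_directories()
    'stream_list_target': ['mgc', 'lf0'],
    'datadims_target': {'mgc': 60, 'lf0': 1},
    'join_datadirs': '/data/natural',
    'stream_list_join': ['mgc', 'lf0'],
    'datadims_join': {'mgc': 60, 'lf0': 1},
    'frameshift_ms': 5,
    'sample_rate': 48000,
}
main_work(config, overwrite_existing_data=True)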
Example #12
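This snippet assumes mfcc_dir (input) and mfcc12 / energy (output directories) were defined earlier in the script; a minimal sketch with invented paths:

mfcc_dir = '/data/mfcc'    # 13-dim MFCCs; column 0 is treated as energy below
mfcc12 = '/data/mfcc12'
energy = '/data/energy'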
if os.path.isdir(mfcc12):
    sys.exit('%s already exists'%(mfcc12))

if os.path.isdir(energy):
    sys.exit('%s already exists'%(energy))

os.makedirs(mfcc12)
os.makedirs(energy)


for mfcc_fname in sorted(glob.glob(mfcc_dir + '/*.mfcc')):
    _,base = os.path.split(mfcc_fname)
    base = base.replace('.mfcc','')
    print(base)
    speech = get_speech(mfcc_fname, 13)

    ## remove outlying values which make later standardisation of the data crazy:
    speech[speech<-100.0] = 0.0
    speech[speech>100.0] = 0.0
    
    e = speech[:,0].reshape(-1,1)
    m = speech[:,1:]
    put_speech(e, os.path.join(energy, base+'.energy'))
    put_speech(m, os.path.join(mfcc12, base+'.mfcc12'))





Example #13
# ## this is the training data as regenerated by LSTM trained on it (for target cost):
# streams_dir = '/afs/inf.ed.ac.uk/group/cstr/projects/blizzard_entries/blizzard2017/hybrid_voice/data/predicted_params/train/'

# topoutdir = '/tmp/testpad'

## --------

## HTS style labels used in Blizzard:-
hts_quinphone_regex = r'([^~]+)~([^-]+)-([^\+]+)\+([^\=]+)\=([^:]+)'
stream_list = ['mgc', 'lf0']
stream_dims = {'mgc': 60, 'lf0': 1}
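## NB: labdir, streams_dir and topoutdir are assumed to be defined earlier in
## the full script; the commented-out paths at the top suggest plausible
## values, e.g. (hypothetical):
# labdir = '/path/to/hts_labels'
# streams_dir = '/path/to/predicted_params/train'
# topoutdir = '/tmp/testpad'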

for labfname in glob.glob(labdir + '/*.lab'):
    print(labfname)

    lab = read_label(labfname, hts_quinphone_regex)

    base = basename(labfname)
    for stream in stream_list:
        stream_file = os.path.join(streams_dir, stream, base + '.' + stream)
        if not os.path.isfile(stream_file):
            print('skip!')
            continue
        speech = get_speech(stream_file, stream_dims[stream])
        speech = reinsert_terminal_silence(speech, lab)

        outdir = topoutdir + '/' + stream
        safe_makedir(outdir)
        put_speech(speech, outdir + '/' + base + '.' + stream)
Example #14
    def retrieve_magphase_frag(self, index, extra_frames=0):

        if 0:
            print('retrieving fragment')
            print(self.train_filenames[index])
            print(self.unit_index_within_sentence[index])

        if self.use_hdf_magphase:
            base = self.train_filenames[index]
            mag_full = self.hdf_magphase_pointer[base]['mag'][:]
            real_full = self.hdf_magphase_pointer[base]['real'][:]
            imag_full = self.hdf_magphase_pointer[base]['imag'][:]
            f0_interp = self.hdf_magphase_pointer[base]['f0_interp'][:]
            vuv = self.hdf_magphase_pointer[base]['vuv'][:]

        else:
            ## side effect -- data persists in self.waveforms. TODO: Protect against mem errors
            if False:  ## disabled cache lookup (see Example #3): waves or magphase FFT spectra held in self.waveforms
                (mag_full, real_full, imag_full, f0_interp, vuv) = self.waveforms[self.train_filenames[index]]  
            else:     
                mag_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'mag', self.train_filenames[index] + '.mag'), FFTHALFLEN)
                real_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'real',  self.train_filenames[index] + '.real'), FFTHALFLEN)
                imag_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'imag',  self.train_filenames[index] + '.imag'), FFTHALFLEN)
                f0_full = get_speech(os.path.join(self.config['full_magphase_dir'], 'f0',  self.train_filenames[index] + '.f0'), 1)            
                f0_interp, vuv = speech_manip.lin_interp_f0(f0_full)
                self.waveforms[self.train_filenames[index]] = (mag_full, real_full, imag_full, f0_interp, vuv)

        start_index = self.unit_index_within_sentence[index]
        #start_index -= 1  ### because magphase have extra pms beginning and end
        multiepoch = self.config.get('multiepoch', 1)
        end_index = start_index + multiepoch

        ## work out zero-padding needed if the requested extra frames run off either end:
        start_pad = 0
        end_pad = 0
        if extra_frames > 0:
            new_start_index = start_index - extra_frames
            new_end_index = end_index + extra_frames

            ## check out of bounds and record to zero pad later if necessary:
            nframes, _ = mag_full.shape

            if new_start_index < 0:
                start_pad = new_start_index * -1
            if new_end_index > nframes:
                end_pad = new_end_index - nframes 

            if start_pad > 0:
                start_index = 0
            else:
                start_index = new_start_index

            if end_pad > 0:
                end_index = nframes
            else:
                end_index = new_end_index

        if 0:
            print('start/end pad:')
            print(start_pad, end_pad)

        if 0:
            print('-----indices:  ')
            print(start_index, end_index)
            print(end_index - start_index)
            print(mag_full.shape)

        mag_frag = mag_full[start_index:end_index, :]
        real_frag = real_full[start_index:end_index, :]
        imag_frag = imag_full[start_index:end_index, :]
        f0_frag = f0_interp[start_index:end_index, :]
        # f0_frag = f0_full[start_index:end_index, :]  ## !!!!!!!!!!!!!!!!!!!!!!!!!!
        vuv_frag = vuv[start_index:end_index, :]

        ### add zero padding where needed:
        mag_frag = zero_pad_matrix(mag_frag, start_pad, end_pad)
        real_frag = zero_pad_matrix(real_frag, start_pad, end_pad)
        imag_frag = zero_pad_matrix(imag_frag, start_pad, end_pad)
        f0_frag = zero_pad_matrix(f0_frag, start_pad, end_pad)
        vuv_frag = zero_pad_matrix(vuv_frag, start_pad, end_pad)

        ## sanity check dimensions
        m, n = mag_frag.shape
        if 0:
            print(multiepoch)
            print(extra_frames)
            print(m)
        assert m == multiepoch + (extra_frames*2)


        ### add taper (weighting for cross-fade):
        if extra_frames > 0:
            mag_frag = taper_matrix(mag_frag, extra_frames*2)
            real_frag = taper_matrix(real_frag, extra_frames*2)
            imag_frag = taper_matrix(imag_frag, extra_frames*2)
            f0_frag = taper_matrix(f0_frag, extra_frames*2)
            vuv_frag = taper_matrix(vuv_frag, extra_frames*2)            


        return (mag_frag, real_frag, imag_frag, f0_frag, vuv_frag)
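A usage sketch: with multiepoch = 3 in the config and extra_frames = 2, each returned fragment has 3 + 2*2 = 7 rows (matching the assert above), with the outer two frames at each end tapered for cross-fading. The index and receiver name are illustrative:

# mag, real, imag, f0, vuv = corpus.retrieve_magphase_frag(1234, extra_frames=2)
# assert mag.shape[0] == corpus.config.get('multiepoch', 1) + 2 * 2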