def generate_wav(gen_dir, file_id_list, cfg):
    """Synthesise a waveform (and optionally a talking-head video) for every
    utterance id in ``file_id_list``.

    Per utterance the pipeline is:
      1. optional SPTK post-filtering of the mel-generalized cepstra (mgc),
      2. optional global-variance (GV) enhancement of the mgc stream,
      3. mgc/lf0/bap -> sp/f0/ap conversion and STRAIGHT (MATLAB) synthesis,
      4. optional AAM talking-head video rendering (MATLAB).

    Parameters
    ----------
    gen_dir : str
        Directory containing the generated parameter files; outputs are
        written here too (the function temporarily chdir()s into it).
    file_id_list : list of str
        Utterance base names without extension.
    cfg : configuration object
        Supplies tool paths (SPTK, STRAIGHT, WORLD, MATLAB), feature
        dimensions/extensions and the processing switches
        (do_post_filtering, apply_GV, audio, visual).

    Raises
    ------
    RuntimeError
        If both post-filtering and GV enhancement are requested.
    ValueError
        If cfg.fw_alpha names an unsupported frequency-warping scheme.
    """

    logger = logging.getLogger("wav_generation")

    # The two smoothing techniques are mutually exclusive.  Fail fast here:
    # the original code only raised (with a bare `raise`, itself an error)
    # inside the per-file loop, *after* both had already been applied to the
    # first utterance.
    if cfg.do_post_filtering and cfg.apply_GV and cfg.audio:
        logger.critical(
            'Both smoothing techniques together can\'t be applied!!\n')
        raise RuntimeError(
            'do_post_filtering and apply_GV cannot both be enabled')

    SPTK = cfg.SPTK
    #    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, basestring):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            # BUGFIX: the original called bark_alpha() here too, so the
            # 'ERB' setting silently produced Bark-scale warping.
            # NOTE(review): assumes an erb_alpha() helper exists alongside
            # bark_alpha() -- confirm before merging.
            fw_coef = erb_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.'
            )
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        # Reference (natural) and generated GV statistics: one value per
        # mgc coefficient, stored as single-column binary files.
        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:

        logger.info('creating waveform for %4d of %4d: %s' %
                    (counter, max_counter, filename))
        counter = counter + 1

        base = filename
        # All per-utterance file names, derived from the configured extensions.
        files = {
            'sp': base + cfg.sp_ext,
            'mgc': base + cfg.mgc_ext,
            'f0': base + '.f0',
            'lf0': base + cfg.lf0_ext,
            'ap': base + '.ap',
            'bap': base + cfg.bap_ext,
            'shape': base + cfg.shape_ext,
            'texture': base + cfg.texture_ext,
            'wav': base + '.wav'
        }

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        # Classic SPTK mel-cepstrum post-filter: emphasise coefficients 2..M
        # by pf_coef, then rescale the 0th cepstral coefficient so that
        # overall energy is preserved.
        if cfg.do_post_filtering and cfg.audio:
            # Per-coefficient weight vector: 1 1 pf pf ... pf
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "

            run_process('{line} | {x2x} +af > {weight}'.format(
                line=line,
                x2x=SPTK['X2X'],
                weight=os.path.join(gen_dir, 'weight')))

            # r0 of the unfiltered cepstra (energy before filtering)
            run_process(
                '{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | {c2acr} -m {co} -M 0 -l {fl} > {base_r0}'
                .format(freqt=SPTK['FREQT'],
                        order=cfg.mgc_dim - 1,
                        fw=fw_coef,
                        co=co_coef,
                        mgc=files['mgc'],
                        c2acr=SPTK['C2ACR'],
                        fl=fl_coef,
                        base_r0=files['mgc'] + '_r0'))

            # r0 of the weighted cepstra (energy after filtering)
            run_process(
                '{vopr} -m -n {order} < {mgc} {weight} | {freqt} -m {order} -a {fw} -M {co} -A 0 | {c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'
                .format(vopr=SPTK['VOPR'],
                        order=cfg.mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        freqt=SPTK['FREQT'],
                        fw=fw_coef,
                        co=co_coef,
                        c2acr=SPTK['C2ACR'],
                        fl=fl_coef,
                        base_p_r0=files['mgc'] + '_p_r0'))

            # b0 (0th MLSA filter coefficient) of the weighted cepstra
            run_process(
                '{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 0 -e 0 > {base_b0}'
                .format(vopr=SPTK['VOPR'],
                        order=cfg.mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=SPTK['MC2B'],
                        fw=fw_coef,
                        bcp=SPTK['BCP'],
                        base_b0=files['mgc'] + '_b0'))

            # Corrected b0 = b0 + 0.5*ln(r0/p_r0): restores original energy
            run_process(
                '{vopr} -d < {base_r0} {base_p_r0} | {sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'
                .format(vopr=SPTK['VOPR'],
                        base_r0=files['mgc'] + '_r0',
                        base_p_r0=files['mgc'] + '_p_r0',
                        sopr=SPTK['SOPR'],
                        base_b0=files['mgc'] + '_b0',
                        base_p_b0=files['mgc'] + '_p_b0'))

            # Merge corrected b0 with weighted coefficients 1..M and convert
            # back to mel-cepstra -> final post-filtered mgc stream.
            run_process(
                '{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 1 -e {order} | {merge} -n {order2} -s 0 -N 0 {base_p_b0} | {b2mc} -m {order} -a {fw} > {base_p_mgc}'
                .format(vopr=SPTK['VOPR'],
                        order=cfg.mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=SPTK['MC2B'],
                        fw=fw_coef,
                        bcp=SPTK['BCP'],
                        merge=SPTK['MERGE'],
                        order2=cfg.mgc_dim - 2,
                        base_p_b0=files['mgc'] + '_p_b0',
                        b2mc=SPTK['B2MC'],
                        base_p_mgc=files['mgc'] + '_p_mgc'))

            mgc_file_name = files['mgc'] + '_p_mgc'

        # GV enhancement: scale per-utterance coefficient variance towards
        # the reference (natural speech) global variance.
        if (cfg.vocoder_type == "STRAIGHT" or cfg.vocoder_type
                == "STRAIGHT_M_TRIAL") and cfg.apply_GV and cfg.audio:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(
                mgc_file_name, cfg.mgc_dim)

            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

            # Map this utterance's std through the gen->ref GV statistics.
            local_gv = (ref_gv_std / gen_gv_std) * (gen_std -
                                                    gen_gv_mean) + ref_gv_mean

            # Rescale deviations from the per-utterance mean by local_gv/std.
            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(
                gen_std, frame_number, 1).T * (gen_mgc - np.repeat(
                    gen_mu, frame_number, 1).T) + np.repeat(
                        gen_mu, frame_number, 1).T

            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)

            mgc_file_name = files['mgc'] + '_p_mgc'

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT' and cfg.audio:
            # mgc -> linear spectrum (|H(z)|, -o 2 gives magnitude)
            run_process(
                '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                .format(mgc2sp=SPTK['MGC2SP'],
                        alpha=cfg.fw_alpha,
                        order=cfg.mgc_dim - 1,
                        fl=cfg.fl,
                        mgc=mgc_file_name,
                        sp=files['sp']))
            # lf0 -> f0 (exp), mapping the unvoiced magic value -1e10 to 0
            run_process(
                '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(
                    sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'],
                                                        f0=files['f0'],
                                                        f0a=files['f0'] +
                                                        '.a'))

            if cfg.use_cep_ap:
                # aperiodicity modelled as cepstra -> full spectrum
                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                    .format(mgc2sp=SPTK['MGC2SP'],
                            alpha=cfg.fw_alpha,
                            order=cfg.bap_dim - 1,
                            fl=cfg.fl,
                            bap=files['bap'],
                            ap=files['ap']))
            else:
                # band aperiodicity -> full-band aperiodicity
                run_process('{bndap2ap} {bap} > {ap}'.format(
                    bndap2ap=STRAIGHT['BNDAP2AP'],
                    bap=files['bap'],
                    ap=files['ap']))

            # Frame count: the f0 file is float32, i.e. 4 bytes per frame.
            # (Floor division keeps this an int under Python 3 as well.)
            size = os.path.getsize(files['f0'])
            size = size // 4

            straight_normalization = 1024.0 / (2200.0 * 32768.0)

            # Frame shift in ms for the STRAIGHT MATLAB synthesiser.
            spectralUpdateInterval = 1000.0 * cfg.shift / cfg.sr

            # Emit a one-shot MATLAB script that reads sp/ap/f0 and calls
            # exstraightsynth, then run it through cfg.MATLAB_COMMAND.
            synth_straight_file_name = base + '_synth.m'
            synth_straight_file = open(synth_straight_file_name, "w")
            synth_straight_file.write("addpath(path,'%s');\n" %
                                      cfg.STRAIGHT_DIR)
            synth_straight_file.write("prm.spectralUpdateInterval = %f;\n" %
                                      spectralUpdateInterval)
            synth_straight_file.write(
                "prm.levelNormalizationIndicator = 0;\n\n")
            synth_straight_file.write("fprintf(1,'\\nSynthesizing %s\\n');\n" %
                                      files['wav'])
            synth_straight_file.write("fid1 = fopen('%s','r','%s');\n" %
                                      (files['sp'], "ieee-le"))
            synth_straight_file.write("fid2 = fopen('%s','r','%s');\n" %
                                      (files['ap'], "ieee-le"))
            synth_straight_file.write("fid3 = fopen('%s','r','%s');\n" %
                                      (files['f0'], "ieee-le"))
            synth_straight_file.write("sp = fread(fid1,[%d, %d],'float');\n" %
                                      (cfg.sp_dim, size))
            synth_straight_file.write("ap = fread(fid2,[%d, %d],'float');\n" %
                                      (cfg.sp_dim, size))
            synth_straight_file.write("f0 = fread(fid3,[%d, %d],'float');\n" %
                                      (1, size))
            synth_straight_file.write("fclose(fid1);\n")
            synth_straight_file.write("fclose(fid2);\n")
            synth_straight_file.write("fclose(fid3);\n")

            synth_straight_file.write(
                "sp = sp/32768.0;\n")  # we use this normalization now

            synth_straight_file.write(
                "[sy] = exstraightsynth(f0,sp,ap,%d,prm);\n" % cfg.sr)
            synth_straight_file.write("wavwrite( sy, %d, '%s');\n\n" %
                                      (cfg.sr, files['wav']))
            synth_straight_file.write("quit;\n")

            synth_straight_file.close()
            os.system("%s < %s" %
                      (cfg.MATLAB_COMMAND, synth_straight_file_name))

        if cfg.visual == True:

            #   generate talking head
            synth_straight_file_name = base + '_vid_synth.m'
            synth_straight_file = open(synth_straight_file_name, "w")

            # Prepend HTK headers so the AAM tools can read the parameter
            # streams (4 bytes per float coefficient).
            bytes_per_frame_shape = cfg.shape_dim * 4
            output_name_shape = base + '.shape_h'
            run_process(
                '/usr/bin/perl %s %d %d %d 9 %s > %s' %
                (cfg.addhtkheader, cfg.sr, cfg.shift, bytes_per_frame_shape,
                 files['shape'], output_name_shape))

            bytes_per_frame_texture = cfg.texture_dim * 4
            output_name_texture = base + '.texture_h'
            run_process(
                '/usr/bin/perl %s %d %d %d 9 %s > %s' %
                (cfg.addhtkheader, cfg.sr, cfg.shift, bytes_per_frame_texture,
                 files['texture'], output_name_texture))

            synth_straight_file.write("cd %s;\n" % cfg.aam_tools_path)
            synth_straight_file.write("parpool;\n")
            synth_straight_file.write("addpath mex\n")
            synth_straight_file.write("addpath utils\n")
            synth_straight_file.write("params.num_sh = %d;\n" % cfg.shape_dim)
            synth_straight_file.write("params.num_te = %d;\n" %
                                      cfg.texture_dim)
            # NTSC video frame rate
            synth_straight_file.write("params.fps = %f;\n" % 29.97)
            synth_straight_file.write("figure('Visible','Off')\n")
            synth_straight_file.write(
                "convert_to_vid('%s', '%s', '%s', '%s', '%s', '%s', params);\n"
                % (cfg.aam_model, os.path.join(gen_dir, output_name_shape),
                   os.path.join(gen_dir, output_name_texture),
                   os.path.join(gen_dir, (base + '.wav')),
                   os.path.join(gen_dir, (base + '.mkv')), gen_dir))

            # synth_straight_file.write("pause(1);\n")
            synth_straight_file.write("quit;\n")

            synth_straight_file.close()

            os.system("%s < %s" %
                      (cfg.MATLAB_COMMAND_V, synth_straight_file_name))

        os.chdir(cur_dir)
# --- Example #2 (scraped-snippet separator) ---
def dnn_generation(valid_file_list,
                   nnets_file_name,
                   n_ins,
                   n_outs,
                   out_file_list,
                   target_mean_vector,
                   target_std_vector,
                   out_dimension_dict,
                   file_extension_dict,
                   vocoder='straight'):
    """Run a trained mixture-density DNN over a list of label files and write
    per-stream acoustic parameter files.

    For each input file: load the binary label features, pick the most likely
    mixture component per frame, de-normalise the predicted mean/variance,
    run MLPG per feature stream, gate lf0/F0 by the vuv stream, and write one
    binary output file per stream.

    Parameters
    ----------
    valid_file_list : list of str
        Input binary label files (float32, n_ins columns).
    nnets_file_name : str
        Pickled trained model exposing parameter_prediction* methods.
    n_ins, n_outs : int
        Input/output dimensionality of the network.
    out_file_list : list of str
        Output paths; only dirname/basename are used to build per-stream names.
    target_mean_vector, target_std_vector : array-like
        Output de-normalisation statistics.
    out_dimension_dict, file_extension_dict : dict
        Per-stream dimensionalities and file extensions.
    vocoder : str
        'straight' or 'glotthmm'; selects which streams are generated.
    """
    logger = logging.getLogger("dnn_generation")
    logger.debug('Starting dnn_generation')

    # Magic value written into unvoiced lf0/F0 frames.
    inf_float = -1.0e+10

    plotlogger = logging.getLogger("plotting")

    # (Removed a dangling no-op statement `cfg.gen_wav_features` that relied
    # on a module-level cfg and had no effect.)
    if vocoder == 'straight':
        gen_wav_features = ['mgc', 'lf0', 'bap']
    elif vocoder == 'glotthmm':
        gen_wav_features = ['F0', 'Gain', 'HNR', 'LSF',
                            'LSFsource']  ## TODO: take this from config
    else:
        sys.exit('unsupported vocoder %s !' % (vocoder))

    # Map each stream name to its starting column in the network output.
    stream_start_index = {}
    dimension_index = 0
    for feature_name in list(out_dimension_dict.keys()):
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    # NOTE(review): unpickling executes arbitrary code -- only load trusted
    # model files.
    dnn_model = pickle.load(open(nnets_file_name, 'rb'))

    file_number = len(valid_file_list)
    io_funcs = BinaryIOCollection()

    for i in range(file_number):
        logger.info('generating %4d of %4d: %s' %
                    (i + 1, file_number, valid_file_list[i]))
        fid_lab = open(valid_file_list[i], 'rb')
        features = numpy.fromfile(fid_lab, dtype=numpy.float32)
        fid_lab.close()
        # Truncate any trailing partial frame, then reshape to (frames, n_ins).
        # (// keeps integer division correct under Python 3 too.)
        features = features[:(n_ins * (features.size // n_ins))]
        features = features.reshape((-1, n_ins))

        frame_number = features.shape[0]

        test_set_x = theano.shared(
            numpy.asarray(features, dtype=theano.config.floatX))

        mean_matrix = numpy.tile(target_mean_vector, (features.shape[0], 1))
        std_matrix = numpy.tile(target_std_vector, (features.shape[0], 1))

        # Pick the most probable mixture component per frame, ...
        predicted_mix = dnn_model.parameter_prediction_mix(
            test_set_x=test_set_x)
        max_index = numpy.argmax(predicted_mix, axis=1)

        # ... then slice that component's mean and variance out of the
        # concatenated per-mixture outputs.
        temp_predicted_mu = dnn_model.parameter_prediction(
            test_set_x=test_set_x)
        temp_predicted_sigma = dnn_model.parameter_prediction_sigma(
            test_set_x=test_set_x)
        predicted_mu = numpy.zeros((temp_predicted_mu.shape[0], n_outs))
        predicted_sigma = numpy.zeros((temp_predicted_sigma.shape[0], n_outs))
        for kk in range(temp_predicted_mu.shape[0]):
            predicted_mu[kk, :] = temp_predicted_mu[kk, max_index[kk] *
                                                    n_outs:(max_index[kk] +
                                                            1) * n_outs]
            predicted_sigma[kk, :] = temp_predicted_sigma[
                kk, max_index[kk] * n_outs:(max_index[kk] + 1) * n_outs]
        # De-normalise: the network was trained on standardised targets.
        predicted_mu = predicted_mu * std_matrix + mean_matrix
        predicted_sigma = ((predicted_sigma**0.5) * std_matrix)**2

        dir_name = os.path.dirname(out_file_list[i])
        file_id = os.path.splitext(os.path.basename(out_file_list[i]))[0]

        mlpg = MLParameterGenerationFast()
        for feature_name in gen_wav_features:
            current_features = predicted_mu[:,
                                            stream_start_index[feature_name]:
                                            stream_start_index[feature_name] +
                                            out_dimension_dict[feature_name]]
            current_sigma = predicted_sigma[:,
                                            stream_start_index[feature_name]:
                                            stream_start_index[feature_name] +
                                            out_dimension_dict[feature_name]]

            # Each stream carries static + delta + delta-delta, hence /3.
            gen_features = mlpg.generation(
                current_features, current_sigma,
                out_dimension_dict[feature_name] // 3)

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = predicted_mu[:, stream_start_index['vuv']:
                                               stream_start_index['vuv'] + 1]
                    # BUGFIX: the original reused `i` here, shadowing the
                    # outer file index (harmless but error-prone).
                    for frame_idx in range(frame_number):
                        if vuv_feature[frame_idx, 0] < 0.5:
                            gen_features[frame_idx, 0] = inf_float
            new_file_name = os.path.join(
                dir_name, file_id + file_extension_dict[feature_name])

            io_funcs.array_to_binary_file(gen_features, new_file_name)
    def acoustic_decomposition(self,
                               in_file_list,
                               dimension,
                               out_dimension_dict,
                               file_extension_dict,
                               var_file_dict,
                               do_MLPG=True,
                               cfg=None,
                               meta=None):
        """Split composed acoustic parameter files into per-stream files,
        optionally running MLPG and enforcing silence.

        For each input file: load the (frames x dimension) matrix, slice out
        each stream in self.gen_wav_features, optionally override lf0 from
        note metadata, run MLPG (unless do_MLPG is False), gate lf0/F0 by the
        vuv stream, optionally zero/mute frames aligned to silence labels,
        and write one binary file per stream.

        Parameters
        ----------
        in_file_list : list of str
            Composed acoustic parameter files.
        dimension : int
            Total feature dimensionality per frame.
        out_dimension_dict : dict
            Stream name -> dimensionality (static+delta+delta-delta).
        file_extension_dict : dict
            Stream name -> output file extension.
        var_file_dict : dict
            Stream name -> variance file (loaded via load_covariance).
        do_MLPG : bool
            If False, streams are written without parameter generation.
        cfg : configuration object, optional
            Needed only when self.enforce_silence is set.
        meta : iterable, optional
            Syllable/note metadata used to override lf0 (singing synthesis;
            each note is (pitch, duration-in-frames)).
        """
        logger = logging.getLogger('param_generation')

        logger.debug('acoustic_decomposition for %d files' % len(in_file_list))

        self.load_covariance(var_file_dict, out_dimension_dict)

        # Map each stream name to its starting column in the composed matrix.
        # Iteration order is sorted(), so it must match how the file was
        # composed.
        stream_start_index = {}
        dimension_index = 0
        for feature_name in sorted(out_dimension_dict.keys()):
            stream_start_index[feature_name] = dimension_index
            dimension_index += out_dimension_dict[feature_name]

        io_funcs = BinaryIOCollection()

        mlpg_algo = MLParameterGeneration()

        findex = 0
        flen = len(in_file_list)
        for file_name in in_file_list:

            findex = findex + 1

            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(
                file_name, dimension)
            logger.info('processing %4d of %4d: %s' %
                        (findex, flen, file_name))

            for feature_name in self.gen_wav_features:

                logger.debug(' feature: %s' % feature_name)

                current_features = features[:,
                                            stream_start_index[feature_name]:
                                            stream_start_index[feature_name] +
                                            out_dimension_dict[feature_name]]
                if FAST_MLPG:
                    ### fast version wants variance per frame, not single global one:
                    var = self.var[feature_name]
                    var = numpy.transpose(numpy.tile(var, frame_number))
                else:
                    var = self.var[feature_name]

                # Singing synthesis: overwrite static lf0 with note pitches,
                # note[0] = pitch (lf0), note[1] = duration in frames.
                # NOTE(review): cur_ind starts at 60 -- presumably skipping a
                # leading silence of 60 frames; confirm against the caller.
                if feature_name == 'lf0' and meta is not None:
                    cur_ind = 60
                    for syllable in meta:
                        for note in syllable['notes']:
                            current_features[int(cur_ind):int(cur_ind) +
                                             int(note[1]), 0] = note[0]
                            cur_ind += note[1]

                if do_MLPG == False:
                    gen_features = current_features
                else:
                    # Each stream is static + delta + delta-delta, hence //3.
                    gen_features = mlpg_algo.generation(
                        current_features, var,
                        out_dimension_dict[feature_name] // 3)

                logger.debug(' feature dimensions: %d by %d' %
                             (gen_features.shape[0], gen_features.shape[1]))

                if feature_name in ['lf0', 'F0']:
                    if 'vuv' in stream_start_index:
                        vuv_feature = features[:, stream_start_index['vuv']:
                                               stream_start_index['vuv'] + 1]

                        # Mark frames unvoiced when vuv < 0.5 or the generated
                        # pitch falls below 20 Hz (log scale).
                        for i in range(frame_number):
                            if vuv_feature[i, 0] < 0.5 or gen_features[
                                    i, 0] < numpy.log(20):
                                gen_features[i, 0] = self.inf_float

                new_file_name = os.path.join(
                    dir_name, file_id + file_extension_dict[feature_name])

                if self.enforce_silence:
                    silence_pattern = cfg.silence_pattern
                    label_align_dir = cfg.in_label_align_dir
                    # BUGFIX: the label file was never closed; use a context
                    # manager.
                    with open(label_align_dir + '/' + file_id + '.lab',
                              'r') as in_f:
                        for line in in_f.readlines():
                            line = line.strip()

                            if len(line) < 1:
                                continue
                            temp_list = re.split(r'\s+', line)
                            # HTK times are in 100 ns units; *1e-4 -> ms,
                            # /5 -> 5 ms frame index.
                            # NOTE(review): 5 ms frame shift is hard-coded.
                            start_time = int(int(temp_list[0]) * (10**-4) / 5)
                            end_time = int(int(temp_list[1]) * (10**-4) / 5)

                            full_label = temp_list[2]

                            label_binary_flag = self.check_silence_pattern(
                                full_label, silence_pattern)

                            if label_binary_flag:
                                # Pitch-like streams are muted to the unvoiced
                                # magic value; everything else is zeroed.
                                if feature_name in ['lf0', 'F0', 'mag']:
                                    gen_features[
                                        start_time:end_time, :] = self.inf_float
                                else:
                                    gen_features[start_time:end_time, :] = 0.0

                io_funcs.array_to_binary_file(gen_features, new_file_name)
                logger.debug(' wrote to file %s' % new_file_name)
# --- Example #4 (scraped-snippet separator) ---
    def modify_dur_from_phone_alignment_labels(self, label_file_name,
                                               gen_dur_file_name,
                                               gen_lab_file_name):
        """Rewrite a phone alignment label file using predicted durations.

        Silence phones keep their original alignment duration; every other
        phone gets its duration from the predicted duration file (one value
        per non-silence phone, consumed in order). Times are accumulated so
        the output labels remain contiguous.

        Parameters
        ----------
        label_file_name : str
            Input HTK-style alignment label file (start end label per line).
        gen_dur_file_name : str
            Binary file of predicted durations (frames), dimension 1.
        gen_lab_file_name : str
            Output label file with modified times.
        """
        logger = logging.getLogger("dur")

        dur_dim = 1  # one predicted duration value per phone

        io_funcs = BinaryIOCollection()
        dur_features, frame_number = io_funcs.load_binary_file_frame(
            gen_dur_file_name, dur_dim)

        # BUGFIX: the input and output files were never closed; use context
        # managers.
        with open(label_file_name) as fid:
            utt_labels = fid.readlines()

        label_number = len(utt_labels)
        logger.info('loaded %s, %3d labels' % (label_file_name, label_number))

        current_index = 0
        prev_end_time = 0
        with open(gen_lab_file_name, 'w') as out_fid:
            for line in utt_labels:
                line = line.strip()

                if len(line) < 1:
                    continue
                temp_list = re.split(r'\s+', line)

                if len(temp_list) == 1:
                    # Label-only line: no alignment times available.
                    start_time = 0
                    end_time = 3000000  ## hard-coded silence duration
                    full_label = temp_list[0]
                else:
                    start_time = int(temp_list[0])
                    end_time = int(temp_list[1])
                    full_label = temp_list[2]

                label_binary_flag = self.check_silence_pattern(full_label)

                if label_binary_flag == 1:
                    # Silence: keep the aligned duration; predicted durations
                    # do not cover silence, so current_index is NOT advanced.
                    current_phone_dur = end_time - start_time
                    out_fid.write(
                        str(prev_end_time) + ' ' +
                        str(prev_end_time + current_phone_dur) + ' ' +
                        full_label + '\n')
                    prev_end_time = prev_end_time + current_phone_dur
                    continue
                else:
                    # Predicted duration is in 5 ms frames; convert to HTK
                    # 100 ns units (5 * 10000).
                    phone_dur = dur_features[current_index]
                    phone_dur = int(phone_dur) * 5 * 10000
                    out_fid.write(
                        str(prev_end_time) + ' ' +
                        str(prev_end_time + phone_dur) + ' ' + full_label +
                        '\n')
                    prev_end_time = prev_end_time + phone_dur

                current_index += 1

        logger.debug(
            'modifed label with predicted duration of %d frames x %d features'
            % dur_features.shape)
# --- Example #5 (scraped-snippet separator) ---
def read_data_from_file_list(inp_file_list,
                             out_file_list,
                             inp_dim,
                             out_dim,
                             sequential_training=True):
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in xrange(num_of_utt):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(
            inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(
            out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        if abs(inp_frame_number - out_frame_number) > 5:
            print 'the number of frames in input and output features are different: %d vs %d (%s)' % (
                inp_frame_number, out_frame_number, base_file_name)
            sys.exit(0)
        else:
            frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            temp_set_x[current_index:current_index +
                       frame_number, ] = inp_features[0:frame_number]
            temp_set_y[current_index:current_index +
                       frame_number, ] = out_features[0:frame_number]
            current_index += frame_number

        if frame_number not in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            file_length_dict['framenum2utt'][frame_number].append(
                base_file_name)

        file_length_dict['utt2framenum'][base_file_name] = frame_number

        drawProgressBar(i + 1, num_of_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
# --- Example #6 (scraped-snippet separator) ---
    def load_next_batch(self):
        """Load the next training batch and return it as shared variables.

        Reads utterances from self.x_files_list / self.y_files_list into two
        pre-sized frame buffers, then reshapes them into
        (num_of_samples, seq_length, dims) tensors.

        Sequence length selection depends on self.training_algo:
          1 - max sequence length of the current batch,
          2 - max sequence length of the current bucket (bucketed batching),
          3 - fixed, from default/user configuration.

        Returns
        -------
        (shared_set_xy, temp_set_x, temp_set_y)
            shared_set_xy is an (x, y) pair of shared variables (via
            self.make_shared); temp_set_x / temp_set_y are the underlying
            numpy arrays.

        Side effects: advances self.file_index (and, for algo 2,
        self.bucket_file_index / self.current_bucket_size) and sets
        self.end_reading / resets self.file_index when the file list is
        exhausted.
        """
        io_funcs = BinaryIOCollection()

        ## set sequence length for batch training
        if (self.training_algo == 1):
            # set seq length to maximum seq length from current batch
            self.set_seq_length_from_current_batch()
        elif (self.training_algo == 2):
            # set seq length to maximum seq length from current bucket
            # (skip empty buckets until a non-empty one is found)
            while not self.current_bucket_size:
                self.get_next_bucket()
        elif (self.training_algo == 3):
            # seq length is set based on default/user configuration
            pass

        # Frame buffers; trimmed to a multiple of seq_length at the end.
        temp_set_x = numpy.zeros((self.buffer_size, self.n_ins))
        temp_set_y = numpy.zeros((self.buffer_size, self.n_outs))

        ### read file by file ###
        current_index = 0
        while True:
            if current_index >= self.buffer_size:
                print('buffer size reached by file index %d' %
                      (self.file_index))
                break

            if self.training_algo == 2:
                # choose utterance from current bucket list
                base_file_name = self.current_bucket_list[
                    self.bucket_file_index]
                self.utt_index = self.file_length_dict['utt2index'][
                    base_file_name]
            else:
                # choose utterance randomly from current file list
                #self.utt_index = numpy.random.randint(self.list_size)
                ## choose utterance in serial order
                self.utt_index = self.file_index
                base_file_name = os.path.basename(
                    self.x_files_list[self.utt_index]).split('.')[0]

            in_features, lab_frame_number = io_funcs.load_binary_file_frame(
                self.x_files_list[self.utt_index], self.n_ins)
            out_features, out_frame_number = io_funcs.load_binary_file_frame(
                self.y_files_list[self.utt_index], self.n_outs)

            # Trusted frame count comes from the precomputed length dict,
            # not from the files just read.
            frame_number = self.file_length_dict['utt2framenum'][
                base_file_name]

            temp_set_x[current_index:current_index +
                       frame_number, ] = in_features
            temp_set_y[current_index:current_index +
                       frame_number, ] = out_features
            current_index += frame_number

            # After every merge_size files, pad current_index up to the next
            # multiple of seq_length so merged utterances stay aligned to
            # sequence boundaries (the zero-initialised buffer provides the
            # padding frames).
            if ((self.file_index + 1) % self.merge_size == 0):
                num_of_samples = int(
                    numpy.ceil(float(current_index) / float(self.seq_length)))
                current_index = self.seq_length * num_of_samples

            self.file_index += 1

            # break for any of the below conditions
            if self.training_algo == 2:
                self.bucket_file_index += 1
                if (self.bucket_file_index >= self.current_bucket_size):
                    # bucket exhausted; mark it empty so the next call moves on
                    self.current_bucket_size = 0
                    break
                if (self.bucket_file_index % self.batch_size == 0):
                    break
            else:
                if (self.file_index % self.batch_size
                        == 0) or (self.file_index >= self.list_size):
                    break

        # Epoch boundary: signal the caller and rewind for the next epoch.
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        # Trim to a whole number of fixed-length sequences and reshape to
        # (samples, seq_length, dims).
        num_of_samples = int(
            numpy.ceil(float(current_index) / float(self.seq_length)))

        temp_set_x = temp_set_x[0:num_of_samples * self.seq_length, ]
        temp_set_y = temp_set_y[0:num_of_samples * self.seq_length, ]

        temp_set_x = temp_set_x.reshape(num_of_samples, self.seq_length,
                                        self.n_ins)
        temp_set_y = temp_set_y.reshape(num_of_samples, self.seq_length,
                                        self.n_outs)

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y
# --- Example #7 (scraped-snippet separator) ---
def wavgen_straight_type_vocoder(gen_dir, file_id_list, cfg, logger):
    '''
    Waveform generation with STRAIGHT or WORLD vocoders.
    (whose acoustic parameters are: mgc, bap, and lf0)

    For each file id the generated parameters (mgc / bap / lf0) are
    converted with SPTK tools into the spectra the selected vocoder
    (cfg.vocoder_type: 'STRAIGHT' or 'WORLD') expects, and a .wav file is
    synthesised next to the parameter files inside gen_dir.  Intermediate
    files are removed after synthesis.

    :param gen_dir: directory containing the generated parameter files;
        waveforms are written there as well.
    :param file_id_list: list of file basenames (without extension).
    :param cfg: configuration object providing tool paths (SPTK, STRAIGHT,
        WORLD), feature extensions/dimensions and synthesis settings.
    :param logger: logger used for progress and error reporting.
    :raises RuntimeError: if both post-filtering and GV enhancement are
        enabled -- the two smoothing techniques are mutually exclusive.
    :raises ValueError: if cfg.fw_alpha names an unknown warping scheme.
    '''

    SPTK = cfg.SPTK
    #    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            # NOTE(review): 'ERB' currently maps to bark_alpha as well; if an
            # erb_alpha helper exists it is probably the intended call --
            # confirm before changing.
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.'
            )
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    # Fail fast: post-filtering and GV-based enhancement cannot be combined.
    # Previously this clash was only detected inside the per-file loop --
    # after post-filtering had already been applied to the first file -- and
    # used a bare `raise` with no active exception, which itself raises a
    # confusing "No active exception to re-raise" RuntimeError.
    if cfg.do_post_filtering and cfg.apply_GV:
        logger.critical(
            'Both smoothing techniques together can\'t be applied!!\n')
        raise RuntimeError(
            'Both smoothing techniques together can\'t be applied!!')

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:

        logger.info('creating waveform for %4d of %4d: %s' %
                    (counter, max_counter, filename))
        counter = counter + 1
        base = filename
        files = {
            'sp': base + cfg.sp_ext,
            'mgc': base + cfg.mgc_ext,
            'f0': base + '.f0',
            'lf0': base + cfg.lf0_ext,
            'ap': base + '.ap',
            'bap': base + cfg.bap_ext,
            'wav': base + '.wav'
        }

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        # Run the SPTK/vocoder pipeline from inside gen_dir; restore the
        # working directory even if a subprocess fails.
        cur_dir = os.getcwd()
        os.chdir(gen_dir)
        try:
            ### post-filtering
            if cfg.do_post_filtering:
                mgc_file_name = files['mgc'] + '_p_mgc'
                post_filter(files['mgc'], mgc_file_name, cfg.mgc_dim, pf_coef,
                            fw_coef, co_coef, fl_coef, gen_dir, cfg)

            ### global-variance enhancement of the spectral envelope
            if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
                gen_mgc, frame_number = io_funcs.load_binary_file_frame(
                    mgc_file_name, cfg.mgc_dim)

                gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
                gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

                local_gv = (ref_gv_std / gen_gv_std) * (
                    gen_std - gen_gv_mean) + ref_gv_mean

                # Rescale each mgc dimension around its mean so its variance
                # matches the GV-predicted local variance.
                enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(
                    gen_std, frame_number, 1).T * (gen_mgc - np.repeat(
                        gen_mu, frame_number, 1).T) + np.repeat(
                            gen_mu, frame_number, 1).T

                new_mgc_file_name = files['mgc'] + '_p_mgc'
                io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)

                mgc_file_name = files['mgc'] + '_p_mgc'

            ###mgc to sp to wav
            if cfg.vocoder_type == 'STRAIGHT':
                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                    .format(mgc2sp=SPTK['MGC2SP'],
                            alpha=cfg.fw_alpha,
                            order=cfg.mgc_dim - 1,
                            fl=cfg.fl,
                            mgc=mgc_file_name,
                            sp=files['sp']))
                # lf0 -> linear f0 (magic value marks unvoiced frames)
                run_process(
                    '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(
                        sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
                run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'],
                                                            f0=files['f0'],
                                                            f0a=files['f0'] +
                                                            '.a'))

                if cfg.use_cep_ap:
                    # aperiodicity modelled as cepstra: decode with mgc2sp
                    run_process(
                        '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                        .format(mgc2sp=SPTK['MGC2SP'],
                                alpha=cfg.fw_alpha,
                                order=cfg.bap_dim - 1,
                                fl=cfg.fl,
                                bap=files['bap'],
                                ap=files['ap']))
                else:
                    # band aperiodicity: expand with STRAIGHT's bndap2ap
                    run_process('{bndap2ap} {bap} > {ap}'.format(
                        bndap2ap=STRAIGHT['BNDAP2AP'],
                        bap=files['bap'],
                        ap=files['ap']))

                run_process(
                    '{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 1.2 -cornf 4000 -float -apfile {ap} {f0a} {sp} {wav}'
                    .format(synfft=STRAIGHT['SYNTHESIS_FFT'],
                            sr=cfg.sr,
                            fl=cfg.fl,
                            shift=cfg.shift,
                            ap=files['ap'],
                            f0a=files['f0'] + '.a',
                            sp=files['sp'],
                            wav=files['wav']))

                run_process('rm -f {sp} {f0} {f0a} {ap}'.format(sp=files['sp'],
                                                                f0=files['f0'],
                                                                f0a=files['f0'] +
                                                                '.a',
                                                                ap=files['ap']))
            elif cfg.vocoder_type == 'WORLD':

                run_process(
                    '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'
                    .format(sopr=SPTK['SOPR'],
                            lf0=files['lf0'],
                            x2x=SPTK['X2X'],
                            f0=files['f0']))

                run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
                    sopr=SPTK['SOPR'],
                    bap=files['bap'],
                    x2x=SPTK['X2X'],
                    ap=files['ap']))

                ### If using world v2, please comment above line and uncomment this
                # run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
                #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                    .format(mgc2sp=SPTK['MGC2SP'],
                            alpha=cfg.fw_alpha,
                            order=cfg.mgc_dim - 1,
                            fl=cfg.fl,
                            mgc=mgc_file_name,
                            sopr=SPTK['SOPR'],
                            x2x=SPTK['X2X'],
                            sp=files['sp']))

                run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
                    synworld=WORLD['SYNTHESIS'],
                    fl=cfg.fl,
                    sr=cfg.sr,
                    f0=files['f0'],
                    sp=files['sp'],
                    ap=files['ap'],
                    wav=files['wav']))

                run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],
                                                          sp=files['sp'],
                                                          f0=files['f0']))
        finally:
            os.chdir(cur_dir)
Пример #8
0
    def predict(self,
                test_x,
                out_scaler,
                gen_test_file_list,
                sequential_training=False,
                stateful=False):
        """Generate acoustic features for held-out test data.

        Restores the trained model from ``self.ckpt_dir`` and runs one
        forward pass per utterance, denormalising the prediction with
        ``data_utils.denorm_data`` and writing it as a binary file to the
        matching entry of ``gen_test_file_list``.

        :param test_x: dict mapping utterance id -> input feature matrix.
        :param out_scaler: scaler understood by ``data_utils.denorm_data``.
        :param gen_test_file_list: output file path per utterance, indexed
            in the order of the sorted utterance ids.
        :param sequential_training: True for recurrent (sequence) models;
            reshapes the input to (1, T, n_in) and feeds the utterance
            length placeholder.
        :param stateful: unused here; kept for interface compatibility.
        """
        #### compute predictions ####

        io_funcs = BinaryIOCollection()

        # sorted() works on both Python 2 and 3; dict.keys() returns a view
        # without .sort() on Python 3, which the previous code relied on.
        test_id_list = sorted(test_x.keys())

        test_file_number = len(test_id_list)

        print("generating features on held-out test data...")
        with tf.Session() as sess:
            new_saver = tf.train.import_meta_graph(
                os.path.join(self.ckpt_dir, "mymodel.ckpt.meta"))
            print("loading the model parameters...")
            output_layer = tf.get_collection("output_layer")[0]
            input_layer = tf.get_collection("input_layer")[0]
            new_saver.restore(sess, os.path.join(self.ckpt_dir,
                                                 "mymodel.ckpt"))
            print("The model parameters are successfully restored")
            for utt_index in range(test_file_number):
                gen_test_file_name = gen_test_file_list[utt_index]
                temp_test_x = test_x[test_id_list[utt_index]]
                num_of_rows = temp_test_x.shape[0]

                # Build the feed dict incrementally instead of enumerating
                # every combination of (sequential, hybrid, dropout) with a
                # separate sess.run call; the fed placeholders are identical
                # to the original branch-per-combination code.
                if not sequential_training:
                    feed_dict = {input_layer: temp_test_x}
                    is_training_batch = tf.get_collection(
                        "is_training_batch")[0]
                    feed_dict[is_training_batch] = False
                    if self.dropout_rate != 0.0:
                        is_training_drop = tf.get_collection(
                            "is_training_drop")[0]
                        feed_dict[is_training_drop] = False
                else:
                    temp_test_x = np.reshape(temp_test_x,
                                             [1, num_of_rows, self.n_in])
                    utt_length_placeholder = tf.get_collection("utt_length")[0]
                    feed_dict = {
                        input_layer: temp_test_x,
                        utt_length_placeholder: [num_of_rows]
                    }
                    # "hybrid" models (tanh layers mixed with recurrent ones)
                    # also expose the batch-norm training flag
                    if "tanh" in self.hidden_layer_type:
                        is_training_batch = tf.get_collection(
                            "is_training_batch")[0]
                        feed_dict[is_training_batch] = False
                    if self.dropout_rate != 0.0:
                        is_training_drop = tf.get_collection(
                            "is_training_drop")[0]
                        feed_dict[is_training_drop] = False

                y_predict = sess.run(output_layer, feed_dict=feed_dict)

                data_utils.denorm_data(y_predict, out_scaler)
                io_funcs.array_to_binary_file(y_predict, gen_test_file_name)
                data_utils.drawProgressBar(utt_index + 1, test_file_number)
Пример #9
0
    def extract_base_features(self, feat_dir_path, feat_switch, list_of_files,
                              decomposition_unit, unit_dim):
        """Extract per-unit input features and write one vector per line.

        For each utterance this reads the frame-level binary label file, the
        state-aligned HTS label file and (optionally) the raw text, then
        emits one feature line per decomposition unit ('frame', 'phone' or
        'syllable') to ``<output_feat>/<filename>.lab``.  A feature line is
        the concatenation (as space-separated text) of whichever of the
        binary / word-embedding / identity features are switched on.

        :param feat_dir_path: dict of directories keyed by 'input_binary',
            'input_labfile', 'input_txt' and 'output_feat'.
        :param feat_switch: dict of toggles ('binary', 'wordEmbed',
            'identity').  The wordEmbed/identity values double as context
            widths: >= 3 adds +/-1 neighbour vectors, >= 5 adds +/-2.
        :param list_of_files: utterance ids (basenames without extension).
        :param decomposition_unit: 'frame', 'phone' or 'syllable'.
        :param unit_dim: per-unit dimensionality; only unit_dim['frame']
            (the width of one binary frame vector) is used here.

        NOTE(review): this method relies on Python 2 semantics throughout
        (print statement, true integer division for frame indices and
        reshape sizes, xrange) -- it must be ported before running on
        Python 3.
        """
        ### load Binary module ###
        io_funcs = BinaryIOCollection()
        htsclass = readHTSlabelFile()

        ### read file by file ###
        for i in range(len(list_of_files)):
            filename = list_of_files[i]
            print filename

            # input/output locations for this utterance
            binary_label_dir = feat_dir_path['input_binary']
            label_align_dir = feat_dir_path['input_labfile']
            txt_dir = feat_dir_path['input_txt']
            out_feat_dir = feat_dir_path['output_feat']

            in_filename = os.path.join(binary_label_dir, filename + '.lab')
            in_lab_file = os.path.join(label_align_dir, filename + '.lab')
            in_txt_file = os.path.join(txt_dir, filename + '.txt')
            out_filename = os.path.join(out_feat_dir, filename + '.lab')

            # per-utterance accumulators, one entry per emitted unit
            word_embed_list = []
            binary_feat_list = []
            identity_vec_list = []
            dur_feat_list = []
            dur_list = []

            ### read text file ###
            if feat_switch['wordEmbed']:
                ip1 = open(in_txt_file, 'r')
                text_Data = ip1.readlines()
                ip1.close()

                # normalise the transcript and spell out tokens that the
                # word-embedding table stores letter-by-letter
                norm_text = self.format_text(text_Data[0].strip())
                norm_text = norm_text.replace('OUF', 'O U F')
                norm_text = norm_text.replace('Mmm', 'M m m')
                norm_text = norm_text.replace('USA', 'U S A')
                list_of_words = norm_text.split()

            ### read label file ###
            # phone: phone names; st_arr: per-state durations; ph_arr: phone
            # start/end times; mean_f0_arr: alignment counters used below to
            # detect syllable/word boundaries
            [phone, st_arr, ph_arr,
             mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
            file_len = len(phone)

            ### read binary label file ###
            features = io_funcs.load_binary_file(in_filename, 1)

            ### take non-silence region ###
            # label times appear to be in HTK 100 ns units; dividing by
            # 10^4 * 5 converts to 5 ms frame indices -- TODO confirm the
            # frame shift against the label extraction config
            ph_start = int(ph_arr[0][1] / (np.power(10, 4) * 5))
            ph_end = int(ph_arr[1][file_len - 2] / (np.power(10, 4) * 5))

            ### extract duration features ###
            # reshape the flat binary stream into (num_frames, frame_dim);
            # the last 9 columns of each frame hold duration features
            frame_feat_list = features.reshape(
                len(features) / unit_dim['frame'], unit_dim['frame'])
            frame_feat_list = frame_feat_list[ph_start:ph_end, :]
            dur_feat_list = frame_feat_list[:, -9:]

            ### initialise common variables ###
            num_of_frames = 0

            ### initialise syllable variables ###
            #frame_indx=0;
            syl_num_of_frames = 0
            wc = 0               # index of the current word in list_of_words
            phinsyl = 0          # phones seen so far in the current syllable
            # 300 = 6 phone slots x 50 dims per phone identity -- presumably
            # the maximum syllable length; verify against the question set
            syl_identity = self.zeros(300, 1)
            syl = ''

            j = 0
            while j < file_len:
                #### ignore silence ####
                if (phone[j] == '#' or phone[j] == 'pau'):
                    j = j + 1
                    continue

                ### extract boundaries of phone ###
                ph_start = int(ph_arr[0][j] / (np.power(10, 4) * 5))
                ph_end = int(ph_arr[1][j] / (np.power(10, 4) * 5))
                num_of_frames = sum(st_arr[j][:] / (np.power(10, 4) * 5))
                mid_frame = (ph_start + ph_end) / 2

                ### syllable ending information ###
                # a change in alignment counter column 3 marks a syllable end
                syl_end = 0
                if (mean_f0_arr[j + 1][3] - mean_f0_arr[j][3] != 0):
                    syl_end = 1

                ### word ending information ###
                # a change in alignment counter column 5 marks a word end
                word_end = 0
                if (mean_f0_arr[j + 1][5] - mean_f0_arr[j][5] != 0):
                    word_end = 1

                ### syllable duration ###
                syl_num_of_frames += num_of_frames

                ### extract binary phone-level features ###
                # take the frame at the phone's midpoint as the phone's
                # representative; 592 is presumably unit_dim['frame'] --
                # confirm they agree
                st_indx = unit_dim['frame'] * mid_frame
                mid_frame_feat = features[st_indx:st_indx + 592]
                mid_frame_feat = np.reshape(mid_frame_feat,
                                            len(mid_frame_feat))

                ### word embedding features ###
                if feat_switch['wordEmbed']:
                    ### word embeddings for syllable ###
                    word = list_of_words[wc]
                    if (word_end and phone[j] != 'pau'):
                        wc += 1
                    # fall back to the unknown-word vector for pauses and
                    # out-of-vocabulary words (case-insensitive lookup)
                    if (phone[j] == 'pau'):
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                    elif word in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word]
                    elif word.lower() in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word.lower()]
                    else:
                        word_vec = self.wrd_embeds['*UNKNOWN*']

                ### identity features ###
                if feat_switch['identity']:
                    ### phone identity features ###
                    # dims 99:148 of the mid-frame vector appear to hold the
                    # 49-d one-hot phone identity -- verify against the
                    # binary label layout
                    ph_identity = mid_frame_feat[99:148]

                    if decomposition_unit == 'syllable':
                        ### syllable identity features
                        # stack phone identities into 50-wide slots
                        st_indx = phinsyl * 50
                        syl_identity[st_indx:st_indx + 49] = ph_identity
                        syl = syl + phone[j]
                        ### to make nucleus centre ###
                        #if phone[j] in self.vlist:
                        #    vow_index = phinsyl

                        ### if silence is allowed ###
                        #if phone[j] == '#':
                        #    syl_identity[(phinsyl+1)*50-1] = 1
                        phinsyl += 1

                #### select features depending on decomposition unit ###

                ### frame-level features ###
                if (decomposition_unit == 'frame'):

                    ### duration features for phone ###
                    dur_list.append(num_of_frames)

                    ### frame level binary features ###
                    # only on the last non-silence phone: normalise and emit
                    # every frame of the utterance in one pass
                    if feat_switch['binary'] and j + 2 == file_len:
                        ### load normalisation statistics ###
                        # file holds min values then max values, concatenated
                        label_norm_float_file = os.path.join(
                            binary_label_dir, '../label_norm_float_HTS.dat')
                        fid = open(label_norm_float_file, 'r')
                        arr12 = [float(x.strip()) for x in fid.readlines()]
                        fid.close()
                        min_vector = np.array(arr12[0:len(arr12) / 2])
                        max_vector = np.array(arr12[len(arr12) / 2:len(arr12)])
                        max_range_vector = max_vector - min_vector
                        # avoid divide-by-zero for constant dimensions
                        max_range_vector[max_range_vector == 0] = 1

                        ### normalise features ###
                        # min-max normalise into [0.01, 0.99]
                        nrows = len(frame_feat_list)
                        for x in xrange(nrows):
                            norm_frame_feat = (
                                frame_feat_list[x, :] -
                                min_vector) / max_range_vector * 0.98 + 0.01
                            norm_frame_vec = ' '.join(
                                map(str, norm_frame_feat[:]))
                            binary_feat_list.append(norm_frame_vec)

                    ### embedding features ###
                    # repeat the word vector once per frame of this phone
                    if feat_switch['wordEmbed']:
                        for x in xrange(num_of_frames):
                            word_embed_list.append(word_vec)

                ### phone-level features ###
                if (decomposition_unit == 'phone'):

                    ### duration features for phone ###
                    dur_list.append(num_of_frames)

                    ### phone level binary features ###
                    if feat_switch['binary']:
                        #ph_feat = np.concatenate((mid_frame_feat[0:99], mid_frame_feat[348:]), axis=0)
                        # map binary 1/0 to soft targets 0.99/0.01
                        norm_ph_feat = [
                            0.99 if x == 1 else 0.01 for x in mid_frame_feat
                        ]
                        norm_ph_vec = ' '.join(map(str, norm_ph_feat[:]))
                        binary_feat_list.append(norm_ph_vec)

                    ### embedding features ###
                    if feat_switch['wordEmbed']:
                        word_embed_list.append(word_vec)

                    ### phone-identity features ###
                    if feat_switch['identity']:
                        # extra bit flags the phone 'o~' (not covered by the
                        # 49-d identity block)
                        extra_ph = 1 if phone[j] == 'o~' else 0
                        ph_identity = np.append(ph_identity, extra_ph)
                        #norm_ph_identity = [0.99 if x==1 else 0.01 for x in ph_identity]
                        norm_ph_identity = [int(x) for x in ph_identity]
                        norm_ph_identity_vec = ' '.join(
                            map(str, norm_ph_identity[:]))
                        identity_vec_list.append(norm_ph_identity_vec)

                ### syllable level features ###
                # emit one vector per syllable, at the syllable-final phone
                if (decomposition_unit == 'syllable' and syl_end):
                    #print syl

                    ### duration features for syllable ###
                    dur_list.append(syl_num_of_frames)

                    ### syllable and above level binary features ###
                    if feat_switch['binary']:
                        # keep only syllable-and-above context dims: drop
                        # 0:348 and 405:421 of the mid-frame vector --
                        # offsets presumably match the question set; verify
                        syl_feat = []
                        for x in range(len(mid_frame_feat)):
                            if (x < 348 or (x >= 405 and x < 421)):
                                continue
                            syl_feat.append(mid_frame_feat[x])
                        norm_syl_feat = [
                            0.99 if x == 1 else 0.01 for x in syl_feat
                        ]
                        norm_syl_vec = ' '.join(map(str, norm_syl_feat[:]))
                        binary_feat_list.append(norm_syl_vec)

                    if feat_switch['wordEmbed']:
                        word_embed_list.append(word_vec)

                    ### syllable-identity features ###
                    if feat_switch['identity']:
                        ### to make nucleus centre ###
                        #if(vow_index<=1):
                        #    syl_identity = np.roll(syl_identity, 50*(vow_index+1))
                        norm_syl_identity = [
                            0.99 if x == 1 else 0.01 for x in syl_identity
                        ]
                        norm_syl_identity_vec = ' '.join(
                            map(str, norm_syl_identity[:]))
                        identity_vec_list.append(norm_syl_identity_vec)

                    ### reset syllable information ###
                    phinsyl = 0
                    syl = ''
                    syl_num_of_frames = 0
                    syl_identity = self.zeros(300, 1)

                j += 1

            ### default vectors to use ###
            # padding vectors for the context window at utterance edges
            if feat_switch['identity'] and decomposition_unit == 'syllable':
                syl_identity = self.zeros(300, 1)
                norm_syl_identity = [
                    0.99 if x == 1 else 0.01 for x in syl_identity
                ]
                norm_syl_identity_vec = ' '.join(map(str,
                                                     norm_syl_identity[:]))
            if feat_switch['wordEmbed']:
                word_vec = self.wrd_embeds['*UNKNOWN*']

            ### writing features to output file ###
            op1 = open(out_filename, 'w')
            num_of_vectors = max(len(binary_feat_list), len(identity_vec_list),
                                 len(word_embed_list))
            for x in range(num_of_vectors):
                ### initialise feat vector ###
                feat_vec = ''

                ### binary features ###
                if feat_switch['binary']:
                    feat_vec = feat_vec + binary_feat_list[x] + ' '

                ### word embeddings ###
                # switch value >= 3 adds the previous/next unit's embedding,
                # padded with the unknown-word vector at the edges
                if feat_switch['wordEmbed']:
                    if feat_switch['wordEmbed'] >= 3:
                        if (x - 1 < 0):
                            feat_vec = feat_vec + word_vec + ' '
                        else:
                            feat_vec = feat_vec + word_embed_list[x - 1] + ' '
                    feat_vec = feat_vec + word_embed_list[x] + ' '
                    if feat_switch['wordEmbed'] >= 3:
                        if (x + 1 >= len(binary_feat_list)):
                            feat_vec = feat_vec + word_vec + ' '
                        else:
                            feat_vec = feat_vec + word_embed_list[x + 1] + ' '

                ### identity features ###
                # switch value >= 3 adds +/-1 context, >= 5 adds +/-2, padded
                # with the all-0.01 default vector at the edges; NOTE(review):
                # the edge test uses len(binary_feat_list), not
                # len(identity_vec_list) -- confirm the two are always equal
                if feat_switch['identity']:
                    if feat_switch['identity'] >= 5:
                        if (x - 2 < 0):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x -
                                                                    2] + ' '
                    if feat_switch['identity'] >= 3:
                        if (x - 1 < 0):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x -
                                                                    1] + ' '
                    feat_vec = feat_vec + identity_vec_list[x] + ' '
                    if feat_switch['identity'] >= 3:
                        if (x + 1 >= len(binary_feat_list)):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x +
                                                                    1] + ' '
                    if feat_switch['identity'] >= 5:
                        if (x + 2 >= len(binary_feat_list)):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x +
                                                                    2] + ' '
                op1.write(feat_vec + '\n')
                #for z in range(dur_list[x]):
                #    op1.write(feat_vec + ' '.join(map(str, dur_feat_list[frame_indx+z,:]))+'\n')
                #frame_indx+=dur_list[x]
            op1.close()
Пример #10
0
def wavgen_straight_type_vocoder(gen_dir, file_id_list, cfg, logger):
    '''
    Waveform generation with STRAIGHT or WORLD vocoders
    (whose acoustic parameters are: mgc, bap, and lf0).

    :param gen_dir: directory containing the generated parameter files;
        waveforms are written there as <file_id>.wav
    :param file_id_list: list of utterance base names to synthesise
    :param cfg: configuration object (tool paths, feature dims, sample rate,
        vocoder type, post-filtering/GV switches, ...)
    :param logger: logger used for progress reporting
    '''
    SPTK = cfg.SPTK
    #    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    # to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            # NOTE(review): 'ERB' also calls bark_alpha -- looks like a
            # copy/paste slip (an erb_alpha is presumably intended), but it is
            # preserved as-is pending confirmation that erb_alpha exists.
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.'
            )
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        # load global-variance statistics used to enhance the generated mgc
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_std_file, 1)

    max_counter = len(file_id_list)

    for counter, filename in enumerate(file_id_list, 1):
        logger.info('creating waveform for %4d of %4d: %s' %
                    (counter, max_counter, filename))

        base = filename
        files = {
            'sp': base + cfg.sp_ext,
            'mgc': base + cfg.mgc_ext,
            'f0': base + '.f0',
            'lf0': base + cfg.lf0_ext,
            'ap': base + '.ap',
            'bap': base + cfg.bap_ext,
            'wav': base + '.wav'
        }

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)
        try:
            # post-filtering
            if cfg.do_post_filtering:
                mgc_file_name = files['mgc'] + '_p_mgc'
                post_filter(files['mgc'], mgc_file_name, cfg.mgc_dim, pf_coef,
                            fw_coef, co_coef, fl_coef, gen_dir, cfg)

            if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
                # global-variance enhancement: rescale each mgc dimension so
                # that its variance matches the reference GV statistics
                gen_mgc, frame_number = io_funcs.load_binary_file_frame(
                    mgc_file_name, cfg.mgc_dim)

                gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
                gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

                local_gv = (ref_gv_std / gen_gv_std) * \
                    (gen_std - gen_gv_mean) + ref_gv_mean

                enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(
                    gen_std, frame_number, 1).T * (gen_mgc - np.repeat(
                        gen_mu, frame_number, 1).T) + np.repeat(
                            gen_mu, frame_number, 1).T

                new_mgc_file_name = files['mgc'] + '_p_mgc'
                io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)

                mgc_file_name = files['mgc'] + '_p_mgc'

            if cfg.do_post_filtering and cfg.apply_GV:
                logger.critical(
                    'Both smoothing techniques together can\'t be applied!!\n')

            # mgc to sp to wav
            if cfg.vocoder_type == 'STRAIGHT':
                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                    .format(mgc2sp=SPTK['MGC2SP'],
                            alpha=cfg.fw_alpha,
                            order=cfg.mgc_dim - 1,
                            fl=cfg.fl,
                            mgc=mgc_file_name,
                            sp=files['sp']))
                # lf0 -> linear f0 (magic value -1e10 marks unvoiced frames)
                run_process(
                    '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(
                        sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
                run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'],
                                                            f0=files['f0'],
                                                            f0a=files['f0'] +
                                                            '.a'))

                if cfg.use_cep_ap:
                    run_process(
                        '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                        .format(mgc2sp=SPTK['MGC2SP'],
                                alpha=cfg.fw_alpha,
                                order=cfg.bap_dim - 1,
                                fl=cfg.fl,
                                bap=files['bap'],
                                ap=files['ap']))
                else:
                    run_process('{bndap2ap} {bap} > {ap}'.format(
                        bndap2ap=STRAIGHT['BNDAP2AP'],
                        bap=files['bap'],
                        ap=files['ap']))

                run_process(
                    '{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 1.2 -cornf 4000 -float -apfile {ap} {f0a} {sp} {wav}'
                    .format(synfft=STRAIGHT['SYNTHESIS_FFT'],
                            sr=cfg.sr,
                            fl=cfg.fl,
                            shift=cfg.shift,
                            ap=files['ap'],
                            f0a=files['f0'] + '.a',
                            sp=files['sp'],
                            wav=files['wav']))

                # clean up intermediate parameter files
                run_process('rm -f {sp} {f0} {f0a} {ap}'.format(sp=files['sp'],
                                                                f0=files['f0'],
                                                                f0a=files['f0'] +
                                                                '.a',
                                                                ap=files['ap']))
            elif cfg.vocoder_type == 'WORLD':

                run_process(
                    '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'
                    .format(sopr=SPTK['SOPR'],
                            lf0=files['lf0'],
                            x2x=SPTK['X2X'],
                            f0=files['f0']))

                run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
                    sopr=SPTK['SOPR'],
                    bap=files['bap'],
                    x2x=SPTK['X2X'],
                    ap=files['ap']))

                # If using world v2, please comment above line and uncomment this
                # run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
                #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                    .format(mgc2sp=SPTK['MGC2SP'],
                            alpha=cfg.fw_alpha,
                            order=cfg.mgc_dim - 1,
                            fl=cfg.fl,
                            mgc=mgc_file_name,
                            sopr=SPTK['SOPR'],
                            x2x=SPTK['X2X'],
                            sp=files['sp']))

                run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
                    synworld=WORLD['SYNTHESIS'],
                    fl=cfg.fl,
                    sr=cfg.sr,
                    f0=files['f0'],
                    sp=files['sp'],
                    ap=files['ap'],
                    wav=files['wav']))

                run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],
                                                          sp=files['sp'],
                                                          f0=files['f0']))
            elif cfg.vocoder_type == 'WORLD_PY':
                logger.info(
                    "generate speech with py world, sampling rate is {0}".format(
                        cfg.sr))
                # recover linear f0: the magic value -1e10 marks unvoiced frames
                lf0_file = files['lf0']
                lf0 = read_binfile(lf0_file, dim=1, dtype=np.float32)
                zeros_index = np.where(lf0 == -1E+10)
                nonzeros_index = np.where(lf0 != -1E+10)
                f0 = lf0.copy()
                f0[zeros_index] = 0
                f0[nonzeros_index] = np.exp(lf0[nonzeros_index])
                f0 = f0.astype(np.float64)
                if cfg.sr == 16000:
                    bap_dim = 1
                elif cfg.sr == 48000:
                    bap_dim = 5
                else:
                    # previously fell through with bap_dim undefined (NameError)
                    raise ValueError(
                        'WORLD_PY vocoder: unsupported sampling rate %d, '
                        'band aperiodicity dimension unknown' % cfg.sr)
                bap = read_binfile(bap_file_name, dim=bap_dim, dtype=np.float32)
                ap = pyworld.decode_aperiodicity(
                    bap.astype(np.float64).reshape(-1, bap_dim), cfg.sr, cfg.fl)
                # mgc dimension comes from the config (was hard-coded to 60)
                mc = read_binfile(mgc_file_name, dim=cfg.mgc_dim, dtype=np.float32)
                alpha = pysptk.util.mcepalpha(cfg.sr)
                sp = pysptk.mc2sp(mc.astype(np.float64),
                                  fftlen=cfg.fl,
                                  alpha=alpha)
                wav = pyworld.synthesize(f0, sp, ap, cfg.sr, 5)
                # normalise by the absolute peak (a dominant negative peak
                # previously wrapped in int16) and scale to the int16 maximum
                x2 = wav / np.max(np.abs(wav)) * 32767
                x2 = x2.astype(np.int16)
                scipy.io.wavfile.write(files['wav'], cfg.sr, x2)
        finally:
            # always restore the working directory; the original restored it
            # only in the WORLD_PY branch, leaving the process in gen_dir
            # for the STRAIGHT and WORLD vocoders
            os.chdir(cur_dir)
Пример #11
0
    def compute_distortion(self, file_id_list, reference_dir, generation_dir,
                           file_ext, feature_dim):
        """Compute an objective distortion between reference and generated
        feature files.

        The metric depends on ``file_ext``:
          * ``.lf0`` -> returns (f0 distortion, f0 correlation, V/UV error)
          * ``.dur`` -> returns (duration RMSE, duration correlation)
          * ``.mgc`` -> mean squared error excluding the 0th coefficient
          * anything else -> plain mean squared error over all dimensions

        :param file_id_list: utterance base names to evaluate
        :param reference_dir: directory with natural (reference) features
        :param generation_dir: directory with generated features
        :param file_ext: feature extension, selects the metric (see above)
        :param feature_dim: dimensionality of each feature file
        :raises ValueError: if reference and generated frame counts differ
            by more than two frames for any utterance
        """
        total_voiced_frame_number = 0

        distortion = 0.0
        vuv_error = 0
        total_frame_number = 0

        io_funcs = BinaryIOCollection()

        ref_all_files_data = numpy.reshape(numpy.array([]), (-1, 1))
        gen_all_files_data = numpy.reshape(numpy.array([]), (-1, 1))
        for file_id in file_id_list:
            ref_file_name = reference_dir + '/' + file_id + file_ext
            gen_file_name = generation_dir + '/' + file_id + file_ext

            ref_data, ref_frame_number = io_funcs.load_binary_file_frame(
                ref_file_name, feature_dim)
            gen_data, gen_frame_number = io_funcs.load_binary_file_frame(
                gen_file_name, feature_dim)

            # accept a difference of up to two frames by truncating both
            # streams to the shorter length
            if abs(ref_frame_number - gen_frame_number) <= 2:
                frame_number = min(ref_frame_number, gen_frame_number)
                ref_frame_number = frame_number
                gen_frame_number = frame_number
                ref_data = ref_data[0:frame_number, ]
                gen_data = gen_data[0:frame_number, ]

            if ref_frame_number != gen_frame_number:
                message = (
                    "The number of frames is not the same: %d vs %d (%s). Error in compute_distortion.py\n."
                    % (ref_frame_number, gen_frame_number, file_id))
                self.logger.critical(message)
                # a bare `raise` here (as in the original) is outside any
                # except block and only produces an unrelated RuntimeError
                raise ValueError(message)

            if file_ext == '.lf0':
                ref_all_files_data = numpy.concatenate(
                    (ref_all_files_data, ref_data), axis=0)
                gen_all_files_data = numpy.concatenate(
                    (gen_all_files_data, gen_data), axis=0)
                temp_distortion, temp_vuv_error, voiced_frame_number = self.compute_f0_mse(
                    ref_data, gen_data)
                vuv_error += temp_vuv_error
                total_voiced_frame_number += voiced_frame_number
            elif file_ext == '.dur':
                # durations are summed over states to get per-phone totals
                ref_data = numpy.reshape(numpy.sum(ref_data, axis=1), (-1, 1))
                gen_data = numpy.reshape(numpy.sum(gen_data, axis=1), (-1, 1))
                ref_all_files_data = numpy.concatenate(
                    (ref_all_files_data, ref_data), axis=0)
                gen_all_files_data = numpy.concatenate(
                    (gen_all_files_data, gen_data), axis=0)
                continue
            elif file_ext == '.mgc':
                # skip the 0th coefficient (energy) for spectral distortion
                temp_distortion = self.compute_mse(ref_data[:, 1:feature_dim],
                                                   gen_data[:, 1:feature_dim])
            else:
                temp_distortion = self.compute_mse(ref_data, gen_data)

            distortion += temp_distortion

            total_frame_number += ref_frame_number

        if file_ext == '.dur':
            dur_rmse = self.compute_rmse(ref_all_files_data,
                                         gen_all_files_data)
            dur_corr = self.compute_corr(ref_all_files_data,
                                         gen_all_files_data)

            return dur_rmse, dur_corr
        elif file_ext == '.lf0':
            # NOTE(review): zero voiced frames over the whole set would
            # divide by zero here -- presumed impossible for real data
            distortion /= float(total_voiced_frame_number)
            vuv_error /= float(total_frame_number)

            distortion = numpy.sqrt(distortion)
            f0_corr = self.compute_f0_corr(ref_all_files_data,
                                           gen_all_files_data)

            return distortion, f0_corr, vuv_error
        else:
            distortion /= float(total_frame_number)

            return distortion
Пример #12
0
    def load_labels_with_phone_alignment(self, file_name, dur_file_name):
        """Load a phone-aligned label file into a numerical feature matrix.

        Each non-empty line of ``file_name`` is ``start_time end_time label``;
        the hard-wired divisor of 50000 below implies HTK 100ns time units
        with a 5ms frame shift.  Each label is converted to a binary +
        continuous question-answer vector; when ``self.add_frame_features``
        is set, that vector is repeated once per frame with optional
        sub-phone position features (``self.subphone_feats``) appended.

        :param file_name: path to the phone-aligned label file
        :param dur_file_name: optional binary duration file; when given,
            per-phone frame counts are read from it instead of being derived
            from the label timestamps
        :return: numpy array of shape (num_rows, self.dimension), one row
            per frame (or per phone when no frame features are added)
        """

        # this is not currently used ??? -- it works now :D
        logger = logging.getLogger("labels")
        #logger.critical('unused function ???')
        #raise Exception

        if dur_file_name:
            io_funcs = BinaryIOCollection()
            dur_dim = 1  ## hard coded for now
            # NOTE(review): these values are later used as frame counts in
            # range(); presumably integer-valued data -- confirm
            manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

        # sanity-check that the configured output dimension matches the
        # question-set size plus any frame features
        if self.add_frame_features:
            assert self.dimension == self.dict_size + self.frame_feature_size
        elif self.subphone_feats != 'none':
            assert self.dimension == self.dict_size + self.frame_feature_size
        else:
            assert self.dimension == self.dict_size

        # over-allocated buffer; trimmed to label_feature_index at the end
        label_feature_matrix = numpy.empty((100000, self.dimension))

        ph_count = 0
        label_feature_index = 0
        fid = open(file_name)
        for line in fid.readlines():
            line = line.strip()
            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]

            # to do - support different frame shift - currently hardwired to 5msec
            # currently under beta testing: support different frame shift
            if dur_file_name:
                frame_number = manual_dur_data[ph_count]
            else:
                frame_number = int((end_time - start_time) / 50000)

            ph_count = ph_count + 1
            #label_binary_vector = self.pattern_matching(full_label)
            label_binary_vector = self.pattern_matching_binary(full_label)

            # if there is no CQS question, the label_continuous_vector will become to empty
            label_continuous_vector = self.pattern_matching_continous_position(
                full_label)
            label_vector = numpy.concatenate(
                [label_binary_vector, label_continuous_vector], axis=1)

            if self.subphone_feats == "coarse_coding":
                cc_feat_matrix = self.extract_coarse_coding_features_relative(
                    frame_number)

            if self.add_frame_features:
                # one row per frame: label vector plus sub-phone position feats
                current_block_binary_array = numpy.zeros(
                    (frame_number, self.dict_size + self.frame_feature_size))
                for i in range(frame_number):
                    current_block_binary_array[i,
                                               0:self.dict_size] = label_vector

                    if self.subphone_feats == 'minimal_phoneme':
                        ## features which distinguish frame position in phoneme
                        current_block_binary_array[
                            i, self.dict_size] = float(i + 1) / float(
                                frame_number
                            )  # fraction through phone forwards
                        current_block_binary_array[
                            i, self.dict_size +
                            1] = float(frame_number - i) / float(
                                frame_number
                            )  # fraction through phone backwards
                        current_block_binary_array[
                            i, self.dict_size + 2] = float(
                                frame_number)  # phone duration

                    elif self.subphone_feats == 'coarse_coding':
                        ## features which distinguish frame position in phoneme using three continous numerical features
                        current_block_binary_array[i, self.dict_size +
                                                   0] = cc_feat_matrix[i, 0]
                        current_block_binary_array[i, self.dict_size +
                                                   1] = cc_feat_matrix[i, 1]
                        current_block_binary_array[i, self.dict_size +
                                                   2] = cc_feat_matrix[i, 2]
                        current_block_binary_array[i, self.dict_size +
                                                   3] = float(frame_number)

                    elif self.subphone_feats == 'none':
                        pass

                    else:
                        sys.exit('unknown subphone_feats type')

                label_feature_matrix[
                    label_feature_index:label_feature_index +
                    frame_number, ] = current_block_binary_array
                label_feature_index = label_feature_index + frame_number

            elif self.subphone_feats == 'none':
                # one row per phone, no frame expansion
                current_block_binary_array = label_vector
                label_feature_matrix[label_feature_index:label_feature_index +
                                     1, ] = current_block_binary_array
                label_feature_index = label_feature_index + 1

        fid.close()

        # trim the over-allocated buffer to the rows actually written
        label_feature_matrix = label_feature_matrix[0:label_feature_index, ]

        logger.info('loaded %s, %3d labels' % (file_name, ph_count))
        logger.debug('made label matrix of %d frames x %d labels' %
                     label_feature_matrix.shape)
        return label_feature_matrix
Пример #13
0
def read_data_from_file_list_shared_2(speaker_id_list, inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=False):
    """Read input/output feature files for a multi-speaker shared setup.

    Input features go into one shared structure; output features are kept
    separately per speaker.

    :param speaker_id_list: speaker identifiers; files are assigned to a
        speaker by substring match on the file name
    :param inp_file_list: input feature file paths
    :param out_file_list: output feature file paths (parallel to inputs)
    :param inp_dim: input feature dimensionality
    :param out_dim: output feature dimensionality
    :param sequential_training: if True, return dicts keyed by utterance
        (outputs additionally keyed by speaker); otherwise return stacked
        numpy arrays
    :return: (set_x, set_y, file_length_dict) where file_length_dict maps
        frame counts to utterances and utterances to frame counts
    """
    io_funcs = BinaryIOCollection()

    num_of_spk = len(speaker_id_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x = {}
        # pre-create the per-speaker dicts; the original indexed
        # temp_set_y[speaker] without ever creating them (KeyError)
        temp_set_y = {speaker: {} for speaker in speaker_id_list}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = [np.empty((FRAME_BUFFER_SIZE, out_dim))
                      for _ in range(num_of_spk)]

    ### read file by file ###
    current_index = [0] * num_of_spk  # per-speaker write position
    for spk_i, speaker in enumerate(speaker_id_list):

        # Pull sublist of files matching current speaker.
        # NOTE(review): substring matching -- a speaker id that is a prefix
        # of another (e.g. 'p22' vs 'p225') would match both; confirm ids
        # are unambiguous within the file names.
        logical_index = [speaker in name for name in inp_file_list]
        inp_file_sublist = np.array(inp_file_list)[logical_index]
        out_file_sublist = np.array(out_file_list)[logical_index]
        num_sub_utt = len(out_file_sublist)

        for i in range(num_sub_utt):
            inp_file_name = inp_file_sublist[i]
            out_file_name = out_file_sublist[i]
            inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
            out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)

            base_file_name = os.path.basename(inp_file_name).split(".")[0]

            # tolerate small misalignments; anything larger is a data error
            if abs(inp_frame_number-out_frame_number)>5:
                print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
                sys.exit(0)
            else:
                frame_number = min(inp_frame_number, out_frame_number)

            if sequential_training:
                temp_set_x[base_file_name] = inp_features[0:frame_number]
                temp_set_y[speaker][base_file_name] = out_features[0:frame_number]
            else:
                # inputs are stacked across all speakers; outputs per speaker
                temp_set_x[sum(current_index):sum(current_index)+frame_number, ] = inp_features[0:frame_number]
                temp_set_y[spk_i][current_index[spk_i]:current_index[spk_i]+frame_number, ] = out_features[0:frame_number]
                current_index[spk_i] += frame_number

            if frame_number not in file_length_dict['framenum2utt']:
                file_length_dict['framenum2utt'][frame_number] = [base_file_name]
            else:
                file_length_dict['framenum2utt'][frame_number].append(base_file_name)

            file_length_dict['utt2framenum'][base_file_name] = frame_number

            drawProgressBar(i+1, num_sub_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        # trim the over-allocated buffers to the frames actually written
        set_x = temp_set_x[0:sum(current_index), ]
        set_y = []
        for i in range(num_of_spk):
            set_y.append(temp_set_y[i][0:current_index[i], ])
    else:
        set_x = temp_set_x
        set_y = temp_set_y

    return set_x, set_y, file_length_dict
Пример #14
0
def read_data_from_file_list(inp_file_list,
                             out_file_list,
                             inp_dim,
                             out_dim,
                             sequential_training=True):
    """Read input and output feature files from parallel file lists.

    Args:
        inp_file_list: list of input feature file paths
        out_file_list: list of output feature file paths (parallel to inputs)
        inp_dim: input feature dimensionality
        out_dim: output feature dimensionality
        sequential_training: if True, return per-utterance dicts; otherwise
            return stacked numpy arrays

    Returns:
        if sequential training:
        temp_set_x: dict {'arctic_a0001': features, ...} where features are
            the input features with shape (T, nx); temp_set_y likewise
        file_length_dict: example
        {'framenum2utt': {578: ['arctic_a0001'], 675: ['arctic_a0002'], 606: ['arctic_a0003']},
         'utt2framenum': {'arctic_a0001': 578, 'arctic_a0002': 675, 'arctic_a0003': 606}}
         tells us each utterance's # of frames
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        # over-allocated buffers, trimmed to current_index at the end
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        logging.debug("read file from {}".format(inp_file_name))
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(
            inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(
            out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        # tolerate small misalignments; anything larger is a data error
        if abs(inp_frame_number - out_frame_number) > 5:
            print(
                'the number of frames in input and output features are different: %d vs %d (%s)'
                % (inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        else:
            frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            try:
                temp_set_x[current_index:current_index +
                           frame_number, ] = inp_features[0:frame_number]
                temp_set_y[current_index:current_index +
                           frame_number, ] = out_features[0:frame_number]
                current_index += frame_number
            except ValueError:
                # most likely a FRAME_BUFFER_SIZE overflow or a dimension
                # mismatch; report and fail instead of dropping into the
                # debugger (the original called pdb.set_trace() here, which
                # hangs non-interactive runs)
                logging.error("failed to buffer features from %s",
                              inp_file_name)
                raise

        if frame_number not in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            file_length_dict['framenum2utt'][frame_number].append(
                base_file_name)

        file_length_dict['utt2framenum'][base_file_name] = frame_number

        drawProgressBar(i + 1, num_of_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        # trim the over-allocated buffers to the frames actually written
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
Пример #15
0
    def make_labels(self, input_file_descriptors, out_file_name=None,
                    fill_missing_values=False, iterate_over_frames=False):
        """Create numerical label features for one utterance.

        ``input_file_descriptors`` maps a label style to an open label file
        for the same utterance, e.g. ``{'xpath': <open XML file>}``.  Only
        the XPATH style is implemented; 'hts' entries are ignored with a
        warning.

        :param input_file_descriptors: dict of open label files keyed by
            style ('xpath' or 'hts')
        :param out_file_name: if given, write the features to this binary
            file; otherwise the feature matrix is returned
        :param fill_missing_values: passed through to XMLLabelNormalisation
        :param iterate_over_frames: passed through to XMLLabelNormalisation
        :return: the assembled feature matrix when no output file is given
        """

        # an array in which to assemble all the features
        all_labels = None

        try:
            assert self.configuration
        except AssertionError:
            self.logger.critical(
                'no label configuration loaded, so cannot make labels')
            raise

        # gather every XPATH (and its optional mapper) so that all features
        # can be extracted in a single pass
        xpath_list = []
        mapper_list = []

        for (item_number,
             feature_specification) in enumerate(self.configuration.labels):

            ## frame features are appended only for the *LAST*
            ## feature_specification in the list
            add_frame_features = False
            if item_number + 1 == len(self.configuration.labels):
                add_frame_features = True

            # which label file should we use?
            # (dict.has_key was removed in Python 3; use `in` instead)
            if 'xpath' in feature_specification:
                # xpath and hts are mutually exclusive label styles
                assert 'hts' not in feature_specification

                try:
                    assert self.configuration.target_nodes
                except AssertionError:
                    self.logger.critical(
                        'When using XPATH features, "target_nodes" must be defined in the label config file'
                    )
                    raise

                try:
                    xpath_list.append(feature_specification['xpath'])
                    mapper_list.append(feature_specification.get('mapper'))
                except Exception:
                    self.logger.critical(
                        'error creating XMLLabelNormalisation object for feature %s'
                        % feature_specification)
                    raise

            if 'hts' in feature_specification:
                assert 'xpath' not in feature_specification
                # not yet implemented !
                self.logger.warning(
                    'HTS features not implemented - ignoring them!')
                # to do, with implementation: deal with fill_missing_values correctly

        ## Now extract all feats in one go:
        label_normaliser = XMLLabelNormalisation(
            xpath=xpath_list,
            mapper=mapper_list,
            fill_missing_values=fill_missing_values,
            target_nodes=self.configuration.target_nodes,
            use_compiled_xpath=self.use_precompiled_xpaths,
            iterate_over_frames=iterate_over_frames)

        try:
            all_labels = label_normaliser.extract_linguistic_features(
                input_file_descriptors['xpath'],
                add_frame_features=add_frame_features)
        except KeyError:
            self.logger.critical(
                'no open xpath label file available to create feature %s' %
                feature_specification)
            raise

        if all_labels is not None:
            # identity test: `!= None` on a numpy array is elementwise and
            # raises in a boolean context
            self.logger.debug(' composed features now have dimension %d' %
                              all_labels.shape[1])

        # finally, save the labels
        if out_file_name:
            io_funcs = BinaryIOCollection()
            io_funcs.array_to_binary_file(all_labels, out_file_name)

            ## osw: useful for debugging:
            ##numpy.savetxt(out_file_name + '.TXT', all_labels, delimiter='\t')

            self.logger.info('saved numerical features of shape %s to %s' %
                             (all_labels.shape, out_file_name))
        else:
            # the original returned the undefined name `all_features`
            # (NameError); the assembled matrix is `all_labels`
            return all_labels
Пример #16
0
    def load_next_batch_S2S(self):
        """Load the data for one utterance for sequence-to-sequence training.

        Called when utterance-by-utterance loading is required (e.g.,
        sequential training).  Each entry of ``self.x_files_list`` /
        ``self.y_files_list`` may be a comma-separated list of files whose
        features are concatenated along the time axis.

        :return: ((shared_x, shared_y, shared_d), temp_set_x, temp_set_y,
            temp_set_d) -- the Theano shared variables plus the raw numpy
            arrays; temp_set_d holds the per-label frame durations (or the
            total frame count when no duration files are configured)
        """

        # over-allocated buffers, trimmed to the written length at the end
        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
        temp_set_d = numpy.empty((self.buffer_size, 1))

        io_fun = BinaryIOCollection()

        lab_start_frame_number = 0
        lab_end_frame_number = 0

        out_start_frame_number = 0
        out_end_frame_number = 0

        new_x_files_list = self.x_files_list[self.file_index].split(',')
        new_y_files_list = self.y_files_list[self.file_index].split(',')
        # guard: the original indexed dur_files_list unconditionally, which
        # crashes before the `if not self.dur_files_list` test below when no
        # duration files are configured
        if self.dur_files_list:
            new_dur_files_list = self.dur_files_list[self.file_index].split(',')
        else:
            new_dur_files_list = []

        # xrange (Python 2) replaced with range for Python 3
        for new_file_index in range(len(new_x_files_list)):
            in_features, lab_frame_number = io_fun.load_binary_file_frame(
                new_x_files_list[new_file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(
                new_y_files_list[new_file_index], self.n_outs)

            lab_end_frame_number += lab_frame_number
            out_end_frame_number += out_frame_number

            temp_set_x[lab_start_frame_number:lab_end_frame_number,
                       ] = in_features[0:lab_frame_number, ]
            temp_set_y[out_start_frame_number:out_end_frame_number,
                       ] = out_features[0:out_frame_number, ]
            if not self.dur_files_list:
                # no explicit durations: store the running output frame count
                dur_frame_number = out_end_frame_number
                temp_set_d = numpy.array([dur_frame_number])
            else:
                dur_features, dur_frame_number = io_fun.load_binary_file_frame(
                    new_dur_files_list[new_file_index], 1)
                # durations must account for every output frame
                assert sum(dur_features) == out_frame_number
                temp_set_d[lab_start_frame_number:lab_end_frame_number,
                           ] = dur_features[0:lab_frame_number, ]

            lab_start_frame_number = lab_end_frame_number
            out_start_frame_number = out_end_frame_number

        temp_set_x = temp_set_x[0:lab_end_frame_number, ]
        temp_set_y = temp_set_y[0:out_end_frame_number, ]

        temp_set_d = temp_set_d[0:lab_end_frame_number, ]
        temp_set_d = numpy.reshape(temp_set_d, (-1, ))
        temp_set_d = temp_set_d.astype(int)

        # advance to the next utterance; wrap and flag the end of the list
        self.file_index += 1

        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')
        shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'),
                                     name='d',
                                     borrow=True)

        shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

        return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d
Пример #17
0
def generate_wav(
        data,
        gen_dir='./results',
        base='sample',
        sptk_dir='/u/kumarrit/world.py/merlin/tools/bin/SPTK-3.9/',
        world_dir='/u/kumarrit/world.py/merlin/tools/bin/WORLD/',
        norm_info_file='/data/lisa/exp/kumarrit/vctk/norm_info_mgc_lf0_vuv_bap_63_MVN.dat',
        do_post_filtering=True,
        mgc_dim=60,
        fl=1024,
        sr=16000):
    """Denormalise a 63-dim acoustic matrix and synthesise a wav with WORLD.

    data            : (frames, 63) mean/variance-normalised cmp matrix laid
                      out as mgc(60) + lf0(1) + vuv(1) + bap(1).
    gen_dir         : output directory; intermediate and final files go here.
    base            : basename for every generated file ("<base>.wav" etc.).
    sptk_dir        : directory containing the SPTK command-line tools.
    world_dir       : directory containing the WORLD synthesis binary.
    norm_info_file  : float32 binary file holding (mean, std) rows used to
                      undo the normalisation.
    do_post_filtering : apply SPTK mel-cepstral post-filtering first.
    mgc_dim, fl, sr : mel-cepstral dimension, FFT length, sample rate.

    Side effects only (writes files, runs external tools); returns None.
    """

    io_funcs = BinaryIOCollection()
    file_name = os.path.join(gen_dir, base + ".cmp")

    # Undo mean/variance normalisation with the stored statistics.
    fid = open(norm_info_file, 'rb')
    cmp_info = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_info = cmp_info.reshape((2, -1))
    cmp_mean = cmp_info[0, ]
    cmp_std = cmp_info[1, ]

    data = data * cmp_std + cmp_mean

    io_funcs.array_to_binary_file(data, file_name)

    # This code was adapted from Merlin. I should add the license.

    out_dimension_dict = {'bap': 1, 'lf0': 1, 'mgc': 60, 'vuv': 1}
    stream_start_index = {}
    file_extension_dict = {
        'mgc': '.mgc',
        'bap': '.bap',
        'lf0': '.lf0',
        'dur': '.dur',
        'cmp': '.cmp'
    }
    gen_wav_features = ['mgc', 'lf0', 'bap']

    # BUG FIX: the stream offsets must follow the cmp layout
    # (mgc, lf0, vuv, bap -- cf. the norm file name
    # "norm_info_mgc_lf0_vuv_bap_63_MVN.dat").  The original iterated
    # out_dimension_dict.keys(), whose order is arbitrary on Python < 3.7,
    # which could silently assign the wrong offset to every stream.
    dimension_index = 0
    for feature_name in ['mgc', 'lf0', 'vuv', 'bap']:
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    dir_name = os.path.dirname(file_name)
    file_id = os.path.splitext(os.path.basename(file_name))[0]
    features, frame_number = io_funcs.load_binary_file_frame(file_name, 63)

    # Split the cmp matrix into one binary file per output stream.
    for feature_name in gen_wav_features:

        current_features = features[:, stream_start_index[feature_name]:
                                    stream_start_index[feature_name] +
                                    out_dimension_dict[feature_name]]

        gen_features = current_features

        if feature_name in ['lf0', 'F0']:
            if 'vuv' in stream_start_index.keys():
                vuv_feature = features[:, stream_start_index['vuv']:
                                       stream_start_index['vuv'] + 1]

                # Silence F0 in unvoiced frames (vectorised replacement of
                # the original per-frame xrange loop; -1e10 is the magic
                # "unvoiced" value consumed by SOPR below).
                unvoiced = vuv_feature[:, 0] < 0.5
                gen_features[unvoiced, 0] = -1.0e+10  # self.inf_float

        new_file_name = os.path.join(
            dir_name, file_id + file_extension_dict[feature_name])

        io_funcs.array_to_binary_file(gen_features, new_file_name)

    # Post-filtering / warping constants (Merlin defaults).
    pf_coef = 1.4
    fw_alpha = 0.58
    co_coef = 511

    sptk_path = {
        'SOPR': sptk_dir + 'sopr',
        'FREQT': sptk_dir + 'freqt',
        'VSTAT': sptk_dir + 'vstat',
        'MGC2SP': sptk_dir + 'mgc2sp',
        'MERGE': sptk_dir + 'merge',
        'BCP': sptk_dir + 'bcp',
        'MC2B': sptk_dir + 'mc2b',
        'C2ACR': sptk_dir + 'c2acr',
        'MLPG': sptk_dir + 'mlpg',
        'VOPR': sptk_dir + 'vopr',
        'B2MC': sptk_dir + 'b2mc',
        'X2X': sptk_dir + 'x2x',
        'VSUM': sptk_dir + 'vsum'
    }

    world_path = {
        'ANALYSIS': world_dir + 'analysis',
        'SYNTHESIS': world_dir + 'synth'
    }

    fw_coef = fw_alpha
    fl_coef = fl

    files = {
        'sp': base + '.sp',
        'mgc': base + '.mgc',
        'f0': base + '.f0',
        'lf0': base + '.lf0',
        'ap': base + '.ap',
        'bap': base + '.bap',
        'wav': base + '.wav'
    }

    mgc_file_name = files['mgc']

    cur_dir = os.getcwd()
    os.chdir(gen_dir)

    #  post-filtering
    if do_post_filtering:
        # Build the per-coefficient post-filter weight vector
        # (1 1 pf pf ... pf) and convert it to binary floats.
        line = "echo 1 1 "
        for i in range(2, mgc_dim):
            line = line + str(pf_coef) + " "

        run_process('{line} | {x2x} +af > {weight}'.format(
            line=line,
            x2x=sptk_path['X2X'],
            weight=os.path.join(gen_dir, 'weight')))

        # r0 of the unfiltered cepstrum (energy reference).
        run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_r0}'.format(
                        freqt=sptk_path['FREQT'],
                        order=mgc_dim - 1,
                        fw=fw_coef,
                        co=co_coef,
                        mgc=files['mgc'],
                        c2acr=sptk_path['C2ACR'],
                        fl=fl_coef,
                        base_r0=files['mgc'] + '_r0'))

        # r0 of the post-filtered cepstrum.
        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{freqt} -m {order} -a {fw} -M {co} -A 0 | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'.format(
                        vopr=sptk_path['VOPR'],
                        order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        freqt=sptk_path['FREQT'],
                        fw=fw_coef,
                        co=co_coef,
                        c2acr=sptk_path['C2ACR'],
                        fl=fl_coef,
                        base_p_r0=files['mgc'] + '_p_r0'))

        # b0 (0th MLSA filter coefficient) of the post-filtered cepstrum.
        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 0 -e 0 > {base_b0}'.format(
                        vopr=sptk_path['VOPR'],
                        order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'],
                        fw=fw_coef,
                        bcp=sptk_path['BCP'],
                        base_b0=files['mgc'] + '_b0'))

        # Energy-compensated b0: b0 + 0.5*ln(r0/p_r0).
        run_process(
            '{vopr} -d < {base_r0} {base_p_r0} | '
            '{sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'.format(
                vopr=sptk_path['VOPR'],
                base_r0=files['mgc'] + '_r0',
                base_p_r0=files['mgc'] + '_p_r0',
                sopr=sptk_path['SOPR'],
                base_b0=files['mgc'] + '_b0',
                base_p_b0=files['mgc'] + '_p_b0'))

        # Merge the compensated b0 back with coefficients 1..order and
        # convert back to mel-cepstrum.
        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 1 -e {order} | '
                    '{merge} -n {order2} -s 0 -N 0 {base_p_b0} | '
                    '{b2mc} -m {order} -a {fw} > {base_p_mgc}'.format(
                        vopr=sptk_path['VOPR'],
                        order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'],
                        fw=fw_coef,
                        bcp=sptk_path['BCP'],
                        merge=sptk_path['MERGE'],
                        order2=mgc_dim - 2,
                        base_p_b0=files['mgc'] + '_p_b0',
                        b2mc=sptk_path['B2MC'],
                        base_p_mgc=files['mgc'] + '_p_mgc'))

        mgc_file_name = files['mgc'] + '_p_mgc'

    # Vocoder WORLD

    # lf0 -> linear F0 (the -1e10 magic value maps to 0 = unvoiced).
    run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | '
                '{x2x} +fd > {f0}'.format(sopr=sptk_path['SOPR'],
                                          lf0=files['lf0'],
                                          x2x=sptk_path['X2X'],
                                          f0=files['f0']))

    # bap -> aperiodicity as doubles for WORLD.
    run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
        sopr=sptk_path['SOPR'],
        bap=files['bap'],
        x2x=sptk_path['X2X'],
        ap=files['ap']))

    # mgc -> spectral envelope (power domain, rescaled from 16-bit range).
    run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | '
                '{sopr} -d 32768.0 -P | {x2x} +fd > {sp}'.format(
                    mgc2sp=sptk_path['MGC2SP'],
                    alpha=fw_alpha,
                    order=mgc_dim - 1,
                    fl=fl,
                    mgc=mgc_file_name,
                    sopr=sptk_path['SOPR'],
                    x2x=sptk_path['X2X'],
                    sp=files['sp']))

    run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
        synworld=world_path['SYNTHESIS'],
        fl=fl,
        sr=sr,
        f0=files['f0'],
        sp=files['sp'],
        ap=files['ap'],
        wav=files['wav']))

    # Clean up every intermediate file, keeping only the wav.
    run_process('rm -f {ap} {sp} {f0} {bap} {lf0} {mgc} {mgc}_b0 {mgc}_p_b0 '
                '{mgc}_p_mgc {mgc}_p_r0 {mgc}_r0 {cmp} weight'.format(
                    ap=files['ap'],
                    sp=files['sp'],
                    f0=files['f0'],
                    bap=files['bap'],
                    lf0=files['lf0'],
                    mgc=files['mgc'],
                    cmp=base + '.cmp'))
    os.chdir(cur_dir)
Пример #18
0
    def load_next_batch_S2SML(self):
        """Load one multi-file batch for sequence-to-sequence multi-level (MLU) training.

        Reads the comma-separated groups of label/acoustic/duration files at
        the current ``self.file_index``, splits the multi-level input labels
        into word/syllable/phone streams using the slice boundaries in
        ``self.MLU_div``, and packs everything into fixed-size buffers.

        Returns a 5-tuple:
            (shared_x, shared_y, shared_d) theano shared variables,
            word-level input matrix, frame-level output matrix,
            concatenated duration vector (word|syl|phone),
            and the additional-feature matrix (syllable+phone rows).
        """

        # Word-level input width: the two word slices of the MLU layout.
        inp_length = (self.MLU_div['word'][1] - self.MLU_div['word'][0]) + (
            self.MLU_div['word'][3] - self.MLU_div['word'][2])
        af_length = self.MLU_div['length'][-1]

        # Over-allocated buffers; trimmed to the frames actually filled below.
        new_temp_set_x = numpy.empty((self.buffer_size, inp_length))
        new_temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
        new_temp_set_af = numpy.empty((self.buffer_size, af_length))
        # Three duration streams: [word, syllable, phone].
        new_temp_set_d = [
            numpy.array([], 'int32'),
            numpy.array([], 'int32'),
            numpy.array([], 'int32')
        ]

        io_fun = BinaryIOCollection()

        lab_start_frame_number = 0
        lab_end_frame_number = 0

        out_start_frame_number = 0
        out_end_frame_number = 0

        # Each list entry is a comma-joined group of files forming one batch.
        new_x_files_list = self.x_files_list[self.file_index].split(',')
        new_y_files_list = self.y_files_list[self.file_index].split(',')
        new_dur_files_list = self.dur_files_list[self.file_index].split(',')

        for new_file_index in xrange(len(new_x_files_list)):
            in_features, lab_frame_number = io_fun.load_binary_file_frame(
                new_x_files_list[new_file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(
                new_y_files_list[new_file_index], self.n_outs)
            dur_features, dur_frame_number = io_fun.load_binary_file_frame(
                new_dur_files_list[new_file_index], 1)

            ### MLU features sub-division ###
            temp_set_MLU = in_features[0:lab_frame_number, ]
            temp_set_y = out_features[0:out_frame_number, ]

            # Each linguistic level is stored as two disjoint column ranges;
            # concatenate them back into a single feature matrix per level.
            temp_set_phone = numpy.concatenate([
                temp_set_MLU[:, self.MLU_div['phone'][0]:self.
                             MLU_div['phone'][1]],
                temp_set_MLU[:,
                             self.MLU_div['phone'][2]:self.MLU_div['phone'][3]]
            ],
                                               axis=1)
            temp_set_syl = numpy.concatenate([
                temp_set_MLU[:, self.MLU_div['syl'][0]:self.MLU_div['syl'][1]],
                temp_set_MLU[:, self.MLU_div['syl'][2]:self.MLU_div['syl'][3]]
            ],
                                             axis=1)
            temp_set_word = numpy.concatenate([
                temp_set_MLU[:,
                             self.MLU_div['word'][0]:self.MLU_div['word'][1]],
                temp_set_MLU[:,
                             self.MLU_div['word'][2]:self.MLU_div['word'][3]]
            ],
                                              axis=1)

            ### duration array sub-division ###
            # Duration file layout: word durations, then syllable durations,
            # then one entry per phone (the last lab_frame_number entries).
            dur_features = numpy.reshape(dur_features, (-1, ))
            temp_set_d = dur_features.astype(int)
            dur_word_syl = temp_set_d[0:-lab_frame_number]

            num_ph = lab_frame_number
            # Syllable durations (in phones) sum to the phone count; find the
            # split point by scanning the tail of dur_word_syl.
            num_syl = (numpy.where(
                numpy.cumsum(dur_word_syl[::-1]) == lab_frame_number)[0][0] +
                       1)
            num_words = len(dur_word_syl) - num_syl

            temp_set_dur_phone = temp_set_d[-num_ph:]
            temp_set_dur_word = dur_word_syl[0:num_words]
            temp_set_dur_syl = dur_word_syl[num_words:]

            ### additional feature matrix (syllable+phone+frame=432) ###
            num_frames = sum(temp_set_dur_phone)
            temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))

            # Syllable features: take the row at each syllable's final phone.
            temp_set_af[0:num_syl, self.MLU_div['length'][0]:self.
                        MLU_div['length'][1]] = temp_set_syl[
                            numpy.cumsum(temp_set_dur_syl) - 1]
            temp_set_af[0:num_ph, self.MLU_div['length'][1]:self.
                        MLU_div['length'][2]] = temp_set_phone

            ### input word feature matrix ###
            # For each word, locate its final phone index and take that row
            # of the word-level features as the word's representation.
            temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
            syl_bound = numpy.cumsum(temp_set_dur_word)
            for indx in xrange(num_words):
                temp_set_dur_word_segments[indx] = int(
                    sum(temp_set_dur_syl[0:syl_bound[indx]]))
            temp_set_x = temp_set_word[temp_set_dur_word_segments - 1]

            ### for batch processing ###
            lab_end_frame_number += num_words
            out_end_frame_number += out_frame_number

            new_temp_set_x[lab_start_frame_number:lab_end_frame_number,
                           ] = temp_set_x[0:num_words, ]
            new_temp_set_y[out_start_frame_number:out_end_frame_number,
                           ] = temp_set_y[0:out_frame_number, ]
            new_temp_set_af[out_start_frame_number:out_end_frame_number,
                            ] = temp_set_af[0:out_frame_number, ]

            new_temp_set_d[0] = numpy.append(new_temp_set_d[0],
                                             temp_set_dur_word)
            new_temp_set_d[1] = numpy.append(new_temp_set_d[1],
                                             temp_set_dur_syl)
            new_temp_set_d[2] = numpy.append(new_temp_set_d[2],
                                             temp_set_dur_phone)

            lab_start_frame_number = lab_end_frame_number
            out_start_frame_number = out_end_frame_number

        # Trim the over-allocated buffers to the frames actually written.
        new_temp_set_x = new_temp_set_x[0:lab_end_frame_number, ]
        new_temp_set_y = new_temp_set_y[0:out_end_frame_number, ]
        new_temp_set_af = new_temp_set_af[0:out_end_frame_number, ]

        new_temp_set_d = numpy.concatenate(
            (new_temp_set_d[0], new_temp_set_d[1], new_temp_set_d[2]))

        ### rest of the code similar to S2S ###
        self.file_index += 1

        # Wrap around once the whole list has been consumed.
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(new_temp_set_x, 'x')
        shared_set_y = self.make_shared(new_temp_set_y, 'y')
        shared_set_d = theano.shared(numpy.asarray(new_temp_set_d,
                                                   dtype='int32'),
                                     name='d',
                                     borrow=True)

        shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

        return shared_set_xyd, new_temp_set_x, new_temp_set_y, new_temp_set_d, new_temp_set_af
Пример #19
0
    def load_next_partition(self):
        """Load one block of data. The number of frames will be the buffer size set during intialisation.

        Fills the block first from any frames left over from the previous
        utterance, then from successive files, splitting an utterance across
        blocks when it does not fit.  Both matrices are shuffled with the
        same fixed seed so frame alignment is preserved.

        Returns ((shared_x, shared_y), temp_set_x, temp_set_y).

        Raises ValueError if a label/acoustic pair differs by 5 or more
        frames (small 1-2 frame mismatches are tolerated and truncated).
        """

        self.logger.debug('loading next partition')

        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
        current_index = 0

        ### first check whether there are remaining data from previous utterance
        if self.remain_frame_number > 0:
            temp_set_x[
                current_index:self.remain_frame_number, ] = self.remain_data_x
            temp_set_y[
                current_index:self.remain_frame_number, ] = self.remain_data_y
            current_index += self.remain_frame_number

            self.remain_frame_number = 0

        io_fun = BinaryIOCollection()
        while True:
            if current_index >= self.buffer_size:
                break
            if self.file_index >= self.list_size:
                self.end_reading = True
                self.file_index = 0
                break

            in_features, lab_frame_number = io_fun.load_binary_file_frame(
                self.x_files_list[self.file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(
                self.y_files_list[self.file_index], self.n_outs)

            frame_number = lab_frame_number
            if abs(
                    lab_frame_number - out_frame_number
            ) < 5:  ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
                if lab_frame_number > out_frame_number:
                    frame_number = out_frame_number
            else:
                base_file_name = self.x_files_list[self.file_index].split(
                    '/')[-1].split('.')[0]
                self.logger.critical(
                    "the number of frames in label and acoustic features are different: %d vs %d (%s)"
                    % (lab_frame_number, out_frame_number, base_file_name))
                # BUG FIX: the original used a bare `raise` with no active
                # exception, which itself errors out; raise explicitly.
                raise ValueError(
                    "the number of frames in label and acoustic features are different: %d vs %d (%s)"
                    % (lab_frame_number, out_frame_number, base_file_name))

            out_features = out_features[0:frame_number, ]
            in_features = in_features[0:frame_number, ]

            if current_index + frame_number <= self.buffer_size:
                temp_set_x[current_index:current_index +
                           frame_number, ] = in_features
                temp_set_y[current_index:current_index +
                           frame_number, ] = out_features

                current_index = current_index + frame_number
            else:  ## if current utterance cannot be stored in the block, then leave the remaining part for the next block
                used_frame_number = self.buffer_size - current_index
                temp_set_x[current_index:self.buffer_size, ] = in_features[
                    0:used_frame_number, ]
                temp_set_y[current_index:self.buffer_size, ] = out_features[
                    0:used_frame_number, ]
                current_index = self.buffer_size

                self.remain_data_x = in_features[
                    used_frame_number:frame_number, ]
                self.remain_data_y = out_features[
                    used_frame_number:frame_number, ]
                self.remain_frame_number = frame_number - used_frame_number

            self.file_index += 1

        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

        # Identical fixed seed before each shuffle keeps x/y rows aligned.
        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_x)
        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_y)

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)
        #        temp_set_x = self.make_shared(temp_set_x, 'x')
        #        temp_set_y = self.make_shared(temp_set_y, 'y')

        return shared_set_xy, temp_set_x, temp_set_y
Пример #20
0
def generate_wav(gen_dir, file_id_list, cfg):
    """Synthesise a waveform for every utterance in file_id_list.

    Converts the generated parameter files (mgc/lf0/bap) found in gen_dir
    into speech with either the STRAIGHT or WORLD vocoder, as selected by
    cfg.vocoder_type, optionally applying SPTK post-filtering or global
    variance (GV) enhancement first.  Works entirely through the external
    SPTK/STRAIGHT/WORLD command-line tools; returns None.

    Raises ValueError for an unknown cfg.fw_alpha, an unsupported vocoder,
    or when post-filtering and GV enhancement are both enabled.
    """

    logger = logging.getLogger("wav_generation")

    SPTK = cfg.SPTK
    #    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, basestring):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            # BUG FIX: this branch originally called bark_alpha() as well
            # (copy-paste); the ERB warping coefficient comes from
            # erb_alpha() -- TODO confirm the helper name in this module.
            fw_coef = erb_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.'
            )
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    # Fail fast on a contradictory configuration.  Originally this was
    # checked per-utterance *after* post-filtering had already run, and
    # used a bare `raise` with no active exception.
    if cfg.do_post_filtering and cfg.apply_GV:
        logger.critical(
            'Both smoothing techniques together can\'t be applied!!\n')
        raise ValueError(
            "do_post_filtering and apply_GV cannot both be enabled")

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:

        logger.info('creating waveform for %4d of %4d: %s' %
                    (counter, max_counter, filename))
        counter = counter + 1
        base = filename
        files = {
            'sp': base + cfg.sp_ext,
            'mgc': base + cfg.mgc_ext,
            'f0': base + '.f0',
            'lf0': base + cfg.lf0_ext,
            'ap': base + '.ap',
            'bap': base + cfg.bap_ext,
            'wav': base + '.wav'
        }

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:
            # Build the per-coefficient post-filter weight vector
            # (1 1 pf pf ... pf) and convert it to binary floats.
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "

            run_process('{line} >{weighttxt}'.format(line=line,
                                                     weighttxt=os.path.join(
                                                         gen_dir,
                                                         'weight.txt')))
            run_process('{x2x} +af < {weighttxt} > {weight}'.format(
                x2x=SPTK['X2X'],
                weighttxt=os.path.join(gen_dir, 'weight.txt'),
                weight=os.path.join(gen_dir, 'weight.bin')))

            # r0 of the unfiltered cepstrum (energy reference).
            run_process(
                '{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} > {temp1}'.
                format(freqt=SPTK['FREQT'],
                       order=cfg.mgc_dim - 1,
                       fw=fw_coef,
                       co=co_coef,
                       mgc=files['mgc'],
                       temp1=files['mgc'] + '_r0temp1'))
            run_process(
                '{c2acr} -m {co} -M 0 -l {fl} <{temp1} > {base_r0}'.format(
                    co=co_coef,
                    c2acr=SPTK['C2ACR'],
                    fl=fl_coef,
                    base_r0=files['mgc'] + '_r0',
                    temp1=files['mgc'] + '_r0temp1'))

            # r0 of the post-filtered cepstrum.
            run_process(
                '{vopr} -m -n {order} < {mgc} {weight} > {temp2}'.format(
                    vopr=SPTK['VOPR'],
                    order=cfg.mgc_dim - 1,
                    mgc=files['mgc'],
                    weight=os.path.join(gen_dir, 'weight.bin'),
                    temp2=files['mgc'] + '_mgctemp2'))
            run_process(
                '{freqt} -m {order} -a {fw} -M {co} -A 0 < {temp2} > {temp3}'.
                format(order=cfg.mgc_dim - 1,
                       freqt=SPTK['FREQT'],
                       fw=fw_coef,
                       co=co_coef,
                       temp2=files['mgc'] + '_mgctemp2',
                       temp3=files['mgc'] + '_mgctemp3'))
            run_process(
                '{c2acr} -m {co} -M 0 -l {fl} < {temp3} > {base_p_r0}'.format(
                    temp3=files['mgc'] + '_mgctemp3',
                    co=co_coef,
                    c2acr=SPTK['C2ACR'],
                    fl=fl_coef,
                    base_p_r0=files['mgc'] + '_p_r0'))

            # b0 (0th MLSA filter coefficient) of the post-filtered cepstrum.
            run_process(
                '{vopr} -m -n {order} < {mgc} {weight} > {temp4}'.format(
                    vopr=SPTK['VOPR'],
                    order=cfg.mgc_dim - 1,
                    mgc=files['mgc'],
                    weight=os.path.join(gen_dir, 'weight.bin'),
                    temp4=files['mgc'] + '_mgctemp4'))
            run_process('{mc2b} -m {order} -a {fw} < {temp4} > {temp5}'.format(
                order=cfg.mgc_dim - 1,
                mc2b=SPTK['MC2B'],
                fw=fw_coef,
                temp4=files['mgc'] + '_mgctemp4',
                temp5=files['mgc'] + '_mgctemp5'))
            run_process(
                '{bcp} -n {order} -s 0 -e 0 < {temp5} > {base_b0}'.format(
                    order=cfg.mgc_dim - 1,
                    bcp=SPTK['BCP'],
                    base_b0=files['mgc'] + '_b0',
                    temp5=files['mgc'] + '_mgctemp5'))

            # Energy-compensated b0: b0 + 0.5*ln(r0/p_r0).
            run_process('{vopr} -d < {base_r0} {base_p_r0} > {temp6}'.format(
                vopr=SPTK['VOPR'],
                base_r0=files['mgc'] + '_r0',
                base_p_r0=files['mgc'] + '_p_r0',
                temp6=files['mgc'] + '_mgctemp6'))
            run_process('{sopr} -LN -d 2 < {temp6} > {temp7}'.format(
                sopr=SPTK['SOPR'],
                temp6=files['mgc'] + '_mgctemp6',
                temp7=files['mgc'] + '_mgctemp7'))
            run_process('{vopr} -a {base_b0} < {temp7} > {base_p_b0}'.format(
                vopr=SPTK['VOPR'],
                temp7=files['mgc'] + '_mgctemp7',
                base_b0=files['mgc'] + '_b0',
                base_p_b0=files['mgc'] + '_p_b0'))

            # Merge the compensated b0 back with coefficients 1..order and
            # convert back to mel-cepstrum.
            run_process(
                '{vopr} -m -n {order} < {mgc} {weight} > {temp8}'.format(
                    vopr=SPTK['VOPR'],
                    order=cfg.mgc_dim - 1,
                    mgc=files['mgc'],
                    weight=os.path.join(gen_dir, 'weight.bin'),
                    temp8=files['mgc'] + '_mgctemp8'))
            run_process('{mc2b} -m {order} -a {fw} < {temp8} > {temp9}'.format(
                order=cfg.mgc_dim - 1,
                mc2b=SPTK['MC2B'],
                fw=fw_coef,
                temp8=files['mgc'] + '_mgctemp8',
                temp9=files['mgc'] + '_mgctemp9'))
            run_process(
                '{bcp} -n {order} -s 1 -e {order} < {temp9} > {temp10}'.format(
                    order=cfg.mgc_dim - 1,
                    bcp=SPTK['BCP'],
                    temp9=files['mgc'] + '_mgctemp9',
                    temp10=files['mgc'] + '_mgctemp10'))
            run_process(
                '{merge} -n {order2} -s 0 -N 0 {base_p_b0} < {temp10} > {temp11}'
                .format(merge=SPTK['MERGE'],
                        order2=cfg.mgc_dim - 2,
                        base_p_b0=files['mgc'] + '_p_b0',
                        temp10=files['mgc'] + '_mgctemp10',
                        temp11=files['mgc'] + '_mgctemp11'))
            run_process(
                '{b2mc} -m {order} -a {fw} < {temp11} > {base_p_mgc}'.format(
                    order=cfg.mgc_dim - 1,
                    fw=fw_coef,
                    b2mc=SPTK['B2MC'],
                    base_p_mgc=files['mgc'] + '_p_mgc',
                    temp11=files['mgc'] + '_mgctemp11'))

            mgc_file_name = files['mgc'] + '_p_mgc'

        # GV enhancement: scale each utterance's per-dimension variance
        # towards the reference global-variance statistics.
        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(
                mgc_file_name, cfg.mgc_dim)

            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

            local_gv = (ref_gv_std / gen_gv_std) * (gen_std -
                                                    gen_gv_mean) + ref_gv_mean

            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(
                gen_std, frame_number, 1).T * (gen_mgc - np.repeat(
                    gen_mu, frame_number, 1).T) + np.repeat(
                        gen_mu, frame_number, 1).T

            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)

            mgc_file_name = files['mgc'] + '_p_mgc'

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process(
                '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                .format(mgc2sp=SPTK['MGC2SP'],
                        alpha=cfg.fw_alpha,
                        order=cfg.mgc_dim - 1,
                        fl=cfg.fl,
                        mgc=mgc_file_name,
                        sp=files['sp']))
            # lf0 -> linear F0 (magic -1e10 marks unvoiced -> 0), then ascii.
            run_process(
                '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(
                    sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'],
                                                        f0=files['f0'],
                                                        f0a=files['f0'] +
                                                        '.a'))

            if cfg.use_cep_ap:
                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                    .format(mgc2sp=SPTK['MGC2SP'],
                            alpha=cfg.fw_alpha,
                            order=cfg.bap_dim - 1,
                            fl=cfg.fl,
                            bap=files['bap'],
                            ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'.format(
                    bndap2ap=STRAIGHT['BNDAP2AP'],
                    bap=files['bap'],
                    ap=files['ap']))

            run_process(
                '{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 0.0 -cornf 400 -float -apfile {ap} {f0a} {sp} {wav}'
                .format(synfft=STRAIGHT['SYNTHESIS_FFT'],
                        sr=cfg.sr,
                        fl=cfg.fl,
                        shift=cfg.shift,
                        ap=files['ap'],
                        f0a=files['f0'] + '.a',
                        sp=files['sp'],
                        wav=files['wav']))

            run_process('rm -f {sp} {f0} {f0a} {ap}'.format(sp=files['sp'],
                                                            f0=files['f0'],
                                                            f0a=files['f0'] +
                                                            '.a',
                                                            ap=files['ap']))
        elif cfg.vocoder_type == 'WORLD':

            # lf0 -> linear F0 as doubles for WORLD.
            run_process(
                '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {temp12}'.
                format(sopr=SPTK['SOPR'],
                       lf0=files['lf0'],
                       temp12=files['f0'] + '_temp12'))
            run_process('{x2x} +fd < {temp12} > {f0}'.format(
                x2x=SPTK['X2X'],
                f0=files['f0'],
                temp12=files['f0'] + '_temp12'))

            # bap -> aperiodicity as doubles.
            run_process('{sopr} -c 0 {bap} > {temp13}'.format(
                sopr=SPTK['SOPR'],
                bap=files['bap'],
                temp13=files['ap'] + '_temp13'))
            run_process('{x2x} +fd < {temp13} > {ap}'.format(
                x2x=SPTK['X2X'],
                ap=files['ap'],
                temp13=files['ap'] + '_temp13'))

            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

            # mgc -> spectral envelope (power domain, rescaled from 16-bit).
            run_process(
                '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {temp14}'
                .format(mgc2sp=SPTK['MGC2SP'],
                        alpha=cfg.fw_alpha,
                        order=cfg.mgc_dim - 1,
                        fl=cfg.fl,
                        mgc=mgc_file_name,
                        temp14=files['sp'] + '_temp14'))
            run_process('{sopr} -d 32768.0 -P < {temp14} > {temp15}'.format(
                sopr=SPTK['SOPR'],
                temp14=files['sp'] + '_temp14',
                temp15=files['sp'] + '_temp15'))
            run_process('{x2x} +fd < {temp15} > {sp}'.format(
                x2x=SPTK['X2X'],
                sp=files['sp'],
                temp15=files['sp'] + '_temp15'))

            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
                synworld=WORLD['SYNTHESIS'],
                fl=cfg.fl,
                sr=cfg.sr,
                f0=files['f0'],
                sp=files['sp'],
                ap=files['ap'],
                wav=files['wav']))

            #run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],sp=files['sp'],f0=files['f0']))

        else:

            logger.critical('The vocoder %s is not supported yet!\n' %
                            cfg.vocoder_type)
            # BUG FIX: was a bare `raise` with no active exception.
            raise ValueError('The vocoder %s is not supported yet!' %
                             cfg.vocoder_type)

        os.chdir(cur_dir)
Пример #21
0
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension, \
                                       silence_feature_index, percent_to_keep=0):
    '''
    Function to trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files (written by this function)
        in_dimension: dimension of data to trim
        label_list: list of binary labels which contain trimming criterion
        label_dimension: dimension of the label files
        silence_feature_index: index of feature in labels which is silence: 1 means silence (trim), 0 means leave.
        percent_to_keep: if a positive int, retain every (100 // percent_to_keep)-th
            silent frame instead of removing all silence.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):

        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(audio_label_difference) < 3,'%s and %s contain different numbers of frames: %s %s'%(infile, label_file,  data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:  ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0: ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        #else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## If it's all 0s or all 1s, that's ok; anything else is an error.
            ## BUG FIX: '.all()' was previously applied to numpy.array([0]) /
            ## numpy.array([1]) (always False) instead of to the comparison
            ## result, which made this sanity check meaningless.
            assert (numpy.unique(silence_flag) == numpy.array([0])).all() or \
                   (numpy.unique(silence_flag) == numpy.array([1])).all(), \
                   'dimension %s of %s contains values other than 0 and 1'%(silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... '%(100 * numpy.sum(silence_flag / float(len(silence_flag))), int(numpy.sum(silence_flag))))
        non_silence_indices = numpy.nonzero(silence_flag == 0)  ## get the indices where silence_flag == 0 is True (i.e. != 0)
        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of input array
            silence_indices = silence_indices[0]
            ## BUG FIX: use floor division -- under Python 3 plain '/' yields a
            ## float, which is not a valid slice step.
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as step value in slice
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1]) ## avoid errors in case there is no silence
            print('   Restore %s%% (every %sth frame: %s frames) of silent frames'%(percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = ( numpy.hstack( [non_silence_indices[0], silence_indices_to_keep] ) )
                                                    ##  ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
Пример #22
0
    def extract_acousitc_label_features(self, orig_file, output_file):
        """Expand phone-level labels plus 5 state durations into frame-level features.

        orig_file: text matrix where each row is one phone: linguistic label
            columns followed by 5 per-state duration columns (in frames).
        output_file: binary output; one row per frame, containing the phone's
            label vector plus 9 appended positional/duration features.

        Side effect: sets self.label_dimension to label_len + 9.
        """
        io_funcs = BinaryIOCollection()
        totalMat = io_funcs.file2matrix(orig_file, numpy.int)
        labelMat = totalMat[:, :-5]  # phone-level linguistic features
        durMat = totalMat[:, -5:]    # per-state durations in frames (5 states/phone)

        label_len = totalMat.shape[1] - 5

        # 9 extra dimensions: frame/state/phone positional features appended below.
        self.label_dimension = label_len + 9

        phone_number = labelMat.shape[0]

        # Allocate exactly the number of frames needed (sum of all state
        # durations) instead of a fixed 100000-row buffer, which silently
        # overflowed for utterances longer than 100000 frames.
        total_frame_number = int(durMat.sum())
        label_feature_matrix = numpy.zeros(
            (total_frame_number, self.label_dimension))

        state_number = 5
        label_feature_index = 0

        for phone_index in xrange(phone_number):
            label_vector = labelMat[phone_index, :]
            state_vector = durMat[phone_index, :]
            phone_duration = 0
            # state_duration_bases[s] = number of frames in this phone before state s
            state_duration_bases = numpy.zeros((5, ), dtype=numpy.int)
            for state_index in xrange(state_number):
                state_duration_bases[state_index] = phone_duration
                phone_duration = phone_duration + state_vector[state_index]

            for state_index in xrange(state_number):
                frame_number = state_vector[state_index]
                current_block_binary_array = numpy.zeros(
                    (frame_number, self.label_dimension))
                state_duration_base = state_duration_bases[state_index]
                state_index_backward = state_number - state_index
                for i in xrange(frame_number):
                    current_block_binary_array[i, 0:label_len] = label_vector

                    current_block_binary_array[
                        i, label_len] = float(i + 1) / float(
                            frame_number)  ## fraction through state (forwards)
                    current_block_binary_array[
                        i, label_len + 1] = float(frame_number - i) / float(
                            frame_number
                        )  ## fraction through state (backwards)
                    current_block_binary_array[i, label_len + 2] = float(
                        frame_number)  ## length of state in frames
                    current_block_binary_array[i, label_len + 3] = float(
                        state_index)  ## state index (counting forwards)
                    current_block_binary_array[i, label_len + 4] = float(
                        state_index_backward
                    )  ## state index (counting backwards)

                    current_block_binary_array[i, label_len + 5] = float(
                        phone_duration)  ## length of phone in frames
                    current_block_binary_array[
                        i, label_len + 6] = float(frame_number) / float(
                            phone_duration
                        )  ## fraction of the phone made up by current state
                    current_block_binary_array[i, label_len + 7] = float(
                        phone_duration - i - state_duration_base) / float(
                            phone_duration
                        )  ## fraction through phone (forwards)
                    current_block_binary_array[
                        i, label_len +
                        8] = float(state_duration_base + i + 1) / float(
                            phone_duration
                        )  ## fraction through phone (backwards)

                label_feature_matrix[
                    label_feature_index:label_feature_index +
                    frame_number, ] = current_block_binary_array
                label_feature_index = label_feature_index + frame_number

        # With exact pre-allocation this trim is a no-op, kept for safety.
        label_feature_matrix = label_feature_matrix[0:label_feature_index, ]

        io_funcs.array_to_binary_file(label_feature_matrix, output_file)
Пример #23
0
    def modify_dur_from_state_alignment_labels(self, label_file_name,
                                               gen_dur_file_name,
                                               gen_lab_file_name):
        """Rewrite a state-aligned label file using predicted state durations.

        label_file_name: input HTK-style label file; each line is either
            'start end label[k]' (state-level) or a bare label (phone-level).
        gen_dur_file_name: binary file of predicted durations,
            self.state_number values per non-silent phone, in 5ms frames.
        gen_lab_file_name: output label file with timestamps recomputed from
            the predictions; silence keeps its original/hard-coded duration.
        """
        logger = logging.getLogger("dur")

        state_number = self.state_number
        dur_dim = state_number  # one predicted duration per state

        io_funcs = BinaryIOCollection()
        dur_features, frame_number = io_funcs.load_binary_file_frame(
            gen_dur_file_name, dur_dim)

        fid = open(label_file_name)
        utt_labels = fid.readlines()
        fid.close()

        label_number = len(utt_labels)
        logger.info('loaded %s, %3d labels' % (label_file_name, label_number))

        out_fid = open(gen_lab_file_name, 'w')

        # current_index: row into dur_features, advanced once per non-silent phone.
        # prev_end_time: running end time in HTK 100ns units.
        current_index = 0
        prev_end_time = 0
        for line in utt_labels:
            line = line.strip()

            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)

            if len(temp_list) == 1:
                # Bare label (no timestamps): phone-level line.
                start_time = 0
                end_time = 600000  ## hard-coded silence duration
                full_label = temp_list[0]
            else:
                start_time = int(temp_list[0])
                end_time = int(temp_list[1])
                full_label = temp_list[2]

                # State index is the digit inside the trailing '[k]' suffix.
                full_label_length = len(
                    full_label) - 3  # remove state information [k]
                state_index = full_label[full_label_length + 1]
                state_index = int(state_index) - 1

            label_binary_flag = self.check_silence_pattern(full_label)

            if len(temp_list) == 1:
                # Phone-level line: emit all states, appending '[k]' ourselves.
                for state_index in range(1, state_number + 1):
                    if label_binary_flag == 1:
                        # Silence: keep the hard-coded duration.
                        current_state_dur = end_time - start_time
                    else:
                        # Predicted duration is in 5ms frames; convert to 100ns units.
                        pred_state_dur = dur_features[current_index,
                                                      state_index - 1]
                        current_state_dur = int(pred_state_dur) * 5 * 10000
                    out_fid.write(
                        str(prev_end_time) + ' ' +
                        str(prev_end_time + current_state_dur) + ' ' +
                        full_label + '[' + str(state_index + 1) + ']\n')
                    prev_end_time = prev_end_time + current_state_dur
            else:
                if label_binary_flag == 1:
                    current_state_dur = end_time - start_time
                else:
                    pred_state_dur = dur_features[current_index,
                                                  state_index - 1]
                    current_state_dur = int(pred_state_dur) * 5 * 10000
                out_fid.write(
                    str(prev_end_time) + ' ' +
                    str(prev_end_time + current_state_dur) + ' ' + full_label +
                    '\n')
                prev_end_time = prev_end_time + current_state_dur

            # Advance to the next phone's duration row after its last state.
            # NOTE(review): relies on the value state_index was left with by the
            # branch above (HMM state numbering convention) -- verify upstream.
            if state_index == state_number and label_binary_flag != 1:
                current_index += 1

        # BUG FIX: the output file handle was never closed (leak / potentially
        # unflushed output on some interpreters).
        out_fid.close()

        logger.debug(
            'modifed label with predicted duration of %d frames x %d features'
            % dur_features.shape)
Пример #24
0
                dr = int(dr) * 5 * 10000
                op1.write(
                    str(prev_ed) + ' ' + str(prev_ed + dr) + ' ' + fstr[2] +
                    '\n')
                prev_ed = prev_ed + dr

            count = count + 1

        ip2.close()
        op1.close()


if __name__ == "__main__":

    # Helpers for reading HTS label files and binary feature files.
    htsclass = readHTSlabelFile()
    io_funcs = BinaryIOCollection()

    ### speaker ###
    # Experiment configuration (hard-coded for this setup).
    speaker = 'blz16'
    decomposition_unit = 'phone'
    normalization = 'MVN'  # mean-variance normalisation

    # Output feature dimensionality and number of CTC classes.
    # NOTE(review): exact semantics of CTC_classes not visible here -- confirm.
    out_dim = 8
    CTC_classes = 12

    ### Absolute work path ###
    work_dir = '/afs/inf.ed.ac.uk/group/cstr/projects/phd/s1432486/work/Hybrid_prosody_model/'

    # State-aligned labels for this speaker; feature directory name is derived
    # from the decomposition unit and output dimension.
    label_align_dir = os.path.join(
        work_dir, 'Data/inter-module/' + speaker + '/label_state_align')
    feat_dir_path = 'dur_' + decomposition_unit + '_' + str(out_dim)
Пример #25
0
    def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict,
                     out_dimension_dict):
        """Compose per-stream acoustic features into single per-file matrices.

        For every output file, load each input stream, interpolate f0 streams
        (recording V/UV if configured), optionally append delta/acc dynamic
        features, and write the composed matrix as a binary file.
        """
        logger = logging.getLogger("acoustic_comp")

        # Column offset of each stream inside the composed output vector.
        stream_start_index = {}
        offset = 0
        for name in list(out_dimension_dict.keys()):
            stream_start_index[name] = offset
            offset += out_dimension_dict[name]

        io_funcs = BinaryIOCollection()

        for file_index in range(self.file_number):
            out_file_name = out_file_list[file_index]

            logger.info('processing file %4d of %4d : %s' %
                        (file_index + 1, self.file_number, out_file_name))

            composed = None
            composed_frames = 0

            for stream_pos in range(self.data_stream_number):
                stream_name = self.data_stream_list[stream_pos]
                stream_file = in_file_list_dict[stream_name][file_index]
                stream_dim = in_dimension_dict[stream_name]
                features, frame_number = io_funcs.load_binary_file_frame(
                    stream_file, stream_dim)

                if stream_pos == 0:
                    # The first stream fixes the frame count for this utterance.
                    composed_frames = frame_number
                    composed = numpy.zeros(
                        (composed_frames, self.out_dimension))

                # A slightly longer stream is truncated to match.
                if frame_number > composed_frames:
                    features = features[0:composed_frames, ]
                    frame_number = composed_frames

                try:
                    assert composed_frames == frame_number
                except AssertionError:
                    logger.critical(
                        'the frame number of data stream %s is not consistent with others: current %d others %d'
                        % (stream_name, composed_frames, frame_number))
                    raise

                col = stream_start_index[stream_name]

                if stream_name in ['lf0', 'F0']:  ## F0 added for GlottHMM
                    features, vuv_vector = self.interpolate_f0(features)

                    # Record the voiced/unvoiced flag in its own column if requested.
                    if self.record_vuv:
                        vuv_col = stream_start_index['vuv']
                        composed[0:composed_frames,
                                 vuv_col:vuv_col + 1] = vuv_vector

                composed[0:composed_frames, col:col + stream_dim] = features
                col = col + stream_dim

                if self.compute_dynamic[stream_name]:
                    # Append delta then acceleration features after the statics.
                    for window in (self.delta_win, self.acc_win):
                        dynamic = self.compute_dynamic_matrix(
                            features, window, frame_number, stream_dim)
                        composed[0:composed_frames,
                                 col:col + stream_dim] = dynamic
                        col = col + stream_dim

            ### write data to file
            io_funcs.array_to_binary_file(composed, out_file_name)
            logger.debug(' wrote %d frames of features', composed_frames)
Пример #26
0
    def acoustic_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict, var_file_dict, do_MLPG=True, cfg=None):
        """Split composed acoustic feature files into per-stream parameter files.

        For each input file: slice out every stream listed in
        self.gen_wav_features, optionally apply MLPG smoothing using the
        stream's variances, post-process f0 with V/UV information, optionally
        force silence regions (from label alignments in cfg), and write each
        stream next to the input using its configured file extension.
        """
        # pdb.set_trace()
        logger = logging.getLogger('param_generation')

        logger.debug('acoustic_decomposition for %d files' % len(in_file_list) )

        self.load_covariance(var_file_dict, out_dimension_dict)

        # Column offset of each stream inside the composed input vector.
        stream_start_index = {}
        dimension_index = 0
        recorded_vuv = False   # NOTE(review): appears unused in this method
        vuv_dimension = None   # NOTE(review): appears unused in this method

        for feature_name in list(out_dimension_dict.keys()):
            stream_start_index[feature_name] = dimension_index
            dimension_index += out_dimension_dict[feature_name]

        io_funcs = BinaryIOCollection()

        mlpg_algo = MLParameterGeneration()

        findex=0
        flen=len(in_file_list)
        for file_name in in_file_list:

            findex=findex+1

            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)

          #  logger.info('processing %4d of %4d: %s' % (findex,flen,file_name) )
          #   if file_name == "/home/gyzhang/merlin/egs/kingtts/s2/experiments/kingtts/acoustic_model/gen/feed_forward_4_relu/103002.cmp":
          #       pdb.set_trace()
            for feature_name in self.gen_wav_features:

                logger.debug(' feature: %s' % feature_name)

                # Slice this stream's columns out of the composed matrix.
                current_features = features[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]
                if FAST_MLPG:
                    ### fast version wants variance per frame, not single global one:
                    var = self.var[feature_name]
                    var = numpy.transpose(numpy.tile(var,frame_number))
                else:
                    var = self.var[feature_name]

#                print  var.shape[1]
                if do_MLPG == False:
                    gen_features = current_features
                else:
                    # Stream dimension includes static+delta+acc, hence //3 for statics.
                    gen_features = mlpg_algo.generation(current_features, var, out_dimension_dict[feature_name]//3)
                logger.debug(' feature dimensions: %d by %d' %(gen_features.shape[0], gen_features.shape[1]))

                if feature_name in ['lf0', 'F0']:
                    if 'vuv' in stream_start_index:
                        vuv_feature = features[:, stream_start_index['vuv']:stream_start_index['vuv']+1]

                        # Mark unvoiced frames (or implausibly low f0) with the
                        # sentinel self.inf_float so the vocoder treats them as unvoiced.
                        for i in range(frame_number):
                            if vuv_feature[i, 0] < 0.5 or gen_features[i, 0] < numpy.log(20):
                                gen_features[i, 0] = self.inf_float

                new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

                if self.enforce_silence:
                    # Overwrite frames aligned to silence labels: f0-like streams
                    # get the unvoiced sentinel, others get zeros.
                    silence_pattern = cfg.silence_pattern
                    label_align_dir = cfg.in_label_align_dir
                    in_f = open(label_align_dir+'/'+file_id+'.lab','r')
                    for line in in_f.readlines():
                        line = line.strip()
                        if len(line) < 1:
                            continue
                        temp_list  = re.split('\s+', line)
                        # Convert HTK 100ns units to 5ms frame indices.
                        start_time = int(int(temp_list[0])*(10**-4)/5)
                        end_time   = int(int(temp_list[1])*(10**-4)/5)

                        full_label = temp_list[2]

                        label_binary_flag = self.check_silence_pattern(full_label, silence_pattern)

                        if label_binary_flag:
                            if feature_name in ['lf0', 'F0', 'mag']:
                                gen_features[start_time:end_time, :] = self.inf_float
                            else:
                                gen_features[start_time:end_time, :] = 0.0

                io_funcs.array_to_binary_file(gen_features, new_file_name)
                logger.debug(' wrote to file %s' % new_file_name)