Example #1
    def process_utterance(self, utt, make_label=True):

        utt_data = []

        utt_questions = defaultdict(int)  ## question name -> count

        nodelist = utt.xpath(self.target_nodes)
        if not nodelist:
            print('WARNING: FeatureDumper\'s target_nodes matches no nodes: %s' % self.config["target_nodes"])

        for node in nodelist:
            node_data, node_questions = self.get_node_context_label(node)
            utt_data.append(node_data)

            ## Sum the dictionaries' values (dict.update() would overwrite the
            ## counts rather than accumulate them):
            for question in node_questions:
                utt_questions[question] += node_questions[question]

        if make_label:
            label_file = utt.get_filename(self.output_filetype)
            if self.binary_output:
                utt_data = [line.split(' ') for line in utt_data]
                ## If any string data is present, the following line will raise:
                ## ValueError: could not convert string to float
                utt_data = numpy.array(utt_data, dtype='float')
                put_speech(utt_data, label_file)
            else:
                writelist(utt_data, label_file, uni=True)

        return (utt_data, utt_questions)  ## returned for writing utterance-level labels
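
A minimal usage sketch for the dumper above, under the assumption that there is a FeatureDumper-like instance `dumper` and an iterable `corpus` of utterance objects (both names are illustrative, not taken from the code): it accumulates the per-utterance question counts over a whole corpus, mirroring the summing done inside process_utterance.

    from collections import defaultdict

    corpus_questions = defaultdict(int)   ## question name -> total count over the corpus
    for utt in corpus:                    ## hypothetical iterable of utterance objects
        utt_data, utt_questions = dumper.process_utterance(utt, make_label=True)
        for question, count in utt_questions.items():
            corpus_questions[question] += count
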
Example #2
    def generate_from_norm_binary_lab(self, bin_label_file, labdim, outwave, enforce_silence=False, mlpg=True,
                                      vuv_thresh=0.5, fzero_scale=1.0):

        input = get_speech(bin_label_file, labdim)

        output = self.predict(input, input_normalisation=True)

        streams = self.split_into_streams(output)

        if mlpg:
            mlpged = {}
            for (stream, data) in streams.items():
                if stream in self.indims:
                    mlpg_data = self.param_generator.generation(data, self.stream_std[stream], self.indims[stream])
                else:
                    mlpg_data = data
                mlpged[stream] = mlpg_data
            streams = mlpged

        else:
            # take statics only!
            statics = {}
            for (stream, data) in streams.items():
                if stream in self.indims:
                    statics[stream] = data[:, :self.indims[stream]]
                else:  ## for e.g. vuv
                    statics[stream] = data
            streams = statics

        if enforce_silence:
            for (stream, data) in streams.items():
                silent_frames = numpy.sum(input[:, self.silent_feature_indices], axis=1)
                data[silent_frames == 1.0, :] = 0.0
                streams[stream] = data

        if 'lf0' in streams:
            fzero = numpy.exp(streams['lf0'])

            if 'vuv' in streams:
                vuv = streams['vuv']
                fzero[vuv <= vuv_thresh] = 0.0   ## zero F0 in frames judged unvoiced

            fzero *= fzero_scale

            streams['lf0'] = fzero

        self.world_resynth(streams, outwave)
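
A sketch of how this generation method might be called, assuming a trained model instance `model`; the file names and the label dimensionality of 601 are purely illustrative and must match whatever the feature dumper actually produced:

    model.generate_from_norm_binary_lab('herald_001.lab', labdim=601,
                                        outwave='herald_001.wav',
                                        enforce_silence=True, mlpg=True,
                                        vuv_thresh=0.5, fzero_scale=1.0)
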
Example #3
    def world_resynth(self, streams, outfile):
        '''
        Refactored from AcousticModel. TODO: clean up further, and also replace the copy in AcousticModel.
        '''
        
        bin_dir = self.hts_dir     ## the WORLD binaries live here too

        alpha = self.alpha         ## e.g. 0.71
        order = self.mcep_order    ## e.g. 59
        sr = self.sample_rate      ## e.g. 44100
        fftl = self.fftl
        
        for (stream, data) in streams.items():
            put_speech(data, '/tmp/tmp.%s' % stream)
            comm = bin_dir + "/x2x +fd /tmp/tmp." + stream + " >/tmp/tmp_d." + stream
            print(comm)
            os.system(comm)


        comm = "%s/mgc2sp -a %s -g 0 -m %s -l %s -o 2 /tmp/tmp.mgc | %s/sopr -d 32768.0 -P | %s/x2x +fd -o > /tmp/tmp.spec"%(bin_dir, alpha, order, fftl, bin_dir, bin_dir)
        print(comm)
        os.system(comm)
    
        ## The -o flag to x2x avoids:
        ##   x2x : error: input data is over the range of type 'double'!
        ## (-o clips by the minimum and maximum of the output data type
        ##  if the input data is over the range of that type.)

        comm = "%s/synth %s %s /tmp/tmp_d.lf0 /tmp/tmp.spec /tmp/tmp_d.bap /tmp/tmp.resyn.wav"%(bin_dir, fftl, sr)
        print(comm)
        os.system(comm)

        os.system("mv /tmp/tmp.resyn.wav " + outfile)
        print('Produced %s' % outfile)
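
For reference, a sketch of how world_resynth could be driven directly, assuming an object `model` that exposes the attributes read above (hts_dir, alpha, mcep_order, sample_rate, fftl); the arrays are illustrative numpy matrices of shape (frames, dims), and note that the 'lf0' stream is expected to hold linear F0 by the time WORLD's synth is run (example #2 converts it before calling):

    streams = {
        'mgc': mgc,     ## (frames, mcep_order + 1) mel-cepstra
        'lf0': fzero,   ## (frames, 1) F0 in Hz, zeroed in unvoiced frames
        'bap': bap,     ## (frames, ap_dim) band aperiodicity
    }
    model.world_resynth(streams, '/tmp/resynthesised.wav')
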
Example #4
    def process_utterance(self, utt):

        ## If there is no waveform attached to the utt, don't do anything:
        if not utt.has_attribute("waveform"):
            return

        ## Add some data to the utt structure recording the structure of the
        ## associated acoustic features we've produced. Do this first, in case
        ## we use existing features.
        self.stream_sizes[1] = '1'  ## otherwise '1 1 1' for F0    TODO: fix this nicely!
        utt.add_acoustic_stream_info(self.feats, self.stream_sizes)

        ## If a feature file already exists, skip:
        if utt.has_external_data(self.output_filetype):
            ##  TODO: check description against existing feats?
            return

        ## else extract features
        infile = utt.get("waveform")
        outfile = utt.get_filename(self.output_filetype)

        ## strip the suffix (e.g. '.cmp'):
        assert outfile.endswith('.' + self.output_filetype)
        chars_to_strip = len(self.output_filetype) + 1
        outstem = outfile[:-chars_to_strip]

        rate = self.rate
        alpha = self.alpha
        order = self.order
        fftl = self.fftl
        apsize = self.apsize
        frameshift_ms = self.frameshift_ms

        script_dir = self.voice_resources.path[c.SCRIPT]

        ## 1) remove wave header, downsample etc. with sox:
        comm = "sox -t wav " + infile
        comm += " -c 1 -e signed-integer "
        comm += " -r %s" % (rate)
        comm += " -b 16 "
        comm += " " + outstem + ".wav"
        comm += " dither"  ## added for hi and rj data blizz 2014
        success = os.system(comm)
        if success != 0:
            print('sox failed on utterance ' + utt.get("utterance_name"))
            return

        comm = "%s/analysis %s.wav %s.f0.double %s.sp.double %s.bap.double > %s.log" % (
            self.tool, outstem, outstem, outstem, outstem, outstem)
        success = os.system(comm)  # This command is very slow
        # print comm
        if success != 0:
            print('world analysis failed on utterance ' + utt.get("utterance_name"))
            return

        if self.resynthesise_training_data:
            ## resynthesis to test
            comm = "%s/synth %s %s %s.f0.double %s.sp.double %s.bap.double %s.resyn.wav > %s.log" % (
                self.tool, fftl, rate, outstem, outstem, outstem, outstem,
                outstem)
            success = os.system(comm)
            if success != 0:
                print('world synthesis failed on utterance ' + utt.get("utterance_name"))
                return

        comm = "%s/x2x +df %s.sp.double | %s/sopr -R -m 32768.0 | %s/mcep -a %s -m %s -l %s -j 0 -f 0.0 -q 3 > %s.mgc" % (
            self.tool, outstem, self.tool, self.tool, alpha, order, fftl,
            outstem)
        ## -e 1.0E-8
        success = os.system(comm)  # This command is very slow
        if success != 0:
            print('conversion of world spectrum to mel cepstra failed on utterance ' + utt.get("utterance_name"))
            return

        for stream in ['bap']:
            comm = "%s/x2x +df %s.%s.double > %s.%s" % (
                self.tool, outstem, stream, outstem, stream)
            success = os.system(comm)
            if success != 0:
                print('double -> float conversion (stream: ' + stream + ') failed on utterance ' + utt.get("utterance_name"))
                return

        for stream in ['f0']:
            comm = "%s/x2x +da %s.%s.double > %s.%s.txt" % (
                self.tool, outstem, stream, outstem, stream)
            success = os.system(comm)
            if success != 0:
                print('double -> ascii conversion (stream: ' + stream + ') failed on utterance ' + utt.get("utterance_name"))
                return

        ## 5) F0 conversion:
        f0 = [float(val) for val in readlist(outstem + '.f0.txt')]
        ## HTS/SPTK convention: mark unvoiced frames with a large negative log-F0:
        log_f0 = []
        for val in f0:
            if val == 0.0:
                log_f0.append('-1.0E10')
            else:
                log_f0.append(math.log(val))
        writelist(log_f0, outstem + '.f0.log')

        comm = "%s/x2x +af %s.f0.log > %s.lf0" % (self.tool, outstem, outstem)
        success = os.system(comm)
        if success != 0:
            print('writing log f0 failed on utterance ' + utt.get("utterance_name"))
            return

        ## add mcep/ap/f0 deltas:
        for (stream, dimen) in [('mgc', order + 1), ('bap', apsize),
                                ('lf0', 1)]:
            comm = "perl %s/window.pl %s " % (script_dir, dimen)
            comm += "%s.%s %s > %s.%s.delta" % (outstem, stream, ' '.join(
                self.winfiles), outstem, stream)
            success = os.system(comm)  # This command is very slow
            if success != 0:
                print('delta (' + stream + ') extraction failed on utterance ' + utt.get("utterance_name"))
                return

        ### combine the streams:
        ap = get_speech(outstem + '.bap.delta', apsize * len(self.winfiles))
        mgc = get_speech(outstem + '.mgc.delta',
                         (order + 1) * len(self.winfiles))
        lf0 = get_speech(outstem + '.lf0.delta', 1 * len(self.winfiles))
        cmp = numpy.hstack([mgc, lf0, ap])
        put_speech(cmp, outfile)

        ## 7) add header
        floats_per_frame = (order + 2 + apsize) * len(
            self.winfiles)  ## +2 for energy and F0
        add_htk_header(outfile, floats_per_frame, frameshift_ms)

        ## 8) tidy:
        self.extensions_to_keep = ['.' + self.output_filetype,
                                   '.f0.txt']  ## TODO: make configurable?
        self.extensions_to_keep.append('.resyn.wav')
        self.extensions_to_keep.extend(['.mgc', '.bap', '.lf0'])

        keepfiles = [outstem + ending for ending in self.extensions_to_keep]

        for junk in glob.glob(outstem + '.*'):
            if junk not in keepfiles:
                os.remove(junk)
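
A quick sanity check of the per-frame dimensionality used for the HTK header above. The numbers are illustrative assumptions (not taken from any particular configuration); the identity itself just restates how the cmp file is assembled from the three delta-expanded streams:

    order, apsize, n_windows = 59, 5, 3                  ## hypothetical config values
    mgc_dim = (order + 1) * n_windows                    ## 180: mel-cepstra (incl. energy) + deltas
    lf0_dim = 1 * n_windows                              ##   3: log-F0 + deltas
    bap_dim = apsize * n_windows                         ##  15: band aperiodicity + deltas
    floats_per_frame = (order + 2 + apsize) * n_windows  ## 198, as computed in the code
    assert floats_per_frame == mgc_dim + lf0_dim + bap_dim
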
Example #5
    def process_utterance(self, utt):
        from numpy import loadtxt, savetxt, exp
        import numpy as np   ## np.zeros / np.shape are used below
        if utt.has_attribute("waveform"):
            #print "Utt has a natural waveform -- don't synthesise"
            return



        if not self.trained:
            print('WARNING: Cannot apply processor %s till model is trained' % self.processor_name)
            return
        
        #self.postfilter_coeff = self.postfilter_coeff
        #self.scale_var = self.config.get('scale_var','n')
        #self.speech_rate = float(self.config.get('speech_rate',1.0))
        
        self.model_dir = self.get_location()
        bin_dir = self.voice_resources.path[c.BIN]
        
        label = utt.get_filename(self.input_label_filetype) 
        owave = utt.get_filename(self.output_filetype)
        
        # generate parameters with hts_engine, one stream at a time
        feats = self.stream_definitions["STREAM_NAMES"].split()
        os.system('mkdir -p ./tmp')
        for f in feats:
            comm = self.hts_dir + '/hts_engine '
            comm += "  -td %s/tree-duration.inf "%(self.model_dir)
            comm += "  -md %s/duration.pdf "%(self.model_dir)

            comm += "  -tf %s/tree-lf0.inf "%(self.model_dir)
            comm += "  -mf %s/lf0.pdf "%(self.model_dir)
        
            comm += "  -tm %s/tree-%s.inf "%(self.model_dir, f)
            comm += "  -mm %s/%s.pdf "%(self.model_dir,f)

            comm += "  -ow ./tmp/tmp.wav"        
            comm += "  -om ./tmp/tmp.%s"%(f)    
            comm += "  -of ./tmp/tmp.lf0"    
            
            ## windows:
            for stream in ['f', 'm']:
                for winfile in self.winfiles:
                    comm += "  -d%s %s "%(stream, winfile)
            comm += " -b %s "%(self.postfilter_coeff) ## for postfiltering 
            comm += " -r %s "%(self.speech_rate) 
            #comm += "-r 0.75 "
            #comm += "-p 240 "
            #comm += "-s 48000 "
            comm += "  -u %s "%(self.vuv)
            #comm += "  -u 0.95 "
            #comm += "  -ow %s "%(owave)
            comm += " -ot %s.log "%(label)
            comm += " -od ./tmp/tmp.dur "
            comm += "    %s  "%(label)
        
            print(comm)
            
            os.system(comm)
            if f=='mgc':
                os.system('cp ./tmp/tmp.wav '+'./tmp/hts.wav')

        
        
        ### hack -- overwrite silent frames with pure silence:
        sils = silence_frames_from_trace(label + '.log')
        
        fftl, ap_dim = get_world_fft_and_apdim(self.sample_rate)

        fz = get_speech('./tmp/tmp.lf0', 1)
        mgc = get_speech('./tmp/tmp.mgc', self.speech_coding_config['order'] + 1)  # e.g. 40
        ap = get_speech('./tmp/tmp.bap', ap_dim)

        for (i,val) in enumerate(sils):
            if val == 1:
                mgc[i,:] = 0.0
                fz[i] = -1.0
                ap[i] = 0.0
        
        # variance scaling:
        if self.scale_var != 1.0:
            mgc = scale_variance(mgc, scale_factor=self.scale_var)


        ap = np.zeros(np.shape(ap))   ## zero out the aperiodicity stream entirely
        put_speech(fz, './tmp/tmp.lf0')
        put_speech(mgc, './tmp/tmp.mgc')
        put_speech(ap, './tmp/tmp.bap') 

        # process parameters -- OSW todo wavesynth processor sharing config with extraction
            
        f0 = []
        for f in reversed(feats):
        
            if f == "lf0":
                os.system(bin_dir+"/x2x +fa ./tmp/tmp."+f+" >./tmp/tmp_a."+f)
                
                f0 = loadtxt('./tmp/tmp_a.lf0')
                f0[f0 > 0] = exp(f0[f0 > 0])   ## voiced frames: log-F0 -> Hz
                f0[f0 <= 0] = 0                ## unvoiced frames: F0 = 0
                savetxt("./tmp/tmp_a.f0", f0.astype('float'), fmt='%.8f')
        
                os.system(bin_dir+"/x2x +ad ./tmp/tmp_a.f0 > ./tmp/tmp_a.f0.d")
        
            else:
                os.system(bin_dir+"/x2x +fd ./tmp/tmp."+f+" >./tmp/tmp_d."+f)
            
        


        bin_dir = self.hts_dir  ## the WORLD binaries live here too
        
        
        alpha = self.alpha
        order = self.mcep_order
        sr = self.sample_rate



        '''
        alpha = 0.77
        order = 59
        fftl = 2048
        sr = 48000
        '''

        
        comm = "%s/mgc2sp -a %s -g 0 -m %s -l %s -o 2 ./tmp/tmp.mgc | %s/sopr -d 32768.0 -P | %s/x2x +fd -o > ./tmp/tmp.spec" % (bin_dir, alpha, order, fftl, bin_dir, bin_dir)
        print(comm)
        os.system(comm)
    
        ## The -o flag to x2x avoids:
        ##   x2x : error: input data is over the range of type 'double'!
        ## (-o clips by the minimum and maximum of the output data type
        ##  if the input data is over the range of that type.)

        comm = "%s/synth %s %s ./tmp/tmp_a.f0.d ./tmp/tmp.spec ./tmp/tmp_d.bap ./tmp/tmp.resyn.wav"%(bin, fftl, sr)
        print comm
        os.system(comm)
        os.system("mv ./tmp/tmp.resyn.wav "+owave)