def generate_wav(gen_dir, file_id_list, cfg):
    logger = logging.getLogger("wav_generation")

    SPTK = cfg.SPTK
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD  ## to be moved

    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            fw_coef = erb_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()
        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:
        logger.info('creating waveform for %4d of %4d: %s' % (counter, max_counter, filename))
        counter += 1

        base = filename
        files = {'sp': base + cfg.sp_ext,
                 'mgc': base + cfg.mgc_ext,
                 'f0': base + '.f0',
                 'lf0': base + cfg.lf0_ext,
                 'ap': base + '.ap',
                 'bap': base + cfg.bap_ext,
                 'shape': base + cfg.shape_ext,
                 'texture': base + cfg.texture_ext,
                 'wav': base + '.wav'}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering and cfg.audio:
            ## build the liftering weight vector: 1 1 pf pf ... pf
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "

            run_process('{line} | {x2x} +af > {weight}'
                        .format(line=line, x2x=SPTK['X2X'], weight=os.path.join(gen_dir, 'weight')))

            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | {c2acr} -m {co} -M 0 -l {fl} > {base_r0}'
                        .format(freqt=SPTK['FREQT'], order=cfg.mgc_dim - 1, fw=fw_coef, co=co_coef,
                                mgc=files['mgc'], c2acr=SPTK['C2ACR'], fl=fl_coef,
                                base_r0=files['mgc'] + '_r0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} | {freqt} -m {order} -a {fw} -M {co} -A 0 | {c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim - 1, mgc=files['mgc'],
                                weight=os.path.join(gen_dir, 'weight'), freqt=SPTK['FREQT'], fw=fw_coef,
                                co=co_coef, c2acr=SPTK['C2ACR'], fl=fl_coef,
                                base_p_r0=files['mgc'] + '_p_r0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 0 -e 0 > {base_b0}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim - 1, mgc=files['mgc'],
                                weight=os.path.join(gen_dir, 'weight'), mc2b=SPTK['MC2B'], fw=fw_coef,
                                bcp=SPTK['BCP'], base_b0=files['mgc'] + '_b0'))

            run_process('{vopr} -d < {base_r0} {base_p_r0} | {sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'
                        .format(vopr=SPTK['VOPR'], base_r0=files['mgc'] + '_r0',
                                base_p_r0=files['mgc'] + '_p_r0', sopr=SPTK['SOPR'],
                                base_b0=files['mgc'] + '_b0', base_p_b0=files['mgc'] + '_p_b0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 1 -e {order} | {merge} -n {order2} -s 0 -N 0 {base_p_b0} | {b2mc} -m {order} -a {fw} > {base_p_mgc}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim - 1, mgc=files['mgc'],
                                weight=os.path.join(gen_dir, 'weight'), mc2b=SPTK['MC2B'], fw=fw_coef,
                                bcp=SPTK['BCP'], merge=SPTK['MERGE'], order2=cfg.mgc_dim - 2,
                                base_p_b0=files['mgc'] + '_p_b0', b2mc=SPTK['B2MC'],
                                base_p_mgc=files['mgc'] + '_p_mgc'))

            mgc_file_name = files['mgc'] + '_p_mgc'

        if cfg.vocoder_type in ("STRAIGHT", "STRAIGHT_M_TRIAL") and cfg.apply_GV and cfg.audio:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)
            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

            local_gv = (ref_gv_std / gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean
            enhanced_mgc = (np.repeat(local_gv, frame_number, 1).T
                            / np.repeat(gen_std, frame_number, 1).T
                            * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T)
                            + np.repeat(gen_mu, frame_number, 1).T)

            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)
            mgc_file_name = files['mgc'] + '_p_mgc'

        if cfg.do_post_filtering and cfg.apply_GV and cfg.audio:
            logger.critical('Post-filtering and GV enhancement cannot be applied together!\n')
            raise RuntimeError('post-filtering and GV enhancement are mutually exclusive')

        ### mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT' and cfg.audio:
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim - 1,
                                fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'
                        .format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'
                        .format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))

            if cfg.use_cep_ap:
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim - 1,
                                    fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'
                            .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))

            ## number of frames = number of float32 values in the f0 file
            size = os.path.getsize(files['f0']) // 4
            straight_normalization = 1024.0 / (2200.0 * 32768.0)  # kept for reference; the script below divides by 32768.0 instead
            spectralUpdateInterval = 1000.0 * cfg.shift / cfg.sr

            ## write a MATLAB script that drives the STRAIGHT synthesis engine
            synth_straight_file_name = base + '_synth.m'
            with open(synth_straight_file_name, "w") as synth_straight_file:
                synth_straight_file.write("addpath(path,'%s');\n" % cfg.STRAIGHT_DIR)
                synth_straight_file.write("prm.spectralUpdateInterval = %f;\n" % spectralUpdateInterval)
                synth_straight_file.write("prm.levelNormalizationIndicator = 0;\n\n")
                synth_straight_file.write("fprintf(1,'\\nSynthesizing %s\\n');\n" % files['wav'])
                synth_straight_file.write("fid1 = fopen('%s','r','%s');\n" % (files['sp'], "ieee-le"))
                synth_straight_file.write("fid2 = fopen('%s','r','%s');\n" % (files['ap'], "ieee-le"))
                synth_straight_file.write("fid3 = fopen('%s','r','%s');\n" % (files['f0'], "ieee-le"))
                synth_straight_file.write("sp = fread(fid1,[%d, %d],'float');\n" % (cfg.sp_dim, size))
                synth_straight_file.write("ap = fread(fid2,[%d, %d],'float');\n" % (cfg.sp_dim, size))
                synth_straight_file.write("f0 = fread(fid3,[%d, %d],'float');\n" % (1, size))
                synth_straight_file.write("fclose(fid1);\n")
                synth_straight_file.write("fclose(fid2);\n")
                synth_straight_file.write("fclose(fid3);\n")
                synth_straight_file.write("sp = sp/32768.0;\n")  # we use this normalization now
                synth_straight_file.write("[sy] = exstraightsynth(f0,sp,ap,%d,prm);\n" % cfg.sr)
                synth_straight_file.write("wavwrite( sy, %d, '%s');\n\n" % (cfg.sr, files['wav']))
                synth_straight_file.write("quit;\n")
            os.system("%s < %s" % (cfg.MATLAB_COMMAND, synth_straight_file_name))

        if cfg.visual:
            ## generate a talking-head video from the predicted AAM shape/texture parameters
            synth_straight_file_name = base + '_vid_synth.m'

            bytes_per_frame_shape = cfg.shape_dim * 4
            output_name_shape = base + '.shape_h'
            run_process('/usr/bin/perl %s %d %d %d 9 %s > %s'
                        % (cfg.addhtkheader, cfg.sr, cfg.shift, bytes_per_frame_shape,
                           files['shape'], output_name_shape))

            bytes_per_frame_texture = cfg.texture_dim * 4
            output_name_texture = base + '.texture_h'
            run_process('/usr/bin/perl %s %d %d %d 9 %s > %s'
                        % (cfg.addhtkheader, cfg.sr, cfg.shift, bytes_per_frame_texture,
                           files['texture'], output_name_texture))

            with open(synth_straight_file_name, "w") as synth_straight_file:
                synth_straight_file.write("cd %s;\n" % cfg.aam_tools_path)
                synth_straight_file.write("parpool;\n")
                synth_straight_file.write("addpath mex\n")
                synth_straight_file.write("addpath utils\n")
                synth_straight_file.write("params.num_sh = %d;\n" % cfg.shape_dim)
                synth_straight_file.write("params.num_te = %d;\n" % cfg.texture_dim)
                synth_straight_file.write("params.fps = %f;\n" % 29.97)
                synth_straight_file.write("figure('Visible','Off')\n")
                synth_straight_file.write("convert_to_vid('%s', '%s', '%s', '%s', '%s', '%s', params);\n"
                                          % (cfg.aam_model,
                                             os.path.join(gen_dir, output_name_shape),
                                             os.path.join(gen_dir, output_name_texture),
                                             os.path.join(gen_dir, base + '.wav'),
                                             os.path.join(gen_dir, base + '.mkv'),
                                             gen_dir))
                synth_straight_file.write("quit;\n")
            os.system("%s < %s" % (cfg.MATLAB_COMMAND_V, synth_straight_file_name))

        os.chdir(cur_dir)
def dnn_generation(valid_file_list, nnets_file_name, n_ins, n_outs,
                   out_file_list, target_mean_vector, target_std_vector,
                   out_dimension_dict, file_extension_dict, vocoder='straight'):
    logger = logging.getLogger("dnn_generation")
    logger.debug('Starting dnn_generation')

    inf_float = -1.0e+10

    if vocoder == 'straight':
        gen_wav_features = ['mgc', 'lf0', 'bap']
    elif vocoder == 'glotthmm':
        gen_wav_features = ['F0', 'Gain', 'HNR', 'LSF', 'LSFsource']  ## TODO: take this from config
    else:
        sys.exit('unsupported vocoder %s !' % (vocoder))

    stream_start_index = {}
    dimension_index = 0
    for feature_name in list(out_dimension_dict.keys()):
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    with open(nnets_file_name, 'rb') as nnets_file:
        dnn_model = pickle.load(nnets_file)

    file_number = len(valid_file_list)
    io_funcs = BinaryIOCollection()
    mlpg = MLParameterGenerationFast()

    for i in range(file_number):
        logger.info('generating %4d of %4d: %s' % (i + 1, file_number, valid_file_list[i]))

        with open(valid_file_list[i], 'rb') as fid_lab:
            features = numpy.fromfile(fid_lab, dtype=numpy.float32)
        features = features[:(n_ins * (features.size // n_ins))]
        features = features.reshape((-1, n_ins))
        frame_number = features.shape[0]

        test_set_x = theano.shared(numpy.asarray(features, dtype=theano.config.floatX))
        mean_matrix = numpy.tile(target_mean_vector, (features.shape[0], 1))
        std_matrix = numpy.tile(target_std_vector, (features.shape[0], 1))

        ## pick, per frame, the mean/sigma slice of the most probable mixture component
        predicted_mix = dnn_model.parameter_prediction_mix(test_set_x=test_set_x)
        max_index = numpy.argmax(predicted_mix, axis=1)
        temp_predicted_mu = dnn_model.parameter_prediction(test_set_x=test_set_x)
        temp_predicted_sigma = dnn_model.parameter_prediction_sigma(test_set_x=test_set_x)

        predicted_mu = numpy.zeros((temp_predicted_mu.shape[0], n_outs))
        predicted_sigma = numpy.zeros((temp_predicted_sigma.shape[0], n_outs))
        for kk in range(temp_predicted_mu.shape[0]):
            predicted_mu[kk, :] = temp_predicted_mu[kk, max_index[kk] * n_outs:(max_index[kk] + 1) * n_outs]
            predicted_sigma[kk, :] = temp_predicted_sigma[kk, max_index[kk] * n_outs:(max_index[kk] + 1) * n_outs]

        ## de-normalise: means through (mu, std), variances through std**2
        predicted_mu = predicted_mu * std_matrix + mean_matrix
        predicted_sigma = ((predicted_sigma ** 0.5) * std_matrix) ** 2

        dir_name = os.path.dirname(out_file_list[i])
        file_id = os.path.splitext(os.path.basename(out_file_list[i]))[0]

        for feature_name in gen_wav_features:
            current_features = predicted_mu[:, stream_start_index[feature_name]:
                                            stream_start_index[feature_name] + out_dimension_dict[feature_name]]
            current_sigma = predicted_sigma[:, stream_start_index[feature_name]:
                                            stream_start_index[feature_name] + out_dimension_dict[feature_name]]

            gen_features = mlpg.generation(current_features, current_sigma,
                                           out_dimension_dict[feature_name] // 3)

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = predicted_mu[:, stream_start_index['vuv']:stream_start_index['vuv'] + 1]
                    for frame_index in range(frame_number):
                        if vuv_feature[frame_index, 0] < 0.5:
                            gen_features[frame_index, 0] = inf_float

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])
            io_funcs.array_to_binary_file(gen_features, new_file_name)
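# The loop above slices the mean and sigma of the most probable mixture
# component out of the network's flattened MDN output. A small sketch of
# that indexing, with an illustrative component count and dimensionality:
def _mdn_component_selection_demo():
    import numpy as np
    T, n_outs, n_mix = 4, 3, 2
    mix_weights = np.random.rand(T, n_mix)         # stands in for predicted_mix
    flat_mu = np.random.randn(T, n_mix * n_outs)   # stands in for temp_predicted_mu
    max_index = np.argmax(mix_weights, axis=1)
    mu = np.zeros((T, n_outs))
    for t in range(T):
        k = max_index[t]
        mu[t, :] = flat_mu[t, k * n_outs:(k + 1) * n_outs]
    return mu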
def acoustic_decomposition(self, in_file_list, dimension, out_dimension_dict,
                           file_extension_dict, var_file_dict, do_MLPG=True,
                           cfg=None, meta=None):
    logger = logging.getLogger('param_generation')
    logger.debug('acoustic_decomposition for %d files' % len(in_file_list))

    self.load_covariance(var_file_dict, out_dimension_dict)

    stream_start_index = {}
    dimension_index = 0
    for feature_name in sorted(out_dimension_dict.keys()):
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    io_funcs = BinaryIOCollection()
    mlpg_algo = MLParameterGeneration()

    findex = 0
    flen = len(in_file_list)
    for file_name in in_file_list:
        findex += 1
        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]

        features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)
        logger.info('processing %4d of %4d: %s' % (findex, flen, file_name))

        for feature_name in self.gen_wav_features:
            logger.debug(' feature: %s' % feature_name)

            current_features = features[:, stream_start_index[feature_name]:
                                        stream_start_index[feature_name] + out_dimension_dict[feature_name]]
            if FAST_MLPG:
                ### the fast version wants variance per frame, not a single global one:
                var = self.var[feature_name]
                var = numpy.transpose(numpy.tile(var, frame_number))
            else:
                var = self.var[feature_name]

            if feature_name == 'lf0' and meta is not None:
                ## override lf0 with note pitch from the score;
                ## cur_ind starts at a hard-coded offset of 60 frames
                cur_ind = 60
                for syllable in meta:
                    for note in syllable['notes']:
                        current_features[int(cur_ind):int(cur_ind) + int(note[1]), 0] = note[0]
                        cur_ind += note[1]

            if not do_MLPG:
                gen_features = current_features
            else:
                gen_features = mlpg_algo.generation(current_features, var,
                                                    out_dimension_dict[feature_name] // 3)

            logger.debug(' feature dimensions: %d by %d' % (gen_features.shape[0], gen_features.shape[1]))

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = features[:, stream_start_index['vuv']:stream_start_index['vuv'] + 1]
                    for i in range(frame_number):
                        if vuv_feature[i, 0] < 0.5 or gen_features[i, 0] < numpy.log(20):
                            gen_features[i, 0] = self.inf_float

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

            if self.enforce_silence:
                silence_pattern = cfg.silence_pattern
                label_align_dir = cfg.in_label_align_dir
                with open(label_align_dir + '/' + file_id + '.lab', 'r') as in_f:
                    for line in in_f.readlines():
                        line = line.strip()
                        if len(line) < 1:
                            continue
                        temp_list = re.split(r'\s+', line)
                        ## HTK label times are in 100 ns units; convert to 5 ms frames
                        start_time = int(int(temp_list[0]) * (10 ** -4) / 5)
                        end_time = int(int(temp_list[1]) * (10 ** -4) / 5)
                        full_label = temp_list[2]

                        label_binary_flag = self.check_silence_pattern(full_label, silence_pattern)
                        if label_binary_flag:
                            if feature_name in ['lf0', 'F0', 'mag']:
                                gen_features[start_time:end_time, :] = self.inf_float
                            else:
                                gen_features[start_time:end_time, :] = 0.0

            io_funcs.array_to_binary_file(gen_features, new_file_name)
            logger.debug(' wrote to file %s' % new_file_name)
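# The enforce_silence branch above converts HTK label times (100 ns units)
# into 5 ms frame indices via int(t * 10**-4 / 5). A quick sanity sketch of
# that arithmetic on made-up timestamps:
def _htk_time_to_frame_demo():
    start_time, end_time = 0, 3050000              # 0 .. 305 ms in 100 ns units
    to_frame = lambda t: int(int(t) * (10 ** -4) / 5)  # 100 ns -> ms -> 5 ms frames
    assert to_frame(start_time) == 0
    assert to_frame(end_time) == 61                 # 305 ms / 5 ms = 61 frames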
def modify_dur_from_phone_alignment_labels(self, label_file_name,
                                           gen_dur_file_name, gen_lab_file_name):
    logger = logging.getLogger("dur")

    dur_dim = 1
    io_funcs = BinaryIOCollection()
    dur_features, frame_number = io_funcs.load_binary_file_frame(gen_dur_file_name, dur_dim)

    fid = open(label_file_name)
    utt_labels = fid.readlines()
    fid.close()

    label_number = len(utt_labels)
    logger.info('loaded %s, %3d labels' % (label_file_name, label_number))

    out_fid = open(gen_lab_file_name, 'w')

    current_index = 0
    prev_end_time = 0
    for line in utt_labels:
        line = line.strip()
        if len(line) < 1:
            continue
        temp_list = re.split(r'\s+', line)
        if len(temp_list) == 1:
            start_time = 0
            end_time = 3000000  ## hard-coded silence duration (300 ms in 100 ns units)
            full_label = temp_list[0]
        else:
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]

        label_binary_flag = self.check_silence_pattern(full_label)

        if label_binary_flag == 1:
            ## keep the original duration for silence
            current_phone_dur = end_time - start_time
            out_fid.write(str(prev_end_time) + ' ' + str(prev_end_time + current_phone_dur) + ' ' + full_label + '\n')
            prev_end_time = prev_end_time + current_phone_dur
            continue
        else:
            ## predicted duration is in 5 ms frames; convert to 100 ns units
            phone_dur = dur_features[current_index]
            phone_dur = int(phone_dur) * 5 * 10000
            out_fid.write(str(prev_end_time) + ' ' + str(prev_end_time + phone_dur) + ' ' + full_label + '\n')
            prev_end_time = prev_end_time + phone_dur
            current_index += 1

    out_fid.close()

    logger.debug('modified label with predicted duration of %d frames x %d features' % dur_features.shape)
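# modify_dur_from_phone_alignment_labels writes each phone's predicted
# duration back as HTK times: one 5 ms frame = 5 * 10**4 units of 100 ns.
# A sketch of that accumulation with made-up frame counts:
def _frames_to_htk_label_demo():
    durations = [10, 23, 7]                   # predicted frames per phone
    prev_end = 0
    lines = []
    for n_frames in durations:
        phone_dur = int(n_frames) * 5 * 10000
        lines.append('%d %d phone' % (prev_end, prev_end + phone_dur))
        prev_end += phone_dur
    return lines                              # e.g. '0 500000 phone', ...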
def load_next_batch(self): io_funcs = BinaryIOCollection() ## set sequence length for batch training if (self.training_algo == 1): # set seq length to maximum seq length from current batch self.set_seq_length_from_current_batch() elif (self.training_algo == 2): # set seq length to maximum seq length from current bucket while not self.current_bucket_size: self.get_next_bucket() elif (self.training_algo == 3): # seq length is set based on default/user configuration pass temp_set_x = numpy.zeros((self.buffer_size, self.n_ins)) temp_set_y = numpy.zeros((self.buffer_size, self.n_outs)) ### read file by file ### current_index = 0 while True: if current_index >= self.buffer_size: print('buffer size reached by file index %d' % (self.file_index)) break if self.training_algo == 2: # choose utterance from current bucket list base_file_name = self.current_bucket_list[ self.bucket_file_index] self.utt_index = self.file_length_dict['utt2index'][ base_file_name] else: # choose utterance randomly from current file list #self.utt_index = numpy.random.randint(self.list_size) ## choose utterance in serial order self.utt_index = self.file_index base_file_name = os.path.basename( self.x_files_list[self.utt_index]).split('.')[0] in_features, lab_frame_number = io_funcs.load_binary_file_frame( self.x_files_list[self.utt_index], self.n_ins) out_features, out_frame_number = io_funcs.load_binary_file_frame( self.y_files_list[self.utt_index], self.n_outs) frame_number = self.file_length_dict['utt2framenum'][ base_file_name] temp_set_x[current_index:current_index + frame_number, ] = in_features temp_set_y[current_index:current_index + frame_number, ] = out_features current_index += frame_number if ((self.file_index + 1) % self.merge_size == 0): num_of_samples = int( numpy.ceil(float(current_index) / float(self.seq_length))) current_index = self.seq_length * num_of_samples self.file_index += 1 # break for any of the below conditions if self.training_algo == 2: self.bucket_file_index += 1 if (self.bucket_file_index >= self.current_bucket_size): self.current_bucket_size = 0 break if (self.bucket_file_index % self.batch_size == 0): break else: if (self.file_index % self.batch_size == 0) or (self.file_index >= self.list_size): break if self.file_index >= self.list_size: self.end_reading = True self.file_index = 0 num_of_samples = int( numpy.ceil(float(current_index) / float(self.seq_length))) temp_set_x = temp_set_x[0:num_of_samples * self.seq_length, ] temp_set_y = temp_set_y[0:num_of_samples * self.seq_length, ] temp_set_x = temp_set_x.reshape(num_of_samples, self.seq_length, self.n_ins) temp_set_y = temp_set_y.reshape(num_of_samples, self.seq_length, self.n_outs) shared_set_x = self.make_shared(temp_set_x, 'x') shared_set_y = self.make_shared(temp_set_y, 'y') shared_set_xy = (shared_set_x, shared_set_y) return shared_set_xy, temp_set_x, temp_set_y
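# load_next_batch above pads the frame buffer up to a whole number of
# fixed-length sequences and reshapes it to (num_of_samples, seq_length, dim).
# A minimal numpy sketch of that final step (sizes are illustrative):
def _sequence_reshape_demo():
    import numpy as np
    seq_length, n_ins = 10, 4
    buf = np.zeros((100, n_ins))
    current_index = 37                        # frames actually filled
    num_of_samples = int(np.ceil(float(current_index) / float(seq_length)))
    out = buf[0:num_of_samples * seq_length, ].reshape(num_of_samples, seq_length, n_ins)
    assert out.shape == (4, 10, 4)
    return out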
def predict(self, test_x, out_scaler, gen_test_file_list,
            sequential_training=False, stateful=False):
    #### compute predictions ####
    io_funcs = BinaryIOCollection()

    test_id_list = sorted(test_x.keys())
    test_file_number = len(test_id_list)

    print("generating features on held-out test data...")
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(os.path.join(self.ckpt_dir, "mymodel.ckpt.meta"))
        print("loading the model parameters...")
        output_layer = tf.get_collection("output_layer")[0]
        input_layer = tf.get_collection("input_layer")[0]
        new_saver.restore(sess, os.path.join(self.ckpt_dir, "mymodel.ckpt"))
        print("The model parameters are successfully restored")

        for utt_index in range(test_file_number):
            gen_test_file_name = gen_test_file_list[utt_index]
            temp_test_x = test_x[test_id_list[utt_index]]
            num_of_rows = temp_test_x.shape[0]

            if not sequential_training:
                is_training_batch = tf.get_collection("is_training_batch")[0]
                if self.dropout_rate != 0.0:
                    is_training_drop = tf.get_collection("is_training_drop")[0]
                    y_predict = sess.run(output_layer,
                                         feed_dict={input_layer: temp_test_x,
                                                    is_training_drop: False,
                                                    is_training_batch: False})
                else:
                    y_predict = sess.run(output_layer,
                                         feed_dict={input_layer: temp_test_x,
                                                    is_training_batch: False})
            else:
                temp_test_x = np.reshape(temp_test_x, [1, num_of_rows, self.n_in])
                hybrid = 0
                utt_length_placeholder = tf.get_collection("utt_length")[0]
                if "tanh" in self.hidden_layer_type:
                    hybrid = 1
                    is_training_batch = tf.get_collection("is_training_batch")[0]
                if self.dropout_rate != 0.0:
                    is_training_drop = tf.get_collection("is_training_drop")[0]
                    if hybrid:
                        y_predict = sess.run(output_layer,
                                             feed_dict={input_layer: temp_test_x,
                                                        utt_length_placeholder: [num_of_rows],
                                                        is_training_drop: False,
                                                        is_training_batch: False})
                    else:
                        y_predict = sess.run(output_layer,
                                             feed_dict={input_layer: temp_test_x,
                                                        utt_length_placeholder: [num_of_rows],
                                                        is_training_drop: False})
                elif hybrid:
                    y_predict = sess.run(output_layer,
                                         feed_dict={input_layer: temp_test_x,
                                                    utt_length_placeholder: [num_of_rows],
                                                    is_training_batch: False})
                else:
                    y_predict = sess.run(output_layer,
                                         feed_dict={input_layer: temp_test_x,
                                                    utt_length_placeholder: [num_of_rows]})

            data_utils.denorm_data(y_predict, out_scaler)
            io_funcs.array_to_binary_file(y_predict, gen_test_file_name)
            data_utils.drawProgressBar(utt_index + 1, test_file_number)
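# data_utils.denorm_data above is assumed to invert the output-side
# mean/variance normalisation before the features are written out. A sketch
# of that inverse transform, with a hypothetical (mean, std) scaler pair:
def _denorm_demo():
    import numpy as np
    y_norm = np.random.randn(8, 3)                        # normalised predictions
    out_mean, out_std = np.zeros(3), 2.0 * np.ones(3)     # hypothetical scaler
    y = y_norm * out_std + out_mean                       # back to feature space
    return y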
def extract_base_features(self, feat_dir_path, feat_switch, list_of_files,
                          decomposition_unit, unit_dim):
    ### load Binary module ###
    io_funcs = BinaryIOCollection()
    htsclass = readHTSlabelFile()

    ### read file by file ###
    for i in range(len(list_of_files)):
        filename = list_of_files[i]
        print(filename)

        binary_label_dir = feat_dir_path['input_binary']
        label_align_dir = feat_dir_path['input_labfile']
        txt_dir = feat_dir_path['input_txt']
        out_feat_dir = feat_dir_path['output_feat']

        in_filename = os.path.join(binary_label_dir, filename + '.lab')
        in_lab_file = os.path.join(label_align_dir, filename + '.lab')
        in_txt_file = os.path.join(txt_dir, filename + '.txt')
        out_filename = os.path.join(out_feat_dir, filename + '.lab')

        word_embed_list = []
        binary_feat_list = []
        identity_vec_list = []
        dur_feat_list = []
        dur_list = []

        ### read text file ###
        if feat_switch['wordEmbed']:
            ip1 = open(in_txt_file, 'r')
            text_Data = ip1.readlines()
            ip1.close()

            norm_text = self.format_text(text_Data[0].strip())
            norm_text = norm_text.replace('OUF', 'O U F')
            norm_text = norm_text.replace('Mmm', 'M m m')
            norm_text = norm_text.replace('USA', 'U S A')
            list_of_words = norm_text.split()

        ### read label file ###
        [phone, st_arr, ph_arr, mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
        file_len = len(phone)

        ### read binary label file ###
        features = io_funcs.load_binary_file(in_filename, 1)

        ### take non-silence region ###
        ## HTK label times are in 100 ns units; one 5 ms frame = 5 * 10**4 units
        ph_start = int(ph_arr[0][1] / (np.power(10, 4) * 5))
        ph_end = int(ph_arr[1][file_len - 2] / (np.power(10, 4) * 5))

        ### extract duration features ###
        frame_feat_list = features.reshape(len(features) // unit_dim['frame'], unit_dim['frame'])
        frame_feat_list = frame_feat_list[ph_start:ph_end, :]
        dur_feat_list = frame_feat_list[:, -9:]

        ### initialise common variables ###
        num_of_frames = 0

        ### initialise syllable variables ###
        syl_num_of_frames = 0
        wc = 0
        phinsyl = 0
        syl_identity = self.zeros(300, 1)
        syl = ''

        j = 0
        while j < file_len:
            #### ignore silence ####
            if phone[j] == '#' or phone[j] == 'pau':
                j += 1
                continue

            ### extract boundaries of phone ###
            ph_start = int(ph_arr[0][j] / (np.power(10, 4) * 5))
            ph_end = int(ph_arr[1][j] / (np.power(10, 4) * 5))
            num_of_frames = int(sum(st_arr[j][:] / (np.power(10, 4) * 5)))
            mid_frame = (ph_start + ph_end) // 2

            ### syllable ending information ###
            syl_end = 0
            if (mean_f0_arr[j + 1][3] - mean_f0_arr[j][3]) != 0:
                syl_end = 1

            ### word ending information ###
            word_end = 0
            if (mean_f0_arr[j + 1][5] - mean_f0_arr[j][5]) != 0:
                word_end = 1

            ### syllable duration ###
            syl_num_of_frames += num_of_frames

            ### extract binary phone-level features ###
            st_indx = unit_dim['frame'] * mid_frame
            mid_frame_feat = features[st_indx:st_indx + 592]
            mid_frame_feat = np.reshape(mid_frame_feat, len(mid_frame_feat))

            ### word embedding features ###
            if feat_switch['wordEmbed']:
                ### word embeddings for syllable ###
                word = list_of_words[wc]
                if word_end and phone[j] != 'pau':
                    wc += 1
                if phone[j] == 'pau':
                    word_vec = self.wrd_embeds['*UNKNOWN*']
                elif word in self.wrd_embeds:
                    word_vec = self.wrd_embeds[word]
                elif word.lower() in self.wrd_embeds:
                    word_vec = self.wrd_embeds[word.lower()]
                else:
                    word_vec = self.wrd_embeds['*UNKNOWN*']

            ### identity features ###
            if feat_switch['identity']:
                ### phone identity features ###
                ph_identity = mid_frame_feat[99:148]
                if decomposition_unit == 'syllable':
                    ### syllable identity features
                    st_indx = phinsyl * 50
                    syl_identity[st_indx:st_indx + 49] = ph_identity
                    syl = syl + phone[j]
                    ### to make nucleus centre ###
                    # if phone[j] in self.vlist:
                    #     vow_index = phinsyl
                    ### if silence is allowed ###
                    # if phone[j] == '#':
                    #     syl_identity[(phinsyl+1)*50-1] = 1
                    phinsyl += 1

            #### select features depending on decomposition unit ####

            ### frame-level features ###
            if decomposition_unit == 'frame':
                ### duration features for phone ###
                dur_list.append(num_of_frames)

                ### frame level binary features ###
                if feat_switch['binary'] and j + 2 == file_len:
                    ### load normalisation statistics ###
                    label_norm_float_file = os.path.join(binary_label_dir, '../label_norm_float_HTS.dat')
                    fid = open(label_norm_float_file, 'r')
                    arr12 = [float(x.strip()) for x in fid.readlines()]
                    fid.close()

                    min_vector = np.array(arr12[0:len(arr12) // 2])
                    max_vector = np.array(arr12[len(arr12) // 2:len(arr12)])
                    max_range_vector = max_vector - min_vector
                    max_range_vector[max_range_vector == 0] = 1

                    ### normalise features into [0.01, 0.99] ###
                    nrows = len(frame_feat_list)
                    for x in range(nrows):
                        norm_frame_feat = (frame_feat_list[x, :] - min_vector) / max_range_vector * 0.98 + 0.01
                        norm_frame_vec = ' '.join(map(str, norm_frame_feat[:]))
                        binary_feat_list.append(norm_frame_vec)

                ### embedding features ###
                if feat_switch['wordEmbed']:
                    for x in range(num_of_frames):
                        word_embed_list.append(word_vec)

            ### phone-level features ###
            if decomposition_unit == 'phone':
                ### duration features for phone ###
                dur_list.append(num_of_frames)

                ### phone level binary features ###
                if feat_switch['binary']:
                    norm_ph_feat = [0.99 if x == 1 else 0.01 for x in mid_frame_feat]
                    norm_ph_vec = ' '.join(map(str, norm_ph_feat[:]))
                    binary_feat_list.append(norm_ph_vec)

                ### embedding features ###
                if feat_switch['wordEmbed']:
                    word_embed_list.append(word_vec)

                ### phone-identity features ###
                if feat_switch['identity']:
                    extra_ph = 1 if phone[j] == 'o~' else 0
                    ph_identity = np.append(ph_identity, extra_ph)
                    norm_ph_identity = [int(x) for x in ph_identity]
                    norm_ph_identity_vec = ' '.join(map(str, norm_ph_identity[:]))
                    identity_vec_list.append(norm_ph_identity_vec)

            ### syllable level features ###
            if decomposition_unit == 'syllable' and syl_end:
                ### duration features for syllable ###
                dur_list.append(syl_num_of_frames)

                ### syllable and above level binary features ###
                if feat_switch['binary']:
                    syl_feat = []
                    for x in range(len(mid_frame_feat)):
                        if x < 348 or (x >= 405 and x < 421):
                            continue
                        syl_feat.append(mid_frame_feat[x])
                    norm_syl_feat = [0.99 if x == 1 else 0.01 for x in syl_feat]
                    norm_syl_vec = ' '.join(map(str, norm_syl_feat[:]))
                    binary_feat_list.append(norm_syl_vec)

                if feat_switch['wordEmbed']:
                    word_embed_list.append(word_vec)

                ### syllable-identity features ###
                if feat_switch['identity']:
                    ### to make nucleus centre ###
                    # if vow_index <= 1:
                    #     syl_identity = np.roll(syl_identity, 50*(vow_index+1))
                    norm_syl_identity = [0.99 if x == 1 else 0.01 for x in syl_identity]
                    norm_syl_identity_vec = ' '.join(map(str, norm_syl_identity[:]))
                    identity_vec_list.append(norm_syl_identity_vec)

                ### reset syllable information ###
                phinsyl = 0
                syl = ''
                syl_num_of_frames = 0
                syl_identity = self.zeros(300, 1)

            j += 1

        ### default vectors to use ###
        if feat_switch['identity'] and decomposition_unit == 'syllable':
            syl_identity = self.zeros(300, 1)
            norm_syl_identity = [0.99 if x == 1 else 0.01 for x in syl_identity]
            norm_syl_identity_vec = ' '.join(map(str, norm_syl_identity[:]))
        if feat_switch['wordEmbed']:
            word_vec = self.wrd_embeds['*UNKNOWN*']

        ### writing features to output file ###
        op1 = open(out_filename, 'w')
        num_of_vectors = max(len(binary_feat_list), len(identity_vec_list), len(word_embed_list))
        for x in range(num_of_vectors):
            ### initialise feat vector ###
            feat_vec = ''

            ### binary features ###
            if feat_switch['binary']:
                feat_vec = feat_vec + binary_feat_list[x] + ' '

            ### word embeddings (context window of +/-1 when switch >= 3) ###
            if feat_switch['wordEmbed']:
                if feat_switch['wordEmbed'] >= 3:
                    if x - 1 < 0:
                        feat_vec = feat_vec + word_vec + ' '
                    else:
                        feat_vec = feat_vec + word_embed_list[x - 1] + ' '
                feat_vec = feat_vec + word_embed_list[x] + ' '
                if feat_switch['wordEmbed'] >= 3:
                    if x + 1 >= len(binary_feat_list):
                        feat_vec = feat_vec + word_vec + ' '
                    else:
                        feat_vec = feat_vec + word_embed_list[x + 1] + ' '

            ### identity features (context window of +/-1 or +/-2) ###
            if feat_switch['identity']:
                if feat_switch['identity'] >= 5:
                    if x - 2 < 0:
                        feat_vec = feat_vec + norm_syl_identity_vec + ' '
                    else:
                        feat_vec = feat_vec + identity_vec_list[x - 2] + ' '
                if feat_switch['identity'] >= 3:
                    if x - 1 < 0:
                        feat_vec = feat_vec + norm_syl_identity_vec + ' '
                    else:
                        feat_vec = feat_vec + identity_vec_list[x - 1] + ' '
                feat_vec = feat_vec + identity_vec_list[x] + ' '
                if feat_switch['identity'] >= 3:
                    if x + 1 >= len(binary_feat_list):
                        feat_vec = feat_vec + norm_syl_identity_vec + ' '
                    else:
                        feat_vec = feat_vec + identity_vec_list[x + 1] + ' '
                if feat_switch['identity'] >= 5:
                    if x + 2 >= len(binary_feat_list):
                        feat_vec = feat_vec + norm_syl_identity_vec + ' '
                    else:
                        feat_vec = feat_vec + identity_vec_list[x + 2] + ' '

            op1.write(feat_vec + '\n')
        op1.close()
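# Two normalisations appear in extract_base_features above: binary features
# are squashed to {0.01, 0.99}, and frame features are min-max scaled into
# roughly [0.01, 0.99] via (x - min) / (max - min) * 0.98 + 0.01. A compact
# sketch of both on toy values:
def _feature_normalisation_demo():
    import numpy as np
    binary = np.array([0, 1, 1, 0])
    squashed = [0.99 if x == 1 else 0.01 for x in binary]

    frame = np.array([3.0, 5.0, 4.0])
    min_v, max_v = frame.min(), frame.max()
    rng = max_v - min_v
    if rng == 0:
        rng = 1.0                              # guard zero ranges, as above
    scaled = (frame - min_v) / rng * 0.98 + 0.01   # -> [0.01, 0.99]
    return squashed, scaled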
def wavgen_straight_type_vocoder(gen_dir, file_id_list, cfg, logger):
    '''
    Waveform generation with STRAIGHT or WORLD vocoders
    (whose acoustic parameters are: mgc, bap, and lf0).
    '''
    SPTK = cfg.SPTK
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD  # to be moved

    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            fw_coef = erb_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()
        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:
        logger.info('creating waveform for %4d of %4d: %s' % (counter, max_counter, filename))
        counter += 1

        base = filename
        files = {'sp': base + cfg.sp_ext,
                 'mgc': base + cfg.mgc_ext,
                 'f0': base + '.f0',
                 'lf0': base + cfg.lf0_ext,
                 'ap': base + '.ap',
                 'bap': base + cfg.bap_ext,
                 'wav': base + '.wav'}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        # post-filtering
        if cfg.do_post_filtering:
            mgc_file_name = files['mgc'] + '_p_mgc'
            post_filter(files['mgc'], mgc_file_name, cfg.mgc_dim, pf_coef,
                        fw_coef, co_coef, fl_coef, gen_dir, cfg)

        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)
            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

            local_gv = (ref_gv_std / gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean
            enhanced_mgc = (np.repeat(local_gv, frame_number, 1).T
                            / np.repeat(gen_std, frame_number, 1).T
                            * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T)
                            + np.repeat(gen_mu, frame_number, 1).T)

            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)
            mgc_file_name = files['mgc'] + '_p_mgc'

        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Post-filtering and GV enhancement cannot be applied together!\n')

        # mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim - 1,
                                fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'
                        .format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'
                        .format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))

            if cfg.use_cep_ap:
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim - 1,
                                    fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'
                            .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))

            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 1.2 -cornf 4000 -float -apfile {ap} {f0a} {sp} {wav}'
                        .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift,
                                ap=files['ap'], f0a=files['f0'] + '.a', sp=files['sp'], wav=files['wav']))

            run_process('rm -f {sp} {f0} {f0a} {ap}'
                        .format(sp=files['sp'], f0=files['f0'], f0a=files['f0'] + '.a', ap=files['ap']))

        elif cfg.vocoder_type == 'WORLD':
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'
                        .format(sopr=SPTK['SOPR'], lf0=files['lf0'], x2x=SPTK['X2X'], f0=files['f0']))
            run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'
                        .format(sopr=SPTK['SOPR'], bap=files['bap'], x2x=SPTK['X2X'], ap=files['ap']))

            # If using world v2, please comment the above line and uncomment this:
            # run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #             .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl,
            #                     bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim - 1, fl=cfg.fl,
                                mgc=mgc_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], sp=files['sp']))

            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'
                        .format(synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr,
                                f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))

            run_process('rm -f {ap} {sp} {f0}'
                        .format(ap=files['ap'], sp=files['sp'], f0=files['f0']))

        elif cfg.vocoder_type == 'WORLD_PY':
            logger.info("generate speech with pyworld, sampling rate is {0}".format(cfg.sr))

            # lf0 -> f0: unvoiced frames carry the -1.0E+10 magic value
            lf0 = read_binfile(files['lf0'], dim=1, dtype=np.float32)
            zeros_index = np.where(lf0 == -1E+10)
            nonzeros_index = np.where(lf0 != -1E+10)
            f0 = lf0.copy()
            f0[zeros_index] = 0
            f0[nonzeros_index] = np.exp(lf0[nonzeros_index])
            f0 = f0.astype(np.float64)

            if cfg.sr == 16000:
                bap_dim = 1
            elif cfg.sr == 48000:
                bap_dim = 5
            else:
                raise ValueError('unsupported sampling rate %d for WORLD_PY' % cfg.sr)

            bap = read_binfile(bap_file_name, dim=bap_dim, dtype=np.float32)
            ap = pyworld.decode_aperiodicity(bap.astype(np.float64).reshape(-1, bap_dim), cfg.sr, cfg.fl)

            mc = read_binfile(mgc_file_name, dim=cfg.mgc_dim, dtype=np.float32)
            alpha = pysptk.util.mcepalpha(cfg.sr)
            sp = pysptk.mc2sp(mc.astype(np.float64), fftlen=cfg.fl, alpha=alpha)

            wav = pyworld.synthesize(f0, sp, ap, cfg.sr, 5)  # 5 ms frame shift
            # peak-normalise and convert to 16-bit PCM
            x2 = wav / np.max(np.abs(wav)) * 32767
            x2 = x2.astype(np.int16)
            scipy.io.wavfile.write(files['wav'], cfg.sr, x2)

        os.chdir(cur_dir)
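# For reference, a minimal pyworld analysis/synthesis round trip, the same
# API family the WORLD_PY branch above drives from stored features (requires
# the pyworld package; the signal here is synthetic):
def _pyworld_roundtrip_demo():
    import numpy as np
    import pyworld
    fs = 16000
    x = np.sin(2 * np.pi * 220 * np.arange(fs) / fs).astype(np.float64)
    f0, t = pyworld.dio(x, fs)                   # raw pitch track
    sp = pyworld.cheaptrick(x, f0, t, fs)        # smoothed spectrogram
    ap = pyworld.d4c(x, f0, t, fs)               # aperiodicity
    y = pyworld.synthesize(f0, sp, ap, fs, 5.0)  # 5 ms frame shift, as above
    return y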
def compute_distortion(self, file_id_list, reference_dir, generation_dir, file_ext, feature_dim):
    total_voiced_frame_number = 0
    distortion = 0.0
    vuv_error = 0
    total_frame_number = 0

    io_funcs = BinaryIOCollection()

    ref_all_files_data = numpy.reshape(numpy.array([]), (-1, 1))
    gen_all_files_data = numpy.reshape(numpy.array([]), (-1, 1))

    for file_id in file_id_list:
        ref_file_name = reference_dir + '/' + file_id + file_ext
        gen_file_name = generation_dir + '/' + file_id + file_ext

        ref_data, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, feature_dim)
        gen_data, gen_frame_number = io_funcs.load_binary_file_frame(gen_file_name, feature_dim)

        # accept a difference of up to two frames
        if abs(ref_frame_number - gen_frame_number) <= 2:
            frame_number = min(ref_frame_number, gen_frame_number)
            ref_frame_number = frame_number
            gen_frame_number = frame_number
            ref_data = ref_data[0:ref_frame_number, ]
            gen_data = gen_data[0:gen_frame_number, ]

        if ref_frame_number != gen_frame_number:
            self.logger.critical(
                "The number of frames is not the same: %d vs %d (%s). Error in compute_distortion.py.\n"
                % (ref_frame_number, gen_frame_number, file_id))
            raise ValueError('frame number mismatch between reference and generated features')

        if file_ext == '.lf0':
            ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
            gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
            temp_distortion, temp_vuv_error, voiced_frame_number = self.compute_f0_mse(ref_data, gen_data)
            vuv_error += temp_vuv_error
            total_voiced_frame_number += voiced_frame_number
        elif file_ext == '.dur':
            ref_data = numpy.reshape(numpy.sum(ref_data, axis=1), (-1, 1))
            gen_data = numpy.reshape(numpy.sum(gen_data, axis=1), (-1, 1))
            ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
            gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
            continue
        elif file_ext == '.mgc':
            # exclude the energy coefficient (dimension 0) from the distortion
            temp_distortion = self.compute_mse(ref_data[:, 1:feature_dim], gen_data[:, 1:feature_dim])
        else:
            temp_distortion = self.compute_mse(ref_data, gen_data)

        distortion += temp_distortion
        total_frame_number += ref_frame_number

    if file_ext == '.dur':
        dur_rmse = self.compute_rmse(ref_all_files_data, gen_all_files_data)
        dur_corr = self.compute_corr(ref_all_files_data, gen_all_files_data)
        return dur_rmse, dur_corr
    elif file_ext == '.lf0':
        distortion /= float(total_voiced_frame_number)
        vuv_error /= float(total_frame_number)
        distortion = numpy.sqrt(distortion)
        f0_corr = self.compute_f0_corr(ref_all_files_data, gen_all_files_data)
        return distortion, f0_corr, vuv_error
    else:
        distortion /= float(total_frame_number)
        return distortion
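# compute_f0_mse (defined elsewhere in this class) is assumed to score only
# frames that are voiced in both reference and generation, and to count V/UV
# mismatches separately. A sketch of that metric on lf0 arrays where unvoiced
# frames are marked with the -1.0e+10 magic value:
def _f0_metric_demo():
    import numpy as np
    INF_FLOAT = -1.0e+10
    ref = np.array([5.0, 5.1, INF_FLOAT, 5.2])
    gen = np.array([5.1, INF_FLOAT, INF_FLOAT, 5.0])
    ref_v, gen_v = ref > INF_FLOAT / 2, gen > INF_FLOAT / 2
    both_voiced = ref_v & gen_v
    sq_err = np.sum((ref[both_voiced] - gen[both_voiced]) ** 2)
    vuv_errors = np.sum(ref_v != gen_v)
    return sq_err, vuv_errors, np.sum(both_voiced)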
def load_labels_with_phone_alignment(self, file_name, dur_file_name):
    logger = logging.getLogger("labels")

    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1  ## hard coded for now
        manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    if self.add_frame_features:
        assert self.dimension == self.dict_size + self.frame_feature_size
    elif self.subphone_feats != 'none':
        assert self.dimension == self.dict_size + self.frame_feature_size
    else:
        assert self.dimension == self.dict_size

    label_feature_matrix = numpy.empty((100000, self.dimension))

    ph_count = 0
    label_feature_index = 0
    fid = open(file_name)
    for line in fid.readlines():
        line = line.strip()
        if len(line) < 1:
            continue
        temp_list = re.split(r'\s+', line)
        start_time = int(temp_list[0])
        end_time = int(temp_list[1])
        full_label = temp_list[2]

        # the frame shift is currently hardwired to 5 msec (50000 units of
        # 100 ns); support for other frame shifts is under beta testing
        if dur_file_name:
            frame_number = int(manual_dur_data[ph_count])
        else:
            frame_number = int((end_time - start_time) / 50000)

        ph_count += 1

        label_binary_vector = self.pattern_matching_binary(full_label)
        # if there is no CQS question, label_continuous_vector will be empty
        label_continuous_vector = self.pattern_matching_continous_position(full_label)
        label_vector = numpy.concatenate([label_binary_vector, label_continuous_vector], axis=1)

        if self.subphone_feats == "coarse_coding":
            cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)

        if self.add_frame_features:
            current_block_binary_array = numpy.zeros((frame_number, self.dict_size + self.frame_feature_size))
            for i in range(frame_number):
                current_block_binary_array[i, 0:self.dict_size] = label_vector

                if self.subphone_feats == 'minimal_phoneme':
                    ## features which distinguish frame position in phoneme
                    # fraction through phone, forwards
                    current_block_binary_array[i, self.dict_size] = float(i + 1) / float(frame_number)
                    # fraction through phone, backwards
                    current_block_binary_array[i, self.dict_size + 1] = float(frame_number - i) / float(frame_number)
                    # phone duration
                    current_block_binary_array[i, self.dict_size + 2] = float(frame_number)
                elif self.subphone_feats == 'coarse_coding':
                    ## frame position in phoneme as three continuous coarse-coding features
                    current_block_binary_array[i, self.dict_size + 0] = cc_feat_matrix[i, 0]
                    current_block_binary_array[i, self.dict_size + 1] = cc_feat_matrix[i, 1]
                    current_block_binary_array[i, self.dict_size + 2] = cc_feat_matrix[i, 2]
                    current_block_binary_array[i, self.dict_size + 3] = float(frame_number)
                elif self.subphone_feats == 'none':
                    pass
                else:
                    sys.exit('unknown subphone_feats type')

            label_feature_matrix[label_feature_index:label_feature_index + frame_number, ] = current_block_binary_array
            label_feature_index = label_feature_index + frame_number
        elif self.subphone_feats == 'none':
            current_block_binary_array = label_vector
            label_feature_matrix[label_feature_index:label_feature_index + 1, ] = current_block_binary_array
            label_feature_index = label_feature_index + 1

    fid.close()

    label_feature_matrix = label_feature_matrix[0:label_feature_index, ]

    logger.info('loaded %s, %3d labels' % (file_name, ph_count))
    logger.debug('made label matrix of %d frames x %d labels' % label_feature_matrix.shape)
    return label_feature_matrix
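# extract_coarse_coding_features_relative (elsewhere in this class) encodes a
# frame's relative position in the phone as three overlapping Gaussian
# "coarse coding" activations. A sketch of that idea; the centres and width
# here are illustrative, not the loader's exact constants:
def _coarse_coding_demo(frame_number=10):
    import numpy as np
    pos = np.arange(frame_number) / float(frame_number)   # 0..1 through the phone
    centres, sigma = np.array([0.0, 0.5, 1.0]), 0.4
    cc = np.exp(-((pos[:, None] - centres[None, :]) ** 2) / (2 * sigma ** 2))
    return cc                                             # shape (frames, 3)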
def read_data_from_file_list_shared_2(speaker_id_list, inp_file_list, out_file_list,
                                      inp_dim, out_dim, sequential_training=False):
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)
    num_of_spk = len(speaker_id_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = [{} for _ in range(num_of_spk)]  # one utterance dict per speaker
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = []
        for i in range(num_of_spk):
            temp_set_y.append(np.empty((FRAME_BUFFER_SIZE, out_dim)))

    ### read file by file ###
    current_index = [0] * num_of_spk  # keep an index for each speaker

    for spk_i, speaker in enumerate(speaker_id_list):
        # pull the sublist of files whose names match the current speaker
        logical_index = [speaker in name for name in inp_file_list]
        inp_file_sublist = np.array(inp_file_list)[logical_index]
        out_file_sublist = np.array(out_file_list)[logical_index]
        num_sub_utt = len(out_file_sublist)

        for i in range(num_sub_utt):
            inp_file_name = inp_file_sublist[i]
            out_file_name = out_file_sublist[i]
            inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
            out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)

            base_file_name = os.path.basename(inp_file_name).split(".")[0]

            if abs(inp_frame_number - out_frame_number) > 5:
                print('the number of frames in input and output features are different: %d vs %d (%s)'
                      % (inp_frame_number, out_frame_number, base_file_name))
                sys.exit(0)
            else:
                frame_number = min(inp_frame_number, out_frame_number)

            if sequential_training:
                temp_set_x[base_file_name] = inp_features[0:frame_number]
                temp_set_y[spk_i][base_file_name] = out_features[0:frame_number]
            else:
                temp_set_x[sum(current_index):sum(current_index) + frame_number, ] = inp_features[0:frame_number]
                temp_set_y[spk_i][current_index[spk_i]:current_index[spk_i] + frame_number, ] = out_features[0:frame_number]
                current_index[spk_i] += frame_number

            if frame_number not in file_length_dict['framenum2utt']:
                file_length_dict['framenum2utt'][frame_number] = [base_file_name]
            else:
                file_length_dict['framenum2utt'][frame_number].append(base_file_name)

            file_length_dict['utt2framenum'][base_file_name] = frame_number
            drawProgressBar(i + 1, num_sub_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        set_x = temp_set_x[0:sum(current_index), ]
        set_y = []
        for i in range(num_of_spk):
            set_y.append(temp_set_y[i][0:current_index[i], ])
    else:
        set_x = temp_set_x
        set_y = temp_set_y

    return set_x, set_y, file_length_dict
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim,
                             sequential_training=True):
    """Read input and output features from paired file lists.

    Args:
        inp_file_list: list of input feature files

    Returns:
        If sequential training, temp_set_x is a dict such as
        {'arctic_a0001': features, ...} where features are the input
        features with shape (T, inp_dim), and temp_set_y is the same for
        the outputs; otherwise both are flat (frames, dim) matrices.
        file_length_dict records each utterance's frame count, e.g.
        {'framenum2utt': {578: ['arctic_a0001'], 675: ['arctic_a0002']},
         'utt2framenum': {'arctic_a0001': 578, 'arctic_a0002': 675}}.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        logging.debug("read file from {}".format(inp_file_name))
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        if abs(inp_frame_number - out_frame_number) > 5:
            print('the number of frames in input and output features are different: %d vs %d (%s)'
                  % (inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        else:
            frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            temp_set_x[current_index:current_index + frame_number, ] = inp_features[0:frame_number]
            temp_set_y[current_index:current_index + frame_number, ] = out_features[0:frame_number]
            current_index += frame_number

        if frame_number not in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            file_length_dict['framenum2utt'][frame_number].append(base_file_name)

        file_length_dict['utt2framenum'][base_file_name] = frame_number
        drawProgressBar(i + 1, num_of_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
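# The file_length_dict returned above is what the bucketed batching in
# load_next_batch (training_algo == 2) keys on. A sketch of turning it into
# simple length buckets; the 100-frame bucket width is an assumption for
# illustration only:
def _bucketing_demo(file_length_dict):
    buckets = {}
    for utt, n_frames in file_length_dict['utt2framenum'].items():
        buckets.setdefault(n_frames // 100, []).append(utt)
    return buckets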
def make_labels(self, input_file_descriptors, out_file_name=None,
                fill_missing_values=False, iterate_over_frames=False):
    ## input_file_descriptors is e.g. {'xpath': <open XML file for reading>}
    # input_file_descriptors is a dictionary of open label files, all for the
    # same utterance; currently only XPATH or HTS file formats are supported,
    # so keys should be 'xpath' or 'hts'

    # an array in which to assemble all the features
    all_labels = None

    try:
        assert self.configuration
    except AssertionError:
        self.logger.critical('no label configuration loaded, so cannot make labels')
        raise

    # now iterate through the features, and create the features from the appropriate open label file
    xpath_list = []  ## gather all XPATHs here and extract all features in one pass
    mapper_list = []
    for (item_number, feature_specification) in enumerate(self.configuration.labels):
        ## frame features are appended to the data for the *LAST*
        ## feature_specification in the list only
        add_frame_features = False
        if item_number + 1 == len(self.configuration.labels):
            add_frame_features = True

        # which label file should we use?
        if feature_specification.has_key('xpath'):
            # xpath and hts are mutually exclusive label styles
            assert not feature_specification.has_key('hts')

            # actually make the features from this open file and the current XPATH
            try:
                assert self.configuration.target_nodes
            except:
                self.logger.critical(
                    'When using XPATH features, "target_nodes" must be defined in the label config file')
                raise

            try:
                xpath_list.append(feature_specification['xpath'])
                if feature_specification.has_key('mapper'):
                    mapper_list.append(feature_specification['mapper'])
                else:
                    mapper_list.append(None)
            except:
                self.logger.critical(
                    'error creating XMLLabelNormalisation object for feature %s'
                    % feature_specification)
                raise

        if feature_specification.has_key('hts'):
            assert not feature_specification.has_key('xpath')
            # not yet implemented!
            self.logger.warning('HTS features not implemented - ignoring them!')
            # TODO, with implementation: deal with fill_missing_values correctly

    ## Now extract all features in one go, straight into all_labels:
    label_normaliser = XMLLabelNormalisation(
        xpath=xpath_list,
        mapper=mapper_list,
        fill_missing_values=fill_missing_values,
        target_nodes=self.configuration.target_nodes,
        use_compiled_xpath=self.use_precompiled_xpaths,
        iterate_over_frames=iterate_over_frames)

    try:
        all_labels = label_normaliser.extract_linguistic_features(
            input_file_descriptors['xpath'],
            add_frame_features=add_frame_features)
    except KeyError:
        self.logger.critical('no open xpath label file available to create the features')
        raise

    if all_labels is not None:
        self.logger.debug(' composed features now have dimension %d' % all_labels.shape[1])

    # finally, save the labels
    if out_file_name:
        io_funcs = BinaryIOCollection()
        io_funcs.array_to_binary_file(all_labels, out_file_name)
        self.logger.info('saved numerical features of shape %s to %s' %
                         (all_labels.shape, out_file_name))
    else:
        return all_labels
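# A hedged sketch of what one entry of self.configuration.labels might look
# like; the exact schema comes from the label config file, so the keys and
# XPATH strings below are illustrative assumptions only:
#
#   labels = [
#       {'xpath': './/phone/@name', 'mapper': {'sil': [1, 0], 'aa': [0, 1]}},
#       {'xpath': 'count(preceding::syllable)'},   # no 'mapper' -> None is appended
#   ]
#
# Each entry must use either 'xpath' or 'hts' (mutually exclusive), and frame
# features are appended only for the last entry in the list.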
def load_next_batch_S2S(self):
    """Load the data for one utterance. This function will be called when
    utterance-by-utterance loading is required (e.g., sequential training).
    """
    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
    temp_set_d = numpy.empty((self.buffer_size, 1))

    io_fun = BinaryIOCollection()

    lab_start_frame_number = 0
    lab_end_frame_number = 0
    out_start_frame_number = 0
    out_end_frame_number = 0

    new_x_files_list = self.x_files_list[self.file_index].split(',')
    new_y_files_list = self.y_files_list[self.file_index].split(',')
    ## only index the duration list if it is non-empty, otherwise this would
    ## raise an IndexError before the fallback below can take effect
    if self.dur_files_list:
        new_dur_files_list = self.dur_files_list[self.file_index].split(',')

    for new_file_index in xrange(len(new_x_files_list)):
        in_features, lab_frame_number = io_fun.load_binary_file_frame(
            new_x_files_list[new_file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(
            new_y_files_list[new_file_index], self.n_outs)

        lab_end_frame_number += lab_frame_number
        out_end_frame_number += out_frame_number

        temp_set_x[lab_start_frame_number:lab_end_frame_number, ] = in_features[0:lab_frame_number, ]
        temp_set_y[out_start_frame_number:out_end_frame_number, ] = out_features[0:out_frame_number, ]

        if not self.dur_files_list:
            dur_frame_number = out_end_frame_number
            temp_set_d = numpy.array([dur_frame_number])
        else:
            dur_features, dur_frame_number = io_fun.load_binary_file_frame(
                new_dur_files_list[new_file_index], 1)
            assert sum(dur_features) == out_frame_number
            temp_set_d[lab_start_frame_number:lab_end_frame_number, ] = dur_features[0:lab_frame_number, ]

        lab_start_frame_number = lab_end_frame_number
        out_start_frame_number = out_end_frame_number

    temp_set_x = temp_set_x[0:lab_end_frame_number, ]
    temp_set_y = temp_set_y[0:out_end_frame_number, ]

    temp_set_d = temp_set_d[0:lab_end_frame_number, ]
    temp_set_d = numpy.reshape(temp_set_d, (-1, ))
    temp_set_d = temp_set_d.astype(int)

    self.file_index += 1

    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'),
                                 name='d', borrow=True)

    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)
    return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d
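# Minimal consumption loop (sketch): `provider` is assumed to be an already
# initialised data provider with x/y/dur file lists, buffer_size, n_ins and
# n_outs set up elsewhere.
def _demo_s2s_epoch(provider):
    provider.end_reading = False
    while not provider.end_reading:
        shared_xyd, x, y, d = provider.load_next_batch_S2S()
        # x: (label_frames, n_ins), y: (output_frames, n_outs),
        # d: int32 per-label durations (or the total frame count when no
        # duration files are given)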
def generate_wav(data,
                 gen_dir='./results',
                 base='sample',
                 sptk_dir='/u/kumarrit/world.py/merlin/tools/bin/SPTK-3.9/',
                 world_dir='/u/kumarrit/world.py/merlin/tools/bin/WORLD/',
                 norm_info_file='/data/lisa/exp/kumarrit/vctk/norm_info_mgc_lf0_vuv_bap_63_MVN.dat',
                 do_post_filtering=True,
                 mgc_dim=60,
                 fl=1024,
                 sr=16000):
    io_funcs = BinaryIOCollection()
    file_name = os.path.join(gen_dir, base + ".cmp")

    # de-normalise the network output with the stored mean and std
    fid = open(norm_info_file, 'rb')
    cmp_info = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_info = cmp_info.reshape((2, -1))
    cmp_mean = cmp_info[0, ]
    cmp_std = cmp_info[1, ]

    data = data * cmp_std + cmp_mean

    io_funcs.array_to_binary_file(data, file_name)

    # This code was adapted from Merlin. I should add the license.
    out_dimension_dict = {'bap': 1, 'lf0': 1, 'mgc': 60, 'vuv': 1}
    stream_start_index = {}
    file_extension_dict = {
        'mgc': '.mgc',
        'bap': '.bap',
        'lf0': '.lf0',
        'dur': '.dur',
        'cmp': '.cmp'
    }
    gen_wav_features = ['mgc', 'lf0', 'bap']

    # The .cmp layout follows the stream order in the norm_info file name
    # (mgc, lf0, vuv, bap), so index the streams in that order explicitly
    # rather than relying on dict key order:
    dimension_index = 0
    for feature_name in ['mgc', 'lf0', 'vuv', 'bap']:
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    dir_name = os.path.dirname(file_name)
    file_id = os.path.splitext(os.path.basename(file_name))[0]

    features, frame_number = io_funcs.load_binary_file_frame(file_name, 63)
    for feature_name in gen_wav_features:
        current_features = features[:, stream_start_index[feature_name]:
                                    stream_start_index[feature_name] +
                                    out_dimension_dict[feature_name]]
        gen_features = current_features

        if feature_name in ['lf0', 'F0']:
            if 'vuv' in stream_start_index:
                vuv_feature = features[:, stream_start_index['vuv']:
                                       stream_start_index['vuv'] + 1]
                for i in xrange(frame_number):
                    if vuv_feature[i, 0] < 0.5:
                        gen_features[i, 0] = -1.0e+10  # self.inf_float

        new_file_name = os.path.join(
            dir_name, file_id + file_extension_dict[feature_name])
        io_funcs.array_to_binary_file(gen_features, new_file_name)

    pf_coef = 1.4
    fw_alpha = 0.58
    co_coef = 511

    sptk_path = {
        'SOPR': sptk_dir + 'sopr',
        'FREQT': sptk_dir + 'freqt',
        'VSTAT': sptk_dir + 'vstat',
        'MGC2SP': sptk_dir + 'mgc2sp',
        'MERGE': sptk_dir + 'merge',
        'BCP': sptk_dir + 'bcp',
        'MC2B': sptk_dir + 'mc2b',
        'C2ACR': sptk_dir + 'c2acr',
        'MLPG': sptk_dir + 'mlpg',
        'VOPR': sptk_dir + 'vopr',
        'B2MC': sptk_dir + 'b2mc',
        'X2X': sptk_dir + 'x2x',
        'VSUM': sptk_dir + 'vsum'
    }

    world_path = {
        'ANALYSIS': world_dir + 'analysis',
        'SYNTHESIS': world_dir + 'synth'
    }

    fw_coef = fw_alpha
    fl_coef = fl

    files = {
        'sp': base + '.sp',
        'mgc': base + '.mgc',
        'f0': base + '.f0',
        'lf0': base + '.lf0',
        'ap': base + '.ap',
        'bap': base + '.bap',
        'wav': base + '.wav'
    }

    mgc_file_name = files['mgc']

    cur_dir = os.getcwd()
    os.chdir(gen_dir)

    # post-filtering
    if do_post_filtering:
        line = "echo 1 1 "
        for i in range(2, mgc_dim):
            line = line + str(pf_coef) + " "

        run_process('{line} | {x2x} +af > {weight}'.format(
            line=line,
            x2x=sptk_path['X2X'],
            weight=os.path.join(gen_dir, 'weight')))

        run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_r0}'.format(
                        freqt=sptk_path['FREQT'], order=mgc_dim - 1,
                        fw=fw_coef, co=co_coef, mgc=files['mgc'],
                        c2acr=sptk_path['C2ACR'], fl=fl_coef,
                        base_r0=files['mgc'] + '_r0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{freqt} -m {order} -a {fw} -M {co} -A 0 | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'.format(
                        vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        freqt=sptk_path['FREQT'], fw=fw_coef, co=co_coef,
                        c2acr=sptk_path['C2ACR'], fl=fl_coef,
                        base_p_r0=files['mgc'] + '_p_r0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 0 -e 0 > {base_b0}'.format(
                        vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'], fw=fw_coef,
                        bcp=sptk_path['BCP'],
                        base_b0=files['mgc'] + '_b0'))

        run_process('{vopr} -d < {base_r0} {base_p_r0} | '
                    '{sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'.format(
                        vopr=sptk_path['VOPR'],
                        base_r0=files['mgc'] + '_r0',
                        base_p_r0=files['mgc'] + '_p_r0',
                        sopr=sptk_path['SOPR'],
                        base_b0=files['mgc'] + '_b0',
                        base_p_b0=files['mgc'] + '_p_b0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 1 -e {order} | '
                    '{merge} -n {order2} -s 0 -N 0 {base_p_b0} | '
                    '{b2mc} -m {order} -a {fw} > {base_p_mgc}'.format(
                        vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'], fw=fw_coef,
                        bcp=sptk_path['BCP'], merge=sptk_path['MERGE'],
                        order2=mgc_dim - 2,
                        base_p_b0=files['mgc'] + '_p_b0',
                        b2mc=sptk_path['B2MC'],
                        base_p_mgc=files['mgc'] + '_p_mgc'))

        mgc_file_name = files['mgc'] + '_p_mgc'

    # WORLD vocoder
    run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | '
                '{x2x} +fd > {f0}'.format(sopr=sptk_path['SOPR'],
                                          lf0=files['lf0'],
                                          x2x=sptk_path['X2X'],
                                          f0=files['f0']))

    run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
        sopr=sptk_path['SOPR'], bap=files['bap'],
        x2x=sptk_path['X2X'], ap=files['ap']))

    run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | '
                '{sopr} -d 32768.0 -P | {x2x} +fd > {sp}'.format(
                    mgc2sp=sptk_path['MGC2SP'], alpha=fw_alpha,
                    order=mgc_dim - 1, fl=fl, mgc=mgc_file_name,
                    sopr=sptk_path['SOPR'], x2x=sptk_path['X2X'],
                    sp=files['sp']))

    run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
        synworld=world_path['SYNTHESIS'], fl=fl, sr=sr,
        f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))

    run_process('rm -f {ap} {sp} {f0} {bap} {lf0} {mgc} {mgc}_b0 {mgc}_p_b0 '
                '{mgc}_p_mgc {mgc}_p_r0 {mgc}_r0 {cmp} weight'.format(
                    ap=files['ap'], sp=files['sp'], f0=files['f0'],
                    bap=files['bap'], lf0=files['lf0'], mgc=files['mgc'],
                    cmp=base + '.cmp'))

    os.chdir(cur_dir)
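# Example call (sketch): `data` is assumed to be a normalised (T, 63) matrix
# in mgc+lf0+vuv+bap order, and the SPTK/WORLD binaries must exist at the
# (placeholder) default paths for the shell pipelines to run:
def _demo_generate_wav():
    import numpy
    data = numpy.zeros((500, 63), dtype=numpy.float32)  # dummy network output
    generate_wav(data, gen_dir='./results', base='sample',
                 do_post_filtering=True, mgc_dim=60, fl=1024, sr=16000)
    # writes ./results/sample.wav and deletes the intermediate stream files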
def load_next_batch_S2SML(self):
    """Load the data for one utterance. This function will be called when
    utterance-by-utterance loading is required (e.g., sequential training).
    """
    inp_length = (self.MLU_div['word'][1] - self.MLU_div['word'][0]) + (
        self.MLU_div['word'][3] - self.MLU_div['word'][2])
    af_length = self.MLU_div['length'][-1]

    new_temp_set_x = numpy.empty((self.buffer_size, inp_length))
    new_temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
    new_temp_set_af = numpy.empty((self.buffer_size, af_length))
    new_temp_set_d = [
        numpy.array([], 'int32'),
        numpy.array([], 'int32'),
        numpy.array([], 'int32')
    ]

    io_fun = BinaryIOCollection()

    lab_start_frame_number = 0
    lab_end_frame_number = 0
    out_start_frame_number = 0
    out_end_frame_number = 0

    new_x_files_list = self.x_files_list[self.file_index].split(',')
    new_y_files_list = self.y_files_list[self.file_index].split(',')
    new_dur_files_list = self.dur_files_list[self.file_index].split(',')

    for new_file_index in xrange(len(new_x_files_list)):
        in_features, lab_frame_number = io_fun.load_binary_file_frame(
            new_x_files_list[new_file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(
            new_y_files_list[new_file_index], self.n_outs)
        dur_features, dur_frame_number = io_fun.load_binary_file_frame(
            new_dur_files_list[new_file_index], 1)

        ### MLU features sub-division ###
        temp_set_MLU = in_features[0:lab_frame_number, ]
        temp_set_y = out_features[0:out_frame_number, ]

        temp_set_phone = numpy.concatenate([
            temp_set_MLU[:, self.MLU_div['phone'][0]:self.MLU_div['phone'][1]],
            temp_set_MLU[:, self.MLU_div['phone'][2]:self.MLU_div['phone'][3]]
        ], axis=1)
        temp_set_syl = numpy.concatenate([
            temp_set_MLU[:, self.MLU_div['syl'][0]:self.MLU_div['syl'][1]],
            temp_set_MLU[:, self.MLU_div['syl'][2]:self.MLU_div['syl'][3]]
        ], axis=1)
        temp_set_word = numpy.concatenate([
            temp_set_MLU[:, self.MLU_div['word'][0]:self.MLU_div['word'][1]],
            temp_set_MLU[:, self.MLU_div['word'][2]:self.MLU_div['word'][3]]
        ], axis=1)

        ### duration array sub-division ###
        dur_features = numpy.reshape(dur_features, (-1, ))
        temp_set_d = dur_features.astype(int)
        dur_word_syl = temp_set_d[0:-lab_frame_number]

        num_ph = lab_frame_number
        num_syl = (numpy.where(
            numpy.cumsum(dur_word_syl[::-1]) == lab_frame_number)[0][0] + 1)
        num_words = len(dur_word_syl) - num_syl

        temp_set_dur_phone = temp_set_d[-num_ph:]
        temp_set_dur_word = dur_word_syl[0:num_words]
        temp_set_dur_syl = dur_word_syl[num_words:]

        ### additional feature matrix (syllable+phone+frame=432) ###
        num_frames = sum(temp_set_dur_phone)

        temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))
        temp_set_af[0:num_syl, self.MLU_div['length'][0]:self.MLU_div['length'][1]] = \
            temp_set_syl[numpy.cumsum(temp_set_dur_syl) - 1]
        temp_set_af[0:num_ph, self.MLU_div['length'][1]:self.MLU_div['length'][2]] = \
            temp_set_phone

        ### input word feature matrix ###
        temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
        syl_bound = numpy.cumsum(temp_set_dur_word)
        for indx in xrange(num_words):
            temp_set_dur_word_segments[indx] = int(
                sum(temp_set_dur_syl[0:syl_bound[indx]]))
        temp_set_x = temp_set_word[temp_set_dur_word_segments - 1]

        ### for batch processing ###
        lab_end_frame_number += num_words
        out_end_frame_number += out_frame_number

        new_temp_set_x[lab_start_frame_number:lab_end_frame_number, ] = temp_set_x[0:num_words, ]
        new_temp_set_y[out_start_frame_number:out_end_frame_number, ] = temp_set_y[0:out_frame_number, ]
        new_temp_set_af[out_start_frame_number:out_end_frame_number, ] = temp_set_af[0:out_frame_number, ]
        new_temp_set_d[0] = numpy.append(new_temp_set_d[0], temp_set_dur_word)
        new_temp_set_d[1] = numpy.append(new_temp_set_d[1], temp_set_dur_syl)
        new_temp_set_d[2] = numpy.append(new_temp_set_d[2], temp_set_dur_phone)

        lab_start_frame_number = lab_end_frame_number
        out_start_frame_number = out_end_frame_number

    new_temp_set_x = new_temp_set_x[0:lab_end_frame_number, ]
    new_temp_set_y = new_temp_set_y[0:out_end_frame_number, ]
    new_temp_set_af = new_temp_set_af[0:out_end_frame_number, ]
    new_temp_set_d = numpy.concatenate(
        (new_temp_set_d[0], new_temp_set_d[1], new_temp_set_d[2]))

    ### rest of the code similar to S2S ###
    self.file_index += 1

    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(new_temp_set_x, 'x')
    shared_set_y = self.make_shared(new_temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(new_temp_set_d, dtype='int32'),
                                 name='d', borrow=True)

    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)
    return shared_set_xyd, new_temp_set_x, new_temp_set_y, new_temp_set_d, new_temp_set_af
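# Worked example (sketch) of the duration sub-division above: with
# lab_frame_number = 6 phones and a duration vector
#   [3, 1, 2, 2, 1, 1, 4, 5, 3, 6, 2, 7]   (words | syllables | phones)
# dur_word_syl = [3, 1, 2, 2, 1, 1]; the cumulative sum of its reverse first
# reaches 6 after 4 entries, so num_syl = 4 and num_words = 2. That yields
# word durations [3, 1] (in syllables), syllable durations [2, 2, 1, 1]
# (in phones) and phone durations [4, 5, 3, 6, 2, 7] (in frames).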
def load_next_partition(self):
    """Load one block of data. The number of frames will be the buffer size
    set during initialisation.
    """
    self.logger.debug('loading next partition')

    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
    current_index = 0

    ### first check whether there are remaining data from the previous utterance
    if self.remain_frame_number > 0:
        temp_set_x[current_index:self.remain_frame_number, ] = self.remain_data_x
        temp_set_y[current_index:self.remain_frame_number, ] = self.remain_data_y
        current_index += self.remain_frame_number
        self.remain_frame_number = 0

    io_fun = BinaryIOCollection()
    while True:
        if current_index >= self.buffer_size:
            break
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0
            break

        in_features, lab_frame_number = io_fun.load_binary_file_frame(
            self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(
            self.y_files_list[self.file_index], self.n_outs)

        frame_number = lab_frame_number
        if abs(lab_frame_number - out_frame_number) < 5:
            ## we allow a small difference here: sometimes there is a one- or
            ## two-frame mismatch between label and acoustic features
            if lab_frame_number > out_frame_number:
                frame_number = out_frame_number
        else:
            base_file_name = self.x_files_list[self.file_index].split('/')[-1].split('.')[0]
            message = ("the number of frames in label and acoustic features are different: %d vs %d (%s)"
                       % (lab_frame_number, out_frame_number, base_file_name))
            self.logger.critical(message)
            raise ValueError(message)

        out_features = out_features[0:frame_number, ]
        in_features = in_features[0:frame_number, ]

        if current_index + frame_number <= self.buffer_size:
            temp_set_x[current_index:current_index + frame_number, ] = in_features
            temp_set_y[current_index:current_index + frame_number, ] = out_features
            current_index = current_index + frame_number
        else:
            ## if the current utterance cannot be stored in this block,
            ## leave the remainder for the next block
            used_frame_number = self.buffer_size - current_index
            temp_set_x[current_index:self.buffer_size, ] = in_features[0:used_frame_number, ]
            temp_set_y[current_index:self.buffer_size, ] = out_features[0:used_frame_number, ]
            current_index = self.buffer_size

            self.remain_data_x = in_features[used_frame_number:frame_number, ]
            self.remain_data_y = out_features[used_frame_number:frame_number, ]
            self.remain_frame_number = frame_number - used_frame_number

        self.file_index += 1

    temp_set_x = temp_set_x[0:current_index, ]
    temp_set_y = temp_set_y[0:current_index, ]

    ## shuffle x and y with the same fixed seed so their rows stay aligned
    numpy.random.seed(271639)
    numpy.random.shuffle(temp_set_x)
    numpy.random.seed(271639)
    numpy.random.shuffle(temp_set_y)

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')

    shared_set_xy = (shared_set_x, shared_set_y)

    return shared_set_xy, temp_set_x, temp_set_y
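# Typical frame-level training loop (sketch; `provider` is assumed to be an
# initialised data provider object):
def _demo_partition_epoch(provider):
    provider.end_reading = False
    while not provider.end_reading:
        shared_xy, x, y = provider.load_next_partition()
        # x and y hold up to buffer_size frames; reusing the same RNG seed
        # for both shuffles keeps input and output rows aligned.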
def generate_wav(gen_dir, file_id_list, cfg):
    logger = logging.getLogger("wav_generation")

    SPTK = cfg.SPTK
    # NND = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, basestring):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            fw_coef = erb_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()
        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:
        logger.info('creating waveform for %4d of %4d: %s' %
                    (counter, max_counter, filename))
        counter = counter + 1
        base = filename
        files = {
            'sp': base + cfg.sp_ext,
            'mgc': base + cfg.mgc_ext,
            'f0': base + '.f0',
            'lf0': base + cfg.lf0_ext,
            'ap': base + '.ap',
            'bap': base + cfg.bap_ext,
            'wav': base + '.wav'
        }

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "

            run_process('{line} > {weighttxt}'.format(
                line=line, weighttxt=os.path.join(gen_dir, 'weight.txt')))
            run_process('{x2x} +af < {weighttxt} > {weight}'.format(
                x2x=SPTK['X2X'],
                weighttxt=os.path.join(gen_dir, 'weight.txt'),
                weight=os.path.join(gen_dir, 'weight.bin')))

            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} > {temp1}'.format(
                freqt=SPTK['FREQT'], order=cfg.mgc_dim - 1, fw=fw_coef,
                co=co_coef, mgc=files['mgc'],
                temp1=files['mgc'] + '_r0temp1'))
            run_process('{c2acr} -m {co} -M 0 -l {fl} < {temp1} > {base_r0}'.format(
                co=co_coef, c2acr=SPTK['C2ACR'], fl=fl_coef,
                base_r0=files['mgc'] + '_r0',
                temp1=files['mgc'] + '_r0temp1'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp2}'.format(
                vopr=SPTK['VOPR'], order=cfg.mgc_dim - 1, mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight.bin'),
                temp2=files['mgc'] + '_mgctemp2'))
            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {temp2} > {temp3}'.format(
                order=cfg.mgc_dim - 1, freqt=SPTK['FREQT'], fw=fw_coef,
                co=co_coef, temp2=files['mgc'] + '_mgctemp2',
                temp3=files['mgc'] + '_mgctemp3'))
            run_process('{c2acr} -m {co} -M 0 -l {fl} < {temp3} > {base_p_r0}'.format(
                temp3=files['mgc'] + '_mgctemp3', co=co_coef,
                c2acr=SPTK['C2ACR'], fl=fl_coef,
                base_p_r0=files['mgc'] + '_p_r0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp4}'.format(
                vopr=SPTK['VOPR'], order=cfg.mgc_dim - 1, mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight.bin'),
                temp4=files['mgc'] + '_mgctemp4'))
            run_process('{mc2b} -m {order} -a {fw} < {temp4} > {temp5}'.format(
                order=cfg.mgc_dim - 1, mc2b=SPTK['MC2B'], fw=fw_coef,
                temp4=files['mgc'] + '_mgctemp4',
                temp5=files['mgc'] + '_mgctemp5'))
            run_process('{bcp} -n {order} -s 0 -e 0 < {temp5} > {base_b0}'.format(
                order=cfg.mgc_dim - 1, bcp=SPTK['BCP'],
                base_b0=files['mgc'] + '_b0',
                temp5=files['mgc'] + '_mgctemp5'))

            run_process('{vopr} -d < {base_r0} {base_p_r0} > {temp6}'.format(
                vopr=SPTK['VOPR'], base_r0=files['mgc'] + '_r0',
                base_p_r0=files['mgc'] + '_p_r0',
                temp6=files['mgc'] + '_mgctemp6'))
            run_process('{sopr} -LN -d 2 < {temp6} > {temp7}'.format(
                sopr=SPTK['SOPR'], temp6=files['mgc'] + '_mgctemp6',
                temp7=files['mgc'] + '_mgctemp7'))
            run_process('{vopr} -a {base_b0} < {temp7} > {base_p_b0}'.format(
                vopr=SPTK['VOPR'], temp7=files['mgc'] + '_mgctemp7',
                base_b0=files['mgc'] + '_b0',
                base_p_b0=files['mgc'] + '_p_b0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp8}'.format(
                vopr=SPTK['VOPR'], order=cfg.mgc_dim - 1, mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight.bin'),
                temp8=files['mgc'] + '_mgctemp8'))
            run_process('{mc2b} -m {order} -a {fw} < {temp8} > {temp9}'.format(
                order=cfg.mgc_dim - 1, mc2b=SPTK['MC2B'], fw=fw_coef,
                temp8=files['mgc'] + '_mgctemp8',
                temp9=files['mgc'] + '_mgctemp9'))
            run_process('{bcp} -n {order} -s 1 -e {order} < {temp9} > {temp10}'.format(
                order=cfg.mgc_dim - 1, bcp=SPTK['BCP'],
                temp9=files['mgc'] + '_mgctemp9',
                temp10=files['mgc'] + '_mgctemp10'))
            run_process('{merge} -n {order2} -s 0 -N 0 {base_p_b0} < {temp10} > {temp11}'.format(
                merge=SPTK['MERGE'], order2=cfg.mgc_dim - 2,
                base_p_b0=files['mgc'] + '_p_b0',
                temp10=files['mgc'] + '_mgctemp10',
                temp11=files['mgc'] + '_mgctemp11'))
            run_process('{b2mc} -m {order} -a {fw} < {temp11} > {base_p_mgc}'.format(
                order=cfg.mgc_dim - 1, fw=fw_coef, b2mc=SPTK['B2MC'],
                base_p_mgc=files['mgc'] + '_p_mgc',
                temp11=files['mgc'] + '_mgctemp11'))

            mgc_file_name = files['mgc'] + '_p_mgc'

        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)
            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))
            local_gv = (ref_gv_std / gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean
            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(
                gen_std, frame_number, 1).T * (gen_mgc - np.repeat(
                    gen_mu, frame_number, 1).T) + np.repeat(
                        gen_mu, frame_number, 1).T

            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)
            mgc_file_name = files['mgc'] + '_p_mgc'

        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical("Both smoothing techniques can't be applied together!\n")
            raise RuntimeError('post-filtering and GV enhancement are mutually exclusive')

        ### mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'.format(
                mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha,
                order=cfg.mgc_dim - 1, fl=cfg.fl, mgc=mgc_file_name,
                sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(
                sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(
                x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))

            if cfg.use_cep_ap:
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'.format(
                    mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha,
                    order=cfg.bap_dim - 1, fl=cfg.fl, bap=files['bap'],
                    ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'.format(
                    bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'],
                    ap=files['ap']))

            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 0.0 -cornf 400 -float -apfile {ap} {f0a} {sp} {wav}'.format(
                synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl,
                shift=cfg.shift, ap=files['ap'], f0a=files['f0'] + '.a',
                sp=files['sp'], wav=files['wav']))

            run_process('rm -f {sp} {f0} {f0a} {ap}'.format(
                sp=files['sp'], f0=files['f0'], f0a=files['f0'] + '.a',
                ap=files['ap']))

        elif cfg.vocoder_type == 'WORLD':
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {temp12}'.format(
                sopr=SPTK['SOPR'], lf0=files['lf0'],
                temp12=files['f0'] + '_temp12'))
            run_process('{x2x} +fd < {temp12} > {f0}'.format(
                x2x=SPTK['X2X'], f0=files['f0'],
                temp12=files['f0'] + '_temp12'))

            run_process('{sopr} -c 0 {bap} > {temp13}'.format(
                sopr=SPTK['SOPR'], bap=files['bap'],
                temp13=files['ap'] + '_temp13'))
            run_process('{x2x} +fd < {temp13} > {ap}'.format(
                x2x=SPTK['X2X'], ap=files['ap'],
                temp13=files['ap'] + '_temp13'))
            ### If using WORLD v2, comment out the two commands above and uncomment this:
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {temp14}'.format(
                mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha,
                order=cfg.mgc_dim - 1, fl=cfg.fl, mgc=mgc_file_name,
                temp14=files['sp'] + '_temp14'))
            run_process('{sopr} -d 32768.0 -P < {temp14} > {temp15}'.format(
                sopr=SPTK['SOPR'], temp14=files['sp'] + '_temp14',
                temp15=files['sp'] + '_temp15'))
            run_process('{x2x} +fd < {temp15} > {sp}'.format(
                x2x=SPTK['X2X'], sp=files['sp'],
                temp15=files['sp'] + '_temp15'))

            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
                synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr,
                f0=files['f0'], sp=files['sp'], ap=files['ap'],
                wav=files['wav']))

            #run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'], sp=files['sp'], f0=files['f0']))
        else:
            logger.critical('The vocoder %s is not supported yet!\n' % cfg.vocoder_type)
            raise NotImplementedError('unsupported vocoder type: %s' % cfg.vocoder_type)

        os.chdir(cur_dir)
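# Pipeline note (editorial sketch of the WORLD branch above): SOPR -EXP turns
# log-F0 back into linear F0, with the -1.0E+10 unvoiced magic value mapped
# to 0.0; the spectrum comes from MGC2SP (-o 2), rescaled by 1/32768 and
# squared before synthesis; and each x2x +fd step widens SPTK's 4-byte floats
# into the 8-byte doubles that the WORLD synth binary reads. Treat the
# rescaling rationale (undoing 16-bit waveform scaling applied at analysis
# time) as an assumption rather than documented behaviour.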
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension,
                 silence_feature_index, percent_to_keep=0):
    '''
    Trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files
        in_dimension: dimension of the data to trim
        label_list: list of binary labels which contain the trimming criterion
        label_dimension: dimension of the label data
        silence_feature_index: index of the silence feature in the labels:
                               1 means silence (trim), 0 means leave.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):
        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(audio_label_difference) < 3, \
            '%s and %s contain different numbers of frames: %s %s' % (
                infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep the label fixed, as we assume
        ## it has already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:
            ## label is longer -- pad audio to match by repeating the last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:
            ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## if it's all 0s or all 1s, that's ok:
            assert ((numpy.unique(silence_flag) == numpy.array([0])).all() or
                    (numpy.unique(silence_flag) == numpy.array([1])).all()), \
                'dimension %s of %s contains values other than 0 and 1' % (
                    silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... ' % (
            100.0 * numpy.sum(silence_flag) / float(len(silence_flag)),
            int(numpy.sum(silence_flag))))

        non_silence_indices = numpy.nonzero(silence_flag == 0)
        ## ^-- the indices where silence_flag == 0 is True (i.e. != 0)

        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of the input
            silence_indices = silence_indices[0]
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as step value in slice
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1])  ## avoid errors in case there is no silence
            print(' Restore %s%% (every %sth frame: %s frames) of silent frames' % (
                percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to the end of the utterance -- the same function is used for
            ## labels and audio, so violating temporal order doesn't matter: it stays
            ## consistent. Later, frame shuffling will disperse the silent frames
            ## evenly across minibatches:
            non_silence_indices = numpy.hstack(
                [non_silence_indices[0], silence_indices_to_keep])
            ## ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
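# Example call (sketch): trim silence from cmp files using a binary label
# stream whose final dimension flags silence. All paths, dimensions and the
# silence index are placeholder assumptions:
def _demo_trim_silence():
    in_list = ['data/nn_cmp/arctic_a0001.cmp']
    out_list = ['data/nn_cmp_trimmed/arctic_a0001.cmp']
    label_list = ['data/nn_lab/arctic_a0001.lab']
    trim_silence(in_list, out_list, 187, label_list, 425,
                 silence_feature_index=424, percent_to_keep=5)
    # percent_to_keep=5 re-inserts every 20th silent frame (100 // 5 = 20)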
def extract_acousitc_label_features(self, orig_file, output_file):
    io_funcs = BinaryIOCollection()
    totalMat = io_funcs.file2matrix(orig_file, numpy.int)
    labelMat = totalMat[:, :-5]
    durMat = totalMat[:, -5:]

    label_len = totalMat.shape[1] - 5
    self.label_dimension = label_len + 9

    phone_number = labelMat.shape[0]
    label_feature_matrix = numpy.empty((100000, self.label_dimension))

    state_number = 5
    label_feature_index = 0
    for phone_index in xrange(phone_number):
        label_vector = labelMat[phone_index, :]
        state_vector = durMat[phone_index, :]

        phone_duration = 0
        state_duration_bases = numpy.zeros((5, ), dtype=numpy.int)
        for state_index in xrange(state_number):
            state_duration_bases[state_index] = phone_duration
            phone_duration = phone_duration + state_vector[state_index]

        for state_index in xrange(state_number):
            frame_number = state_vector[state_index]
            current_block_binary_array = numpy.zeros((frame_number, self.label_dimension))
            state_duration_base = state_duration_bases[state_index]
            state_index_backward = state_number - state_index

            for i in xrange(frame_number):
                current_block_binary_array[i, 0:label_len] = label_vector

                current_block_binary_array[i, label_len] = \
                    float(i + 1) / float(frame_number)  ## fraction through state (forwards)
                current_block_binary_array[i, label_len + 1] = \
                    float(frame_number - i) / float(frame_number)  ## fraction through state (backwards)
                current_block_binary_array[i, label_len + 2] = \
                    float(frame_number)  ## length of state in frames
                current_block_binary_array[i, label_len + 3] = \
                    float(state_index)  ## state index (counting forwards)
                current_block_binary_array[i, label_len + 4] = \
                    float(state_index_backward)  ## state index (counting backwards)
                current_block_binary_array[i, label_len + 5] = \
                    float(phone_duration)  ## length of phone in frames
                current_block_binary_array[i, label_len + 6] = \
                    float(frame_number) / float(phone_duration)  ## fraction of the phone made up by current state
                current_block_binary_array[i, label_len + 7] = \
                    float(phone_duration - i - state_duration_base) / float(phone_duration)  ## fraction through phone (forwards)
                current_block_binary_array[i, label_len + 8] = \
                    float(state_duration_base + i + 1) / float(phone_duration)  ## fraction through phone (backwards)

            label_feature_matrix[label_feature_index:label_feature_index + frame_number, ] = \
                current_block_binary_array
            label_feature_index = label_feature_index + frame_number

    label_feature_matrix = label_feature_matrix[0:label_feature_index, ]
    io_funcs.array_to_binary_file(label_feature_matrix, output_file)
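# Worked example (sketch) of the 9 appended frame-level features: for a state
# with frame_number = 4, forward state_index = 2, phone_duration = 20 and
# state_duration_base = 5, the first frame (i = 0) gets
#   [1/4, 4/4, 4, 2, 3, 20, 4/20, (20 - 0 - 5)/20, (5 + 0 + 1)/20]
# i.e. forward/backward fraction through the state, state length in frames,
# forward/backward state index, phone length in frames, the state's share of
# the phone, and forward/backward fraction through the phone.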
def modify_dur_from_state_alignment_labels(self, label_file_name,
                                           gen_dur_file_name,
                                           gen_lab_file_name):
    logger = logging.getLogger("dur")

    state_number = self.state_number
    dur_dim = state_number

    io_funcs = BinaryIOCollection()
    dur_features, frame_number = io_funcs.load_binary_file_frame(gen_dur_file_name, dur_dim)

    fid = open(label_file_name)
    utt_labels = fid.readlines()
    fid.close()

    label_number = len(utt_labels)
    logger.info('loaded %s, %3d labels' % (label_file_name, label_number))

    out_fid = open(gen_lab_file_name, 'w')

    current_index = 0
    prev_end_time = 0
    for line in utt_labels:
        line = line.strip()

        if len(line) < 1:
            continue
        temp_list = re.split('\s+', line)

        if len(temp_list) == 1:
            start_time = 0
            end_time = 600000  ## hard-coded silence duration
            full_label = temp_list[0]
        else:
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]

        full_label_length = len(full_label) - 3  # remove state information [k]
        state_index = full_label[full_label_length + 1]
        state_index = int(state_index) - 1

        label_binary_flag = self.check_silence_pattern(full_label)

        if len(temp_list) == 1:
            for state_index in range(1, state_number + 1):
                if label_binary_flag == 1:
                    current_state_dur = end_time - start_time
                else:
                    pred_state_dur = dur_features[current_index, state_index - 1]
                    current_state_dur = int(pred_state_dur) * 5 * 10000
                out_fid.write(str(prev_end_time) + ' ' +
                              str(prev_end_time + current_state_dur) + ' ' +
                              full_label + '[' + str(state_index + 1) + ']\n')
                prev_end_time = prev_end_time + current_state_dur
        else:
            if label_binary_flag == 1:
                current_state_dur = end_time - start_time
            else:
                pred_state_dur = dur_features[current_index, state_index - 1]
                current_state_dur = int(pred_state_dur) * 5 * 10000
            out_fid.write(str(prev_end_time) + ' ' +
                          str(prev_end_time + current_state_dur) + ' ' +
                          full_label + '\n')
            prev_end_time = prev_end_time + current_state_dur

        if state_index == state_number and label_binary_flag != 1:
            current_index += 1

    out_fid.close()

    logger.debug('modified label with predicted duration of %d frames x %d features'
                 % dur_features.shape)
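# Unit note (sketch): HTK-style label times are in 100 ns units, so with the
# 5 ms frame shift assumed by this function one predicted frame becomes
# 5 * 10000 units -- hence `int(pred_state_dur) * 5 * 10000`. The hard-coded
# 600000 fallback therefore assigns 60 ms to each of the five states of an
# un-timed silence label (300 ms in total).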
        dr = int(dr) * 5 * 10000
        op1.write(str(prev_ed) + ' ' + str(prev_ed + dr) + ' ' + fstr[2] + '\n')
        prev_ed = prev_ed + dr
        count = count + 1

    ip2.close()
    op1.close()


if __name__ == "__main__":
    htsclass = readHTSlabelFile()
    io_funcs = BinaryIOCollection()

    ### speaker ###
    speaker = 'blz16'
    decomposition_unit = 'phone'
    normalization = 'MVN'
    out_dim = 8
    CTC_classes = 12

    ### Absolute work path ###
    work_dir = '/afs/inf.ed.ac.uk/group/cstr/projects/phd/s1432486/work/Hybrid_prosody_model/'
    label_align_dir = os.path.join(
        work_dir, 'Data/inter-module/' + speaker + '/label_state_align')
    feat_dir_path = 'dur_' + decomposition_unit + '_' + str(out_dim)
def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict,
                 out_dimension_dict):
    logger = logging.getLogger("acoustic_comp")

    stream_start_index = {}
    stream_dim_index = 0
    for stream_name in list(out_dimension_dict.keys()):
        if stream_name not in stream_start_index:
            stream_start_index[stream_name] = stream_dim_index
            stream_dim_index += out_dimension_dict[stream_name]

    io_funcs = BinaryIOCollection()

    for i in range(self.file_number):
        out_file_name = out_file_list[i]
        #if os.path.isfile(out_file_name):
        #    logger.info('processing file %4d of %4d : %s exists' % (i+1, self.file_number, out_file_name))
        #    continue
        logger.info('processing file %4d of %4d : %s' %
                    (i + 1, self.file_number, out_file_name))

        out_data_matrix = None
        out_frame_number = 0

        for k in range(self.data_stream_number):
            data_stream_name = self.data_stream_list[k]
            in_file_name = in_file_list_dict[data_stream_name][i]
            in_feature_dim = in_dimension_dict[data_stream_name]
            features, frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

            if k == 0:
                out_frame_number = frame_number
                out_data_matrix = numpy.zeros((out_frame_number, self.out_dimension))

            if frame_number > out_frame_number:
                features = features[0:out_frame_number, ]
                frame_number = out_frame_number

            try:
                assert out_frame_number == frame_number
            except AssertionError:
                logger.critical(
                    'the frame number of data stream %s is not consistent with others: current %d others %d'
                    % (data_stream_name, out_frame_number, frame_number))
                raise

            dim_index = stream_start_index[data_stream_name]

            if data_stream_name in ['lf0', 'F0']:  ## F0 added for GlottHMM
                features, vuv_vector = self.interpolate_f0(features)
                ### if vuv information is to be recorded, store it in the corresponding column
                if self.record_vuv:
                    out_data_matrix[0:out_frame_number,
                                    stream_start_index['vuv']:stream_start_index['vuv'] + 1] = vuv_vector

            out_data_matrix[0:out_frame_number, dim_index:dim_index + in_feature_dim] = features
            dim_index = dim_index + in_feature_dim

            if self.compute_dynamic[data_stream_name]:
                delta_features = self.compute_dynamic_matrix(
                    features, self.delta_win, frame_number, in_feature_dim)
                acc_features = self.compute_dynamic_matrix(
                    features, self.acc_win, frame_number, in_feature_dim)

                out_data_matrix[0:out_frame_number, dim_index:dim_index + in_feature_dim] = delta_features
                dim_index = dim_index + in_feature_dim
                out_data_matrix[0:out_frame_number, dim_index:dim_index + in_feature_dim] = acc_features

        ### write data to file
        io_funcs.array_to_binary_file(out_data_matrix, out_file_name)
        logger.debug(' wrote %d frames of features', out_frame_number)
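# Example stream setup (sketch): these dictionaries mirror the 63-dim static
# layout used by the WORLD generate_wav above; real recipes usually add
# delta/delta-delta streams, so treat the exact values as assumptions.
def _demo_prepare_data(composer, out_file_list):
    in_file_list_dict = {
        'mgc': ['feats/arctic_a0001.mgc'],
        'lf0': ['feats/arctic_a0001.lf0'],
        'bap': ['feats/arctic_a0001.bap'],
    }
    in_dimension_dict = {'mgc': 60, 'lf0': 1, 'bap': 1}
    out_dimension_dict = {'mgc': 60, 'lf0': 1, 'vuv': 1, 'bap': 1}
    composer.prepare_data(in_file_list_dict, out_file_list,
                          in_dimension_dict, out_dimension_dict)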
def acoustic_decomposition(self, in_file_list, dimension, out_dimension_dict,
                           file_extension_dict, var_file_dict, do_MLPG=True,
                           cfg=None):
    logger = logging.getLogger('param_generation')
    logger.debug('acoustic_decomposition for %d files' % len(in_file_list))

    self.load_covariance(var_file_dict, out_dimension_dict)

    stream_start_index = {}
    dimension_index = 0
    recorded_vuv = False
    vuv_dimension = None

    for feature_name in list(out_dimension_dict.keys()):
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    io_funcs = BinaryIOCollection()

    mlpg_algo = MLParameterGeneration()

    findex = 0
    flen = len(in_file_list)
    for file_name in in_file_list:
        findex = findex + 1

        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]

        features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)
        # logger.info('processing %4d of %4d: %s' % (findex, flen, file_name))

        for feature_name in self.gen_wav_features:
            logger.debug(' feature: %s' % feature_name)

            current_features = features[:, stream_start_index[feature_name]:
                                        stream_start_index[feature_name] +
                                        out_dimension_dict[feature_name]]
            if FAST_MLPG:
                ### the fast version wants variance per frame, not a single global one:
                var = self.var[feature_name]
                var = numpy.transpose(numpy.tile(var, frame_number))
            else:
                var = self.var[feature_name]

            if not do_MLPG:
                gen_features = current_features
            else:
                gen_features = mlpg_algo.generation(
                    current_features, var, out_dimension_dict[feature_name] // 3)

            logger.debug(' feature dimensions: %d by %d' %
                         (gen_features.shape[0], gen_features.shape[1]))

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = features[:, stream_start_index['vuv']:
                                           stream_start_index['vuv'] + 1]
                    for i in range(frame_number):
                        if vuv_feature[i, 0] < 0.5 or gen_features[i, 0] < numpy.log(20):
                            gen_features[i, 0] = self.inf_float

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

            if self.enforce_silence:
                silence_pattern = cfg.silence_pattern
                label_align_dir = cfg.in_label_align_dir
                in_f = open(label_align_dir + '/' + file_id + '.lab', 'r')
                for line in in_f.readlines():
                    line = line.strip()
                    if len(line) < 1:
                        continue
                    temp_list = re.split('\s+', line)
                    ## convert 100 ns label times to 5 ms frame indices
                    start_time = int(int(temp_list[0]) * (10 ** -4) / 5)
                    end_time = int(int(temp_list[1]) * (10 ** -4) / 5)
                    full_label = temp_list[2]

                    label_binary_flag = self.check_silence_pattern(full_label, silence_pattern)

                    if label_binary_flag:
                        if feature_name in ['lf0', 'F0', 'mag']:
                            gen_features[start_time:end_time, :] = self.inf_float
                        else:
                            gen_features[start_time:end_time, :] = 0.0

            io_funcs.array_to_binary_file(gen_features, new_file_name)
            logger.debug(' wrote to file %s' % new_file_name)
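# Example call (sketch): split generated .cmp files back into per-stream
# parameter files, running MLPG with stored variances. The dimensions assume
# delta/delta-delta streams (180 = 60 static mgc x 3) plus vuv, i.e. a
# 187-dim cmp; all names and paths are placeholders:
def _demo_acoustic_decomposition(generator, cmp_file_list):
    out_dimension_dict = {'mgc': 180, 'lf0': 3, 'vuv': 1, 'bap': 3}
    file_extension_dict = {'mgc': '.mgc', 'lf0': '.lf0', 'bap': '.bap'}
    var_file_dict = {'mgc': 'var/mgc.var', 'lf0': 'var/lf0.var',
                     'bap': 'var/bap.var'}
    generator.acoustic_decomposition(
        cmp_file_list, 187, out_dimension_dict, file_extension_dict,
        var_file_dict, do_MLPG=True, cfg=None)  # cfg only needed when enforce_silence is set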