def normal_standardization(self, in_file_list, out_file_list, feature_dimension):
    """Z-score normalise every input file and write the result.

    Corpus-level mean and standard deviation are estimated first, then each
    file is normalised frame by frame.  Returns the (mean, std) vectors so
    callers can later denormalise.
    """
    self.feature_dimension = feature_dimension
    mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
    std_vector = self.compute_std(in_file_list, mean_vector, 0, feature_dimension)

    io_funcs = BinaryIOCollection()
    for in_name, out_name in zip(in_file_list, out_file_list):
        features, frame_count = io_funcs.load_binary_file_frame(in_name, self.feature_dimension)
        # Broadcast the corpus statistics over every frame of this utterance.
        centred = features - numpy.tile(mean_vector, (frame_count, 1))
        scaled = centred / numpy.tile(std_vector, (frame_count, 1))
        io_funcs.array_to_binary_file(scaled, out_name)

    return mean_vector, std_vector
def process_utterance(self, utt):
    """Synthesise a waveform for *utt* from the trained acoustic model.

    Generates parameter streams from the utterance's label file, dumps each
    stream as a binary file under <voice>/output/cmp/, then vocodes the
    streams into the utterance's output wave file.  No-op (with a warning)
    if the model has not been trained yet.
    """
    if not self.trained:
        print('WARNING: Cannot apply processor %s till model is trained' % (self.processor_name))
        return
    label = utt.get_filename(self.input_label_filetype)
    owave = utt.get_filename(self.output_filetype)
    # Variance expansion and unvoiced-gap filling are forwarded straight to
    # the model's generation routine.
    streams = self.model.generate(label, variance_expansion=self.variance_expansion, \
                                  fill_unvoiced_gaps=self.fill_unvoiced_gaps)
    # Streams is a dictionary of parameter matrices, e.g. {bap, lf0, mgc, vuv};
    # each is written next to the voice under output/cmp/.
    directory = os.path.join(self.voice_resources.path['voice'], 'output', 'cmp')
    if not os.path.exists(directory):
        os.makedirs(directory)
    io = BinaryIOCollection()
    for name, data in streams.items():
        # One binary file per stream: <utterance_name>.<stream_name>
        file = os.path.join(directory, utt.data.attrib['utterance_name'] + '.' + name)
        io.array_to_binary_file(data, file)
    # Vocode the generated streams into the output waveform.
    self.world_resynth(streams, owave)
def merge_data(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    '''Merge new features with normalised label features, utterance by utterance.

    For each utterance the label matrix (lab_dim columns) and the new feature
    matrix (feat_dim columns) are concatenated column-wise into a
    (lab_frame_number, lab_dim + feat_dim) matrix and written out.

    Raises ValueError when an utterance's label frame count exceeds its
    feature frame count by more than 5 frames.
    '''
    utt_number = len(new_feat_file_list)
    if utt_number != len(binary_label_file_list):
        print("the number of new feature input files and label files should be the same!\n")
        sys.exit(1)

    new_feat_ext = new_feat_file_list[0].split('/')[-1].split('.')[1]

    io_funcs = BinaryIOCollection()
    for i in range(utt_number):
        lab_file_name = binary_label_file_list[i]
        new_feat_file_name = new_feat_file_list[i]
        out_feat_file_name = out_feat_file_list[i]

        lab_features, lab_frame_number = io_funcs.load_binary_file_frame(lab_file_name, self.lab_dim)
        new_features, feat_frame_number = io_funcs.load_binary_file_frame(new_feat_file_name, self.feat_dim)

        if (lab_frame_number - feat_frame_number) > 5:
            base_file_name = new_feat_file_list[i].split('/')[-1].split('.')[0]
            msg = ("the number of frames in label and new features are different: %d vs %d (%s)"
                   % (lab_frame_number, feat_frame_number, base_file_name))
            self.logger.critical(msg)
            # BUG FIX: a bare `raise` with no active exception triggers
            # "RuntimeError: No active exception to re-raise"; raise a
            # meaningful exception instead.
            raise ValueError(msg)

        merged_features = numpy.zeros((lab_frame_number, self.lab_dim + self.feat_dim))
        merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
        # BUG FIX: the destination slice used feat_frame_number rows while the
        # source used lab_frame_number rows; when the counts differ the
        # assignment failed with a shape mismatch.  Copy the overlap.
        common = min(lab_frame_number, feat_frame_number)
        merged_features[0:common, self.lab_dim:self.lab_dim + self.feat_dim] = new_features[0:common, ]
        io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
        self.logger.debug('merged new feature %s of %d frames with %d label features'
                          % (new_feat_ext, feat_frame_number, lab_frame_number))
def feature_normalisation(self, in_file_list, out_file_list):
    """Mean/variance (z-score) normalise every input file.

    Corpus-level mean/std vectors are computed lazily on first use and cached
    on self.  Returns (mean_vector, std_vector).

    Raises AssertionError when the input and output lists differ in length.
    """
    logger = logging.getLogger('feature_normalisation')

    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical('The input and output file numbers are not the same! %d vs %d'
                        % (len(in_file_list), len(out_file_list)))
        raise

    # BUG FIX: `== None` does an elementwise comparison once these vectors
    # are numpy arrays (ambiguous truth value); test identity instead.
    if self.mean_vector is None:
        self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
    if self.std_vector is None:
        self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    # BUG FIX: xrange does not exist in Python 3.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])

    return self.mean_vector, self.std_vector
def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
    """Rewrite each input file so its frame count matches its reference file.

    Longer inputs are truncated; shorter inputs are zero-padded.  Files whose
    frame counts already match are left untouched.
    """
    logger = logging.getLogger("acoustic_comp")
    logger.info('making equal number of lines...')

    io_funcs = BinaryIOCollection()
    for in_file_name, ref_file_name in zip(in_file_list, ref_file_list):
        in_data_stream_name = in_file_name.split('.')[-1]
        in_feature_dim = in_dimension_dict[in_data_stream_name]
        in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

        ref_data_stream_name = ref_file_name.split('.')[-1]
        ref_feature_dim = in_dimension_dict[ref_data_stream_name]
        ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

        # Already aligned: nothing to rewrite for this utterance.
        if in_frame_number == ref_frame_number:
            continue

        # Copy as many frames as both sides share; any extra rows stay zero.
        target_features = numpy.zeros((ref_frame_number, in_feature_dim))
        keep = min(in_frame_number, ref_frame_number)
        target_features[0:keep, ] = in_features[0:keep, ]
        io_funcs.array_to_binary_file(target_features, in_file_name)

    logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' % (in_data_stream_name, ref_data_stream_name))
def normalise_data(self, in_file_list, out_file_list):
    """Min-max normalise each input file into the configured target range.

    Columns listed in self.exclude_columns keep their original, raw values.
    """
    # Per-dimension spans; degenerate (<= 0) data spans are replaced by unit
    # spans so those dimensions pass through with only the offset applied.
    data_span = numpy.reshape(self.max_vector - self.min_vector, (1, self.feature_dimension))
    target_span = numpy.zeros((1, self.feature_dimension))
    target_span.fill(self.target_max_value - self.target_min_value)
    target_span[data_span <= 0.0] = 1.0
    data_span[data_span <= 0.0] = 1.0

    io_funcs = BinaryIOCollection()
    for in_name, out_name in zip(in_file_list, out_file_list):
        features = io_funcs.load_binary_file(in_name, self.feature_dimension)
        frame_number = features.size // self.feature_dimension

        scale = numpy.tile(target_span, (frame_number, 1)) / numpy.tile(data_span, (frame_number, 1))
        shifted = features - numpy.tile(self.min_vector, (frame_number, 1))
        norm_features = scale * shifted + numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))

        # Reinstate any columns that must stay unnormalised.
        m, n = numpy.shape(features)
        rows = list(range(m))
        for col in self.exclude_columns:
            norm_features[rows, [col] * m] = features[rows, [col] * m]

        io_funcs.array_to_binary_file(norm_features, out_name)
def denormalise_data(self, in_file_list, out_file_list):
    """Undo min-max normalisation, mapping the target range back onto the
    original per-dimension data range.
    """
    logger = logging.getLogger("acoustic_norm")
    file_number = len(in_file_list)
    logger.info('MinMaxNormalisation.denormalise_data for %d files' % file_number)

    # Per-dimension spans; degenerate (<= 0) data spans use unit spans so
    # those dimensions pass through with only the offsets applied.
    data_span = numpy.reshape(self.max_vector - self.min_vector, (1, self.feature_dimension))
    target_span = numpy.zeros((1, self.feature_dimension))
    target_span.fill(self.target_max_value - self.target_min_value)
    target_span[data_span <= 0.0] = 1.0
    data_span[data_span <= 0.0] = 1.0

    io_funcs = BinaryIOCollection()
    for in_name, out_name in zip(in_file_list, out_file_list):
        features = io_funcs.load_binary_file(in_name, self.feature_dimension)
        frame_number = features.size // self.feature_dimension

        # Inverse of normalise_data: divide out the target span, multiply
        # back the data span, and shift from target-min to data-min.
        scale = numpy.tile(data_span, (frame_number, 1)) / numpy.tile(target_span, (frame_number, 1))
        offset = features - numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))
        denorm_features = scale * offset + numpy.tile(self.min_vector, (frame_number, 1))
        io_funcs.array_to_binary_file(denorm_features, out_name)
def feature_denormalisation(self, in_file_list, out_file_list, mean_vector, std_vector):
    """Invert z-score normalisation (features * std + mean) for every file.

    Raises AssertionError when the file lists differ in length or the
    mean/std vectors do not match self.feature_dimension.
    """
    # BUG FIX: `logger` was referenced below but never defined, so the error
    # paths crashed with NameError instead of logging.
    logger = logging.getLogger('feature_denormalisation')

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)

    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical('The input and output file numbers are not the same! %d vs %d'
                        % (len(in_file_list), len(out_file_list)))
        raise

    try:
        assert mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
    except AssertionError:
        logger.critical('the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature')
        raise

    # BUG FIX: xrange is Python 2 only.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
        denorm_features = features * std_matrix + mean_matrix
        io_funcs.array_to_binary_file(denorm_features, out_file_list[i])
def merge_label(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    """Append a per-utterance feature vector to every frame of the labels.

    Each new feature is stored once per utterance with shape (1, feat_dim);
    it is tiled across all label frames and concatenated column-wise, then
    the merged matrix is written out.
    """
    utt_number = len(new_feat_file_list)
    if utt_number != len(binary_label_file_list):
        print(
            "the number of new feature input files and label files should be the same!\n"
        )
        sys.exit(1)

    io_funcs = BinaryIOCollection()
    for lab_name, feat_name, out_name in zip(binary_label_file_list, new_feat_file_list, out_feat_file_list):
        lab_features, lab_frames = io_funcs.load_binary_file_frame(lab_name, self.lab_dim)

        # Utterance-level (1, feat_dim) vector, broadcast to every frame.
        utt_vector = io_funcs.load_binary_file(feat_name, self.feat_dim)
        tiled_features = numpy.tile(utt_vector, (lab_frames, 1))

        merged = numpy.zeros((lab_frames, self.lab_dim + self.feat_dim))
        merged[0:lab_frames, 0:self.lab_dim] = lab_features
        merged[0:lab_frames, self.lab_dim:self.lab_dim + self.feat_dim] = tiled_features[0:lab_frames, ]
        io_funcs.array_to_binary_file(merged, out_name)
def produce_nn_cmp(self, in_file_list, out_file_list):
    """Build composite (cmp) acoustic files: mgc + lf0/delta/acc + vuv + bap.

    F0 is interpolated through unvoiced regions, then delta and acceleration
    trajectories are computed and concatenated with the voicing flag and the
    other streams into one binary file per utterance.
    """
    logger = logging.getLogger("acoustic_norm")

    delta_win = [-0.5, 0.0, 0.5]
    acc_win = [1.0, -2.0, 1.0]

    file_number = len(in_file_list)
    logger.info('starting creation of %d files' % file_number)

    # Hoisted out of the loop: no need to rebuild the IO helper per file.
    io_funcs = BinaryIOCollection()
    # BUG FIX: xrange does not exist in Python 3.
    for i in range(file_number):
        mgc_data, bap_data, lf0_data = self.load_cmp_file(in_file_list[i])
        ip_lf0, vuv_vector = self.interpolate_f0(lf0_data)

        delta_lf0 = self.compute_delta(ip_lf0, delta_win)
        acc_lf0 = self.compute_delta(ip_lf0, acc_win)

        cmp_data = numpy.concatenate((mgc_data, ip_lf0, delta_lf0, acc_lf0, vuv_vector, bap_data), axis=1)
        io_funcs.array_to_binary_file(cmp_data, out_file_list[i])

    logger.info('finished creation of %d binary files' % file_number)
def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
    """Generate acoustic features for every held-out utterance.

    Predictions are denormalised with *out_scaler* and written as binary
    files to the paths in gen_test_file_list; a progress bar tracks files.
    """
    io_funcs = BinaryIOCollection()
    test_file_number = len(gen_test_file_list)
    print("generating features on held-out test data...")

    for utt_index, gen_test_file_name in enumerate(gen_test_file_list):
        utt_id = os.path.splitext(os.path.basename(gen_test_file_name))[0]
        inputs = test_x[utt_id]
        num_of_rows = inputs.shape[0]

        # Shape the inputs to match how the network was trained.
        if stateful:
            inputs = data_utils.get_stateful_input(inputs, self.seq_length, self.batch_size)
        elif sequential_training:
            inputs = np.reshape(inputs, (1, num_of_rows, self.n_in))

        predictions = self.model.predict(inputs)
        if sequential_training:
            predictions = np.reshape(predictions, (num_of_rows, self.n_out))

        data_utils.denorm_data(predictions, out_scaler)
        io_funcs.array_to_binary_file(predictions, gen_test_file_name)
        data_utils.drawProgressBar(utt_index + 1, test_file_number)

    sys.stdout.write("\n")
def feature_normalisation(self, in_file_list, out_file_list):
    """Mean/variance normalise files, passing the last two feature columns
    through unnormalised (they hold speaker/one-hot codes that must keep
    their raw values).

    Returns the cached (mean_vector, std_vector).
    """
    logger = logging.getLogger('feature_normalisation')

    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical('The input and output file numbers are not the same! %d vs %d' % (len(in_file_list), len(out_file_list)))
        raise

    # Lazily estimate corpus statistics on first use.
    if self.mean_vector is None:
        self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
    if self.std_vector is None:
        self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

    io_funcs = BinaryIOCollection()
    for in_name, out_name in zip(in_file_list, out_file_list):
        features, frames = io_funcs.load_binary_file_frame(in_name, self.feature_dimension)
        standardised = (features - numpy.tile(self.mean_vector, (frames, 1))) / numpy.tile(self.std_vector, (frames, 1))
        print(frames, in_name)
        # Restore the final two columns to their raw values — they were
        # never meant to be standardised.
        standardised = numpy.concatenate([standardised[:, :self.feature_dimension - 2], features[:, self.feature_dimension - 2:]], axis=-1)
        print(' normalized vector :{}'.format(standardised[1, :]))
        io_funcs.array_to_binary_file(standardised, out_name)

    return self.mean_vector, self.std_vector
def extract_dur_features(self, orig_file, output_file):
    """Write the trailing five (duration) columns of *orig_file* to *output_file*."""
    io_funcs = BinaryIOCollection()
    full_matrix = io_funcs.file2matrix(orig_file)
    # Everything except the last five columns is label data.
    self.label_dimension = full_matrix.shape[1] - 5
    io_funcs.array_to_binary_file(full_matrix[:, -5:], output_file)
def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
    """Run the trained model over held-out utterances and save the outputs.

    Each prediction is denormalised via *out_scaler* and written to the
    matching path in gen_test_file_list.
    """
    io_funcs = BinaryIOCollection()
    test_file_number = len(gen_test_file_list)
    print("generating features on held-out test data...")

    for counter, out_path in enumerate(gen_test_file_list, start=1):
        utt_id = os.path.splitext(os.path.basename(out_path))[0]
        net_input = test_x[utt_id]
        rows = net_input.shape[0]

        # Reshape/restructure inputs according to the training regime.
        if stateful:
            net_input = data_utils.get_stateful_input(net_input, self.seq_length, self.batch_size)
        elif sequential_training:
            net_input = np.reshape(net_input, (1, rows, self.n_in))

        net_output = self.model.predict(net_input)
        if sequential_training:
            net_output = np.reshape(net_output, (rows, self.n_out))

        data_utils.denorm_data(net_output, out_scaler)
        io_funcs.array_to_binary_file(net_output, out_path)
        data_utils.drawProgressBar(counter, test_file_number)

    sys.stdout.write("\n")
def feature_normalisation(self, in_file_list, out_file_list):
    """Mean/variance (z-score) normalise every input file.

    The corpus mean/std vectors are computed on first use and cached on
    self; returns (mean_vector, std_vector).

    Raises AssertionError when the file lists differ in length.
    """
    logger = logging.getLogger('feature_normalisation')

    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical('The input and output file numbers are not the same! %d vs %d' % (len(in_file_list), len(out_file_list)))
        raise

    # BUG FIX: `== None` is an elementwise comparison on numpy arrays
    # (ambiguous truth value); identity tests are required here.
    if self.mean_vector is None:
        self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
    if self.std_vector is None:
        self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    # BUG FIX: xrange does not exist in Python 3.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])

    return self.mean_vector, self.std_vector
def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
    """Rewrite each input file so its frame count matches its reference file.

    Longer inputs are truncated; shorter inputs are zero-padded.  Files whose
    frame counts already match are left untouched.
    """
    logger = logging.getLogger("acoustic_comp")
    logger.info('making equal number of lines...')

    io_funcs = BinaryIOCollection()
    utt_number = len(in_file_list)
    # BUG FIX: xrange does not exist in Python 3.
    for i in range(utt_number):
        in_file_name = in_file_list[i]
        in_data_stream_name = in_file_name.split('.')[-1]
        in_feature_dim = in_dimension_dict[in_data_stream_name]
        in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

        ref_file_name = ref_file_list[i]
        ref_data_stream_name = ref_file_name.split('.')[-1]
        ref_feature_dim = in_dimension_dict[ref_data_stream_name]
        ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

        target_features = numpy.zeros((ref_frame_number, in_feature_dim))
        if in_frame_number == ref_frame_number:
            # Already aligned: nothing to rewrite.
            continue
        elif in_frame_number > ref_frame_number:
            target_features[0:ref_frame_number, ] = in_features[0:ref_frame_number, ]
        elif in_frame_number < ref_frame_number:
            target_features[0:in_frame_number, ] = in_features[0:in_frame_number, ]
        io_funcs.array_to_binary_file(target_features, in_file_name)

    logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' % (in_data_stream_name, ref_data_stream_name))
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict, out_dimension_dict, file_id_list):
    """Scale LSF variance over a range of global/local interpolation weights.

    For every utterance a version is produced for each gv_weight in
    {0.0, 0.1, ..., 1.0}; non-scaled streams are copied through unchanged.
    Returns the list of extended utterance ids.
    """
    all_streams = ['cmp', 'HNR', 'F0', 'LSF', 'Gain', 'LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # BUG FIX: `size / 3` yields a float in Python 3 and cannot be used
        # as a slice index below; use floor division.
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # Keep only the static part of the variance (drop delta/delta-delta).
        static_variances[feature_name] = var_values[:static_dimension_dict[feature_name], :]

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            local_weight = 1.0 - gv_weight
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                extended_uttname = uttname + '_gv' + str(gv_weight)
                # BUG FIX: print is a function in Python 3.
                print(extended_uttname)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std = numpy.std(speech, axis=0)
                    global_std = numpy.transpose((static_variances[stream]))
                    # Interpolate between the corpus-level and the utterance-level
                    # standard deviation, then rescale around the utterance mean.
                    weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                    std_ratio = weighted_global_std / utt_std
                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))
                    scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)
                else:
                    os.system('cp %s %s' % (infile, outfile))
            file_id_list_out.append(extended_uttname)
    return file_id_list_out
def simple_scale_variance(indir, outdir, var_file_dict, out_dimension_dict, file_id_list, gv_weight=1.0):
    """Simple variance scaling (Silen et al. 2012, paragraph 3.1).

    The mgc stream's standard deviation is interpolated between the global
    (corpus) and local (utterance) values with *gv_weight*; all other
    streams are copied through unchanged.

    TODO: stream names and the factor 3 (static + delta + delta-delta) are
    hard-coded here.
    """
    all_streams = ['cmp', 'mgc', 'lf0', 'bap']
    streams_to_scale = ['mgc']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # BUG FIX: `size / 3` yields a float in Python 3 and cannot be used
        # as a slice index below; use floor division.
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # Keep only the static part of the variance (drop delta/delta-delta).
        static_variances[feature_name] = var_values[:static_dimension_dict[feature_name], :]

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std = numpy.std(speech, axis=0)
                global_std = numpy.transpose((static_variances[stream]))
                # Interpolate the standard deviation, then rescale the stream
                # around the utterance mean.
                weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std
                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))
                scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)
            else:
                os.system('cp %s %s' % (infile, outfile))
def extract_label_features(self, orig_file, output_file):
    """Write all but the trailing five (duration) columns of *orig_file* to *output_file*."""
    io_funcs = BinaryIOCollection()
    full_matrix = io_funcs.file2matrix(orig_file)
    # The last five columns are per-state durations; the rest are labels.
    self.label_dimension = full_matrix.shape[1] - 5
    io_funcs.array_to_binary_file(full_matrix[:, :-5], output_file)
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict, out_dimension_dict, file_id_list):
    """Scale LSF variance over a range of global/local interpolation weights.

    For every utterance a version is produced for each gv_weight in
    {0.0, 0.1, ..., 1.0}; non-scaled streams are copied through unchanged.
    Returns the list of extended utterance ids.
    """
    all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name, size) in list(out_dimension_dict.items()):
        # BUG FIX: `size/3` yields a float in Python 3 and cannot be used as
        # a slice index below; use floor division.
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in list(var_file_dict.keys()):
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # Keep only the static part of the variance (drop delta/delta-delta).
        static_variances[feature_name] = var_values[:static_dimension_dict[feature_name], :]

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            local_weight = 1.0 - gv_weight
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                extended_uttname = uttname + '_gv' + str(gv_weight)
                print(extended_uttname)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std = numpy.std(speech, axis=0)
                    global_std = numpy.transpose((static_variances[stream]))
                    # Interpolate between corpus-level and utterance-level
                    # standard deviation, then rescale around the utterance mean.
                    weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                    std_ratio = weighted_global_std / utt_std
                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))
                    scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)
                else:
                    os.system('cp %s %s' % (infile, outfile))
            file_id_list_out.append(extended_uttname)
    return file_id_list_out
def extract_linguistic_features(self, in_file_name, out_file_name=None, label_type="state_align", dur_file_name=None):
    """Convert an alignment label file into a linguistic feature matrix.

    Returns the matrix when *out_file_name* is None; otherwise writes it as
    a binary file.  Exits on unsupported label types.
    """
    # BUG FIX: `logger` was referenced below but never defined in this scope.
    logger = logging.getLogger("labels")
    if label_type == "phone_align":
        A = self.load_labels_with_phone_alignment(in_file_name, dur_file_name)
    elif label_type == "state_align":
        A = self.load_labels_with_state_alignment(in_file_name)
    else:
        logger.critical("we don't support %s labels as of now!!" % (label_type))
        # BUG FIX: without exiting, execution fell through to an undefined
        # `A` and crashed with a NameError instead of a clear failure
        # (matches the sibling extract_dur_features behaviour).
        sys.exit(1)

    if out_file_name:
        io_funcs = BinaryIOCollection()
        io_funcs.array_to_binary_file(A, out_file_name)
    else:
        return A
def extract_linguistic_features(self, in_file_name, out_file_name=None, label_type="state_align", dur_file_name=None):
    """Convert an alignment label file into a linguistic feature matrix.

    Returns the matrix when *out_file_name* is None; otherwise writes it as
    a binary file.  Exits on unsupported label types.
    """
    # BUG FIX: `logger` was referenced below but never defined in this scope.
    logger = logging.getLogger("labels")
    if label_type == "phone_align":
        A = self.load_labels_with_phone_alignment(in_file_name, dur_file_name)
    elif label_type == "state_align":
        A = self.load_labels_with_state_alignment(in_file_name)
    else:
        logger.critical("we don't support %s labels as of now!!" % (label_type))
        # BUG FIX: without exiting, execution fell through to an undefined
        # `A` and crashed with a NameError instead of a clear failure
        # (matches the sibling extract_dur_features behaviour).
        sys.exit(1)

    if out_file_name:
        io_funcs = BinaryIOCollection()
        io_funcs.array_to_binary_file(A, out_file_name)
    else:
        return A
def simple_scale_variance(indir, outdir, var_file_dict, out_dimension_dict, file_id_list, gv_weight=1.0):
    """Simple variance scaling (Silen et al. 2012, paragraph 3.1).

    The mgc stream's standard deviation is interpolated between the global
    (corpus) and local (utterance) values with *gv_weight*; all other
    streams are copied through unchanged.

    TODO: stream names and the factor 3 (static + delta + delta-delta) are
    hard-coded here.
    """
    all_streams = ['cmp','mgc','lf0','bap']
    streams_to_scale = ['mgc']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # BUG FIX: `size/3` yields a float in Python 3 and cannot be used as
        # a slice index below; use floor division.
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # Keep only the static part of the variance (drop delta/delta-delta).
        static_variances[feature_name] = var_values[:static_dimension_dict[feature_name], :]

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std = numpy.std(speech, axis=0)
                global_std = numpy.transpose((static_variances[stream]))
                # Interpolate the standard deviation, then rescale the stream
                # around the utterance mean.
                weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std
                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))
                scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)
            else:
                os.system('cp %s %s' % (infile, outfile))
def duration_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict, meta=None):
    """Round generated durations to integer frame counts and write one
    duration file per utterance, next to the corresponding input file.

    Only a single duration feature is supported; exits otherwise.
    """
    logger = logging.getLogger('param_generation')
    logger.debug('duration_decomposition for %d files' % len(in_file_list))

    state_number = 5  ## hard coding, try removing in future?

    if len(list(out_dimension_dict.keys())) > 1:
        logger.critical("we don't support any additional features along with duration as of now.")
        sys.exit(1)
    feature_name = list(out_dimension_dict.keys())[0]

    io_funcs = BinaryIOCollection()
    flen = len(in_file_list)
    for findex, file_name in enumerate(in_file_list, start=1):
        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]

        features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)

        # Round to whole frames; every state must last at least one frame.
        gen_features = numpy.int32(numpy.round(features))
        gen_features[gen_features < 1] = 1

        if dimension > state_number:
            gen_features = gen_features[:, state_number]

        logger.info('processing %4d of %4d: %s' % (findex, flen, file_name))

        if meta is not None:
            gen_features = self.hardcode_duration(meta, gen_features)

        new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])
        io_funcs.array_to_binary_file(gen_features, new_file_name)
        logger.debug('wrote to file %s' % new_file_name)
def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
    """Restore the TensorFlow graph from its checkpoint and generate
    denormalised acoustic features for every held-out utterance.

    Feed-forward and sequential (optionally hybrid tanh+recurrent) network
    layouts are supported; dropout/batch-norm placeholders are fed only when
    the corresponding collections exist in the graph.
    """
    io_funcs = BinaryIOCollection()
    # BUG FIX: dict.keys() returns a view in Python 3 and has no .sort().
    test_id_list = sorted(test_x.keys())
    test_file_number = len(test_id_list)
    print("generating features on held-out test data...")
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(os.path.join(self.ckpt_dir, "mymodel.ckpt.meta"))
        # BUG FIX: Python 2 print statements converted to print() calls.
        print("loading the model parameters...")
        output_layer = tf.get_collection("output_layer")[0]
        input_layer = tf.get_collection("input_layer")[0]
        new_saver.restore(sess, os.path.join(self.ckpt_dir, "mymodel.ckpt"))
        print("The model parameters are successfully restored")
        # BUG FIX: xrange does not exist in Python 3.
        for utt_index in range(test_file_number):
            gen_test_file_name = gen_test_file_list[utt_index]
            temp_test_x = test_x[test_id_list[utt_index]]
            num_of_rows = temp_test_x.shape[0]
            if not sequential_training:
                is_training_batch = tf.get_collection("is_training_batch")[0]
                if self.dropout_rate != 0.0:
                    is_training_drop = tf.get_collection("is_training_drop")[0]
                    y_predict = sess.run(output_layer, feed_dict={input_layer: temp_test_x, is_training_drop: False, is_training_batch: False})
                else:
                    y_predict = sess.run(output_layer, feed_dict={input_layer: temp_test_x, is_training_batch: False})
            else:
                # Sequential nets take a (1, T, n_in) batch plus the true length.
                temp_test_x = np.reshape(temp_test_x, [1, num_of_rows, self.n_in])
                hybrid = 0
                utt_length_placeholder = tf.get_collection("utt_length")[0]
                if "tanh" in self.hidden_layer_type:
                    hybrid = 1
                    is_training_batch = tf.get_collection("is_training_batch")[0]
                if self.dropout_rate != 0.0:
                    is_training_drop = tf.get_collection("is_training_drop")[0]
                    if hybrid:
                        y_predict = sess.run(output_layer, feed_dict={input_layer: temp_test_x, utt_length_placeholder: [num_of_rows], is_training_drop: False, is_training_batch: False})
                    else:
                        y_predict = sess.run(output_layer, feed_dict={input_layer: temp_test_x, utt_length_placeholder: [num_of_rows], is_training_drop: False})
                elif hybrid:
                    y_predict = sess.run(output_layer, feed_dict={input_layer: temp_test_x, utt_length_placeholder: [num_of_rows], is_training_batch: False})
                else:
                    y_predict = sess.run(output_layer, feed_dict={input_layer: temp_test_x, utt_length_placeholder: [num_of_rows]})
            data_utils.denorm_data(y_predict, out_scaler)
            io_funcs.array_to_binary_file(y_predict, gen_test_file_name)
            data_utils.drawProgressBar(utt_index + 1, test_file_number)
def predict(self, test_x, out_scaler, gen_test_file_list):
    """Run the seq2seq TensorFlow graph over held-out data and save the
    denormalised parameter trajectories, one binary file per utterance.

    Decoding is autoregressive: at each step the outputs generated so far
    are fed back as the decoder targets.
    """
    io_funcs = BinaryIOCollection()
    # BUG FIX: dict.keys() has no .sort() in Python 3.
    test_id_list = sorted(test_x.keys())
    inference_batch_size = len(test_id_list)
    test_file_number = len(test_id_list)

    with tf.Session(graph=self.graph) as sess:
        # BUG FIX: import_meta_graph takes a single path; the directory and
        # the file name were previously passed as two separate arguments.
        new_saver = tf.train.import_meta_graph(os.path.join(self.ckpt_dir, "mymodel.ckpt.meta"))
        inputs_data = self.graph.get_collection("inputs_data")[0]
        # BUG FIX: these two tensors were referenced in sess.run below but
        # never fetched from the graph (NameError).
        targets = self.graph.get_collection("targets")[0]
        decoder_outputs = self.graph.get_collection("decoder_outputs")[0]
        inputs_sequence_length = self.graph.get_collection("inputs_sequence_length")[0]
        target_sequence_length = self.graph.get_collection("target_sequence_length")[0]
        print("loading the model parameters...")
        new_saver.restore(sess, os.path.join(self.ckpt_dir, "mymodel.ckpt"))
        print("Model parameters are successfully restored")
        print("generating features on held-out test data...")
        # BUG FIX: max_step was undefined; the longest utterance bounds the
        # decoder output buffer.
        max_step = max(len(utt) for utt in test_x.values())
        for utt_index in range(test_file_number):
            gen_test_file_name = gen_test_file_list[utt_index]
            temp_test_x = test_x[test_id_list[utt_index]]
            num_of_rows = temp_test_x.shape[0]
            # BUG FIX: feed_dict values must be array-like, not graph
            # tensors, so reshape with numpy rather than tf.reshape.
            temp_test_x = np.reshape(temp_test_x, [1, num_of_rows, self.n_in])
            outputs = np.zeros(shape=[len(test_x), max_step, self.n_out], dtype=np.float32)
            print("Generating speech parameters ...")
            # Autoregressive loop: feed back the outputs generated so far.
            for t in range(num_of_rows):
                _outputs = sess.run(decoder_outputs,
                                    feed_dict={inputs_data: temp_test_x, targets: outputs,
                                               inputs_sequence_length: [num_of_rows],
                                               target_sequence_length: [num_of_rows]})
                outputs[:, t, :] = _outputs[:, t, :]
            data_utils.denorm_data(outputs, out_scaler)
            io_funcs.array_to_binary_file(outputs, gen_test_file_name)
            data_utils.drawProgressBar(utt_index + 1, test_file_number)
def compute_norm_stats(data, stats_file, method="MVN"):
    """Fit a normalisation scaler on the training data and persist its stats.

    data       : 2-D matrix (frames x dims) of training features
    stats_file : binary file the (2 x dims) stats matrix is written to
    method     : "MVN" (mean/variance) or "MINMAX" (0.01-0.99 range)
    Returns the fitted sklearn scaler.
    """
    io_funcs = BinaryIOCollection()
    if method == "MVN":
        scaler = preprocessing.StandardScaler().fit(data)
        norm_matrix = np.vstack((scaler.mean_, scaler.scale_))
    elif method == "MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99)).fit(data)
        norm_matrix = np.vstack((scaler.min_, scaler.scale_))
    else:
        # previously fell through with norm_matrix undefined (NameError)
        raise ValueError("unsupported normalisation method: %s" % method)
    # Python 3 print function (was a py2 print statement)
    print(norm_matrix.shape)
    io_funcs.array_to_binary_file(norm_matrix, stats_file)
    return scaler
def extract_dur_features(self, in_file_name, out_file_name=None, label_type="state_align", feature_type=None, unit_size=None, feat_size=None):
    """Extract duration features from an alignment label file.

    Writes the feature matrix to out_file_name when given; otherwise
    returns it. Exits the process for unsupported label types.
    """
    logger = logging.getLogger("dur")

    # guard first: anything other than the two known label types is fatal
    if label_type not in ("phone_align", "state_align"):
        logger.critical("we don't support %s labels as of now!!" % (label_type))
        sys.exit(1)

    if label_type == "phone_align":
        dur_features = self.extract_dur_from_phone_alignment_labels(in_file_name, feature_type, unit_size, feat_size)
    else:
        dur_features = self.extract_dur_from_state_alignment_labels(in_file_name, feature_type, unit_size, feat_size)

    if not out_file_name:
        return dur_features
    io_funcs = BinaryIOCollection()
    io_funcs.array_to_binary_file(dur_features, out_file_name)
def compute_norm_stats(data, stats_file, method="MVN"):
    """Fit a scaler ("MVN" or "MINMAX") on the training data, dump its
    statistics to stats_file as a (2 x dims) binary matrix, and return it."""
    io_funcs = BinaryIOCollection()
    if method == "MVN":
        scaler = preprocessing.StandardScaler().fit(data)
        norm_matrix = np.vstack((scaler.mean_, scaler.scale_))
    elif method == "MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99)).fit(data)
        norm_matrix = np.vstack((scaler.min_, scaler.scale_))
    print(norm_matrix.shape)
    io_funcs.array_to_binary_file(norm_matrix, stats_file)
    return scaler
def extract_dur_features(self, in_file_name, out_file_name=None, label_type="state_align", feature_type=None, unit_size=None, feat_size=None):
    """Pull duration features out of an alignment label file.

    The matrix is written to out_file_name if provided, otherwise returned.
    Unsupported label types terminate the process.
    """
    logger = logging.getLogger("dur")

    if label_type == "state_align":
        A = self.extract_dur_from_state_alignment_labels(in_file_name, feature_type, unit_size, feat_size)
    elif label_type == "phone_align":
        A = self.extract_dur_from_phone_alignment_labels(in_file_name, feature_type, unit_size, feat_size)
    else:
        logger.critical("we don't support %s labels as of now!!" % (label_type))
        sys.exit(1)

    if out_file_name:
        BinaryIOCollection().array_to_binary_file(A, out_file_name)
    else:
        return A
def normalise_data(self, in_file_list, out_file_list):
    """Min-max normalise each file into [target_min_value, target_max_value].

    Features are first clipped to the stored [min_vector, max_vector] range;
    columns listed in self.exclude_columns are passed through unnormalised.
    """
    file_number = len(in_file_list)

    fea_max_min_diff = self.max_vector - self.min_vector
    diff_value = self.target_max_value - self.target_min_value
    fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

    target_max_min_diff = numpy.zeros((1, self.feature_dimension))
    target_max_min_diff.fill(diff_value)

    # guard constant (zero-range) dimensions against division by zero
    target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
    fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

    io_funcs = BinaryIOCollection()
    for i in range(file_number):  # range: py3 (was xrange)
        features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

        # integer frame count (plain / yields a float under Python 3)
        frame_number = features.size // self.feature_dimension

        fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
        fea_max_matrix = numpy.tile(self.max_vector, (frame_number, 1))

        # clip to the training range in one vectorised call
        # (replaces the per-element double Python loop)
        features = numpy.clip(features, fea_min_matrix, fea_max_matrix)

        target_min_matrix = numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))
        fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
        diff_norm_matrix = numpy.tile(target_max_min_diff, (frame_number, 1)) / fea_diff_matrix

        norm_features = diff_norm_matrix * (features - fea_min_matrix) + target_min_matrix

        ## If we are to keep some columns unnormalised, reinstate the
        ## (clipped) original values -- simple column assignment replaces
        ## the previous advanced-indexing construction
        for col in self.exclude_columns:
            norm_features[:, col] = features[:, col]

        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def normal_standardization(self, in_file_list, out_file_list):
    """Z-score each file: subtract the corpus mean, divide by the std."""
    mean_vector = self.compute_mean(in_file_list)
    std_vector = self.compute_std(in_file_list, mean_vector)

    io_funcs = BinaryIOCollection()
    for in_name, out_name in zip(in_file_list, out_file_list):
        features = io_funcs.load_binary_file(in_name, self.feature_dimension)
        frame_count = features.size // self.feature_dimension
        mean_matrix = numpy.tile(mean_vector, (frame_count, 1))
        std_matrix = numpy.tile(std_vector, (frame_count, 1))
        io_funcs.array_to_binary_file((features - mean_matrix) / std_matrix, out_name)
def normal_standardization(self, in_file_list, out_file_list):
    """Z-score normalise every input file and write the result.

    The mean/std vectors are computed over the whole file list.
    """
    mean_vector = self.compute_mean(in_file_list)
    std_vector = self.compute_std(in_file_list, mean_vector)

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    for i in range(file_number):
        features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)
        current_frame_number = features.size // self.feature_dimension
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
        # plain true division replaces past.utils.old_div: identical for
        # float arrays, drops the python-future dependency, and matches
        # the sibling implementation of this method in this file
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def remove_silence(self, in_data_list, in_align_list, out_data_list, dur_file_list=None):
    """Strip silence frames from each data file using its alignment file."""
    file_number = len(in_data_list)
    align_file_number = len(in_align_list)

    # py3 print function replaces the py2 print statements below
    if file_number != align_file_number:
        print("The number of input and output files does not equal!\n")
        sys.exit(1)
    if file_number != len(out_data_list):
        print("The number of input and output files does not equal!\n")
        sys.exit(1)

    io_funcs = BinaryIOCollection()

    for i in range(file_number):  # range: py3 (was xrange)
        if self.label_type == "phone_align":
            dur_file_name = dur_file_list[i] if dur_file_list else None
            nonsilence_indices = self.load_phone_alignment(in_align_list[i], dur_file_name)
        else:
            nonsilence_indices = self.load_alignment(in_align_list[i])

        ori_cmp_data = io_funcs.load_binary_file(in_data_list[i], self.n_cmp)

        # // keeps frame_number an int under py3 (plain / yields a float)
        frame_number = ori_cmp_data.size // self.n_cmp

        if len(nonsilence_indices) == frame_number:
            print('WARNING: no silence found!')
            # previsouly: continue -- in fact we should keep non-silent data!

        ## if labels have a few extra frames than audio, this can break the indexing, remove them:
        nonsilence_indices = [ix for ix in nonsilence_indices if ix < frame_number]

        new_cmp_data = ori_cmp_data[nonsilence_indices, ]
        io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
def merge_data(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    ''' merging new features with normalised label features

    Raises ValueError when labels are more than 5 frames longer than the
    new features; otherwise pads/truncates to the label frame count.
    '''
    utt_number = len(new_feat_file_list)
    if utt_number != len(binary_label_file_list):
        print("the number of new feature input files and label files should be the same!\n")
        sys.exit(1)

    new_feat_ext = new_feat_file_list[0].split('/')[-1].split('.')[1]

    io_funcs = BinaryIOCollection()
    for i in range(utt_number):
        lab_file_name = binary_label_file_list[i]
        new_feat_file_name = new_feat_file_list[i]
        out_feat_file_name = out_feat_file_list[i]

        lab_features, lab_frame_number = io_funcs.load_binary_file_frame(lab_file_name, self.lab_dim)
        new_features, feat_frame_number = io_funcs.load_binary_file_frame(new_feat_file_name, self.feat_dim)

        if (lab_frame_number - feat_frame_number) > 5:
            base_file_name = new_feat_file_list[i].split('/')[-1].split('.')[0]
            self.logger.critical(
                "the number of frames in label and new features are different: %d vs %d (%s)"
                % (lab_frame_number, feat_frame_number, base_file_name))
            # raise a concrete exception: a bare `raise` outside an except
            # block is just a RuntimeError about no active exception
            raise ValueError("frame number mismatch for %s" % base_file_name)

        # copy only as many feature frames as both sources actually have;
        # the original sliced with mismatched lengths and could fail when
        # the new features were longer than the labels
        common_frames = min(lab_frame_number, feat_frame_number)

        merged_features = numpy.zeros((lab_frame_number, self.lab_dim + self.feat_dim))
        merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
        merged_features[0:common_frames, self.lab_dim:self.lab_dim + self.feat_dim] = new_features[0:common_frames, ]

        io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
        self.logger.debug('merged new feature %s of %d frames with %d label features'
                          % (new_feat_ext, feat_frame_number, lab_frame_number))
def predict(self, test_x, out_scaler, gen_test_file_list):
    """Generate acoustic features for held-out test data, one file per utterance.

    test_x             : dict mapping utterance id -> 2-D input matrix (frames x n_in)
    out_scaler         : scaler used by data_utils.denorm_data to de-normalise output
    gen_test_file_list : output binary file path per utterance (in sorted-id order)
    """
    io_funcs = BinaryIOCollection()
    # dict.keys() has no .sort() under Python 3; use sorted()
    test_id_list = sorted(test_x.keys())
    test_file_number = len(test_id_list)
    with tf.Session(graph=self.graph) as sess:
        # import_meta_graph takes the meta file path, not (dir, name)
        new_saver = tf.train.import_meta_graph(os.path.join(self.ckpt_dir, "mymodel.ckpt.meta"))
        # recover the graph endpoints stored in collections at training time;
        # `targets` and `decoder_outputs` were referenced but never bound before
        targets = self.graph.get_collection("targets")[0]
        inputs_data = self.graph.get_collection("inputs_data")[0]
        decoder_outputs = self.graph.get_collection("decoder_outputs")[0]
        inputs_sequence_length = self.graph.get_collection("inputs_sequence_length")[0]
        target_sequence_length = self.graph.get_collection("target_sequence_length")[0]
        print("loading the model parameters...")
        new_saver.restore(sess, os.path.join(self.ckpt_dir, "mymodel.ckpt"))
        print("Model parameters are successfully restored")
        print("generating features on held-out test data...")
        for utt_index in range(test_file_number):  # range: py3 (was xrange)
            gen_test_file_name = gen_test_file_list[utt_index]
            temp_test_x = test_x[test_id_list[utt_index]]
            num_of_rows = temp_test_x.shape[0]
            # feed_dict values must be numpy arrays, not graph tensors
            temp_test_x = np.reshape(temp_test_x, [1, num_of_rows, self.n_in])
            # autoregressive output buffer: one utterance (batch 1), filled frame
            # by frame (`max_step` was previously undefined here)
            outputs = np.zeros(shape=[1, num_of_rows, self.n_out], dtype=np.float32)
            print("Generating speech parameters ...")
            for t in range(num_of_rows):
                _outputs = sess.run(decoder_outputs,
                                    feed_dict={inputs_data: temp_test_x,
                                               targets: outputs,
                                               inputs_sequence_length: [num_of_rows],
                                               target_sequence_length: [num_of_rows]})
                outputs[:, t, :] = _outputs[:, t, :]
            data_utils.denorm_data(outputs, out_scaler)
            io_funcs.array_to_binary_file(outputs, gen_test_file_name)
            data_utils.drawProgressBar(utt_index + 1, test_file_number)
def denormalise_data(self, in_file_list, out_file_list):
    """Map features back from the normalised target range to the original
    [min_vector, max_vector] range (inverse of normalise_data)."""
    logger = logging.getLogger("acoustic_norm")

    file_number = len(in_file_list)
    logger.info('MinMaxNormalisation.denormalise_data for %d files' % file_number)

    fea_max_min_diff = self.max_vector - self.min_vector
    diff_value = self.target_max_value - self.target_min_value
    fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

    target_max_min_diff = numpy.zeros((1, self.feature_dimension))
    target_max_min_diff.fill(diff_value)

    # guard constant (zero-range) dimensions against division by zero
    target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
    fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

    io_funcs = BinaryIOCollection()
    for i in range(file_number):
        features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)
        frame_number = features.size // self.feature_dimension
        fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
        target_min_matrix = numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))

        fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
        # plain true division replaces past.utils.old_div: identical for
        # float arrays and drops the python-future dependency
        diff_norm_matrix = fea_diff_matrix / numpy.tile(target_max_min_diff, (frame_number, 1))

        norm_features = diff_norm_matrix * (features - target_min_matrix) + fea_min_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def duration_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict):
    """Round generated duration features to integer frame counts and write
    one output file per input, named by the duration stream's extension."""
    logger = logging.getLogger('param_generation')
    logger.debug('duration_decomposition for %d files' % len(in_file_list))

    state_number = 5  ## hard coding, try removing in future?

    stream_names = list(out_dimension_dict.keys())
    if len(stream_names) > 1:
        logger.critical("we don't support any additional features along with duration as of now.")
        sys.exit(1)
    else:
        feature_name = stream_names[0]

    io_funcs = BinaryIOCollection()

    total_files = len(in_file_list)
    for findex, file_name in enumerate(in_file_list, start=1):
        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]

        features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)

        # round to whole frames and enforce a minimum duration of one frame
        gen_features = numpy.int32(numpy.round(features))
        gen_features[gen_features < 1] = 1

        if dimension > state_number:
            gen_features = gen_features[:, state_number]

        logger.info('processing %4d of %4d: %s' % (findex, total_files, file_name))

        new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])
        io_funcs.array_to_binary_file(gen_features, new_file_name)
        logger.debug('wrote to file %s' % new_file_name)
def normal_standardization(self, in_file_list, out_file_list, feature_dimension):
    """Z-score normalise each file and return the (mean, std) vectors used.

    feature_dimension is stored on the instance for the helper computations.
    """
    # self.dimension_dict = dimension_dict
    self.feature_dimension = feature_dimension
    mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
    std_vector = self.compute_std(in_file_list, mean_vector, 0, feature_dimension)

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)

    # range replaces py2 xrange (matching the other version of this method
    # elsewhere in the file)
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
    return mean_vector, std_vector
def compute_norm_stats(data, stats_file, method="MVN", no_scaling_ind=()):
    """Fit a scaler on the training data and persist its statistics.

    no_scaling_ind : feature indices that should pass through unscaled
                     (honoured for MVN only: mean forced to 0, scale to 1)
    Writes a (2 x dims) binary stats matrix plus a human-readable .csv copy,
    and returns the fitted sklearn scaler.
    """
    io_funcs = BinaryIOCollection()
    if method == "MVN":
        scaler = preprocessing.StandardScaler(copy=False).fit(data)
        if no_scaling_ind:
            scaler.mean_[no_scaling_ind] = 0
            scaler.scale_[no_scaling_ind] = 1
        norm_matrix = np.vstack((scaler.mean_, scaler.scale_))
    elif method == "MINMAX":
        # TODO: this seems strange, if this is used, check the documentation:
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
        scaler = preprocessing.MinMaxScaler(copy=False, feature_range=(0.01, 0.99)).fit(data)
        norm_matrix = np.vstack((scaler.min_, scaler.scale_))
    else:
        # previously fell through and hit a NameError on norm_matrix below
        raise ValueError("unsupported normalisation method: %s" % method)
    print(norm_matrix.shape)
    io_funcs.array_to_binary_file(norm_matrix, stats_file)
    # TODO: Why don't we make this a text file or pickle? Here it is
    np.savetxt(stats_file + ".csv", norm_matrix, delimiter=",", fmt='%.2f', newline='\n')
    return scaler
def remove_silence(self, in_data_list, in_align_list, out_data_list, dur_file_list=None):
    """Strip silence frames from each data file using its alignment file."""
    file_number = len(in_data_list)
    align_file_number = len(in_align_list)

    # py3 print function replaces the py2 print statements below
    if file_number != align_file_number:
        print("The number of input and output files does not equal!\n")
        sys.exit(1)
    if file_number != len(out_data_list):
        print("The number of input and output files does not equal!\n")
        sys.exit(1)

    io_funcs = BinaryIOCollection()

    for i in range(file_number):  # range: py3 (was xrange)
        if self.label_type == "phone_align":
            if dur_file_list:
                dur_file_name = dur_file_list[i]
            else:
                dur_file_name = None
            nonsilence_indices = self.load_phone_alignment(in_align_list[i], dur_file_name)
        else:
            nonsilence_indices = self.load_alignment(in_align_list[i])

        ori_cmp_data = io_funcs.load_binary_file(in_data_list[i], self.n_cmp)

        # // keeps frame_number an int under py3 (plain / yields a float)
        frame_number = ori_cmp_data.size // self.n_cmp

        if len(nonsilence_indices) == frame_number:
            print('WARNING: no silence found!')
            # previsouly: continue -- in fact we should keep non-silent data!

        ## if labels have a few extra frames than audio, this can break the indexing, remove them:
        nonsilence_indices = [ix for ix in nonsilence_indices if ix < frame_number]

        new_cmp_data = ori_cmp_data[nonsilence_indices, ]
        io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
def feature_denormalisation(self, in_file_list, out_file_list, mean_vector, std_vector):
    """Invert z-score normalisation file by file: features * std + mean."""
    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        # NOTE(review): `logger` is not defined inside this function --
        # presumably a module-level logger exists; verify, or switch to
        # logging.getLogger / self.logger
        logger.critical('The input and output file numbers are not the same! %d vs %d'
                        % (len(in_file_list), len(out_file_list)))
        raise

    try:
        assert mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
    except AssertionError:
        logger.critical('the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature')
        raise

    for i in range(file_number):  # range replaces py2 xrange
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
        norm_features = features * std_matrix + mean_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def generate_wav(gen_dir, file_id_list, cfg):
    """Render waveforms from generated acoustic parameters (mgc/bap/lf0)
    with the STRAIGHT or WORLD vocoder, driven by SPTK command-line tools.

    gen_dir      : directory holding the generated parameter files
    file_id_list : utterance base names to synthesise
    cfg          : configuration object (tool paths, feature dims, vocoder options)
    """
    logger = logging.getLogger("wav_generation")

    SPTK = cfg.SPTK
    # NND = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    # NOTE(review): `basestring` is Python 2 only -- this function predates the
    # py3 port visible elsewhere in this file
    if isinstance(cfg.fw_alpha, basestring):
        if cfg.fw_alpha=='Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha=='ERB':
            # NOTE(review): same helper as the 'Bark' branch -- looks like it
            # should be an ERB-specific alpha; confirm
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError('cfg.fw_alpha='+cfg.fw_alpha+' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    # optionally load global-variance statistics used for spectral enhancement
    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()
        logger.info('loading global variance stats from %s' % (cfg.GV_dir))
        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')
        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter=1
    max_counter = len(file_id_list)

    for filename in file_id_list:
        logger.info('creating waveform for %4d of %4d: %s' % (counter,max_counter,filename) )
        counter=counter+1
        base = filename
        # per-utterance parameter/output file paths
        files = {'sp' : os.path.join(gen_dir, base + cfg.sp_ext),
                 'mgc' : os.path.join(gen_dir, base + cfg.mgc_ext),
                 'f0' : os.path.join(gen_dir, base + '.f0'),
                 'lf0' : os.path.join(gen_dir, base + cfg.lf0_ext),
                 'ap' : os.path.join(gen_dir, base + '.ap'),
                 'bap' : os.path.join(gen_dir, base + cfg.bap_ext),
                 'wav' : os.path.join(gen_dir, base + '.wav')}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        # formant-enhancement post-filter, implemented as a chain of SPTK
        # tools; intermediate files carry *_mgctemp* suffixes
        if cfg.do_post_filtering:
            # build the per-coefficient weight vector: 1 1 pf pf ... pf
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "

            run_process('{line} >{weighttxt}'
                        .format(line=line, weighttxt=os.path.join(gen_dir, 'weight.txt')))
            run_process('{x2x} +af < {weighttxt} > {weight}'
                        .format(x2x=SPTK['X2X'], weighttxt=os.path.join(gen_dir, 'weight.txt'), weight=os.path.join(gen_dir, 'weight.bin')))

            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} > {temp1}'
                        .format(freqt=SPTK['FREQT'], order=cfg.mgc_dim-1, fw=fw_coef, co=co_coef, mgc=files['mgc'], temp1=files['mgc']+'_r0temp1'))
            run_process('{c2acr} -m {co} -M 0 -l {fl} <{temp1} > {base_r0}'
                        .format(co=co_coef, c2acr=SPTK['C2ACR'], fl=fl_coef, base_r0=files['mgc']+'_r0', temp1=files['mgc']+'_r0temp1'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp2}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight.bin'), temp2=files['mgc']+'_mgctemp2'))
            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {temp2} > {temp3}'
                        .format(order=cfg.mgc_dim-1, freqt=SPTK['FREQT'], fw=fw_coef, co=co_coef, temp2=files['mgc'] + '_mgctemp2', temp3=files['mgc']+'_mgctemp3'))
            run_process('{c2acr} -m {co} -M 0 -l {fl} < {temp3} > {base_p_r0}'
                        .format(temp3=files['mgc']+'_mgctemp3', co=co_coef, c2acr=SPTK['C2ACR'], fl=fl_coef, base_p_r0=files['mgc']+'_p_r0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp4}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight.bin'), temp4=files['mgc'] + '_mgctemp4'))
            run_process('{mc2b} -m {order} -a {fw} < {temp4} > {temp5}'
                        .format(order=cfg.mgc_dim-1, mc2b=SPTK['MC2B'], fw=fw_coef, temp4=files['mgc'] + '_mgctemp4', temp5=files['mgc'] + '_mgctemp5'))
            run_process('{bcp} -n {order} -s 0 -e 0 < {temp5} > {base_b0}'
                        .format(order=cfg.mgc_dim-1, bcp=SPTK['BCP'], base_b0=files['mgc']+'_b0', temp5=files['mgc'] + '_mgctemp5'))

            run_process('{vopr} -d < {base_r0} {base_p_r0} > {temp6}'
                        .format(vopr=SPTK['VOPR'], base_r0=files['mgc']+'_r0', base_p_r0=files['mgc']+'_p_r0', temp6=files['mgc']+'_mgctemp6'))
            run_process('{sopr} -LN -d 2 < {temp6} > {temp7}'
                        .format(sopr=SPTK['SOPR'], temp6=files['mgc'] + '_mgctemp6', temp7=files['mgc'] + '_mgctemp7'))
            run_process('{vopr} -a {base_b0} < {temp7} > {base_p_b0}'
                        .format(vopr=SPTK['VOPR'], temp7=files['mgc'] + '_mgctemp7', base_b0=files['mgc']+'_b0', base_p_b0=files['mgc']+'_p_b0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp8}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight.bin'), temp8=files['mgc'] + '_mgctemp8'))
            run_process('{mc2b} -m {order} -a {fw} < {temp8} > {temp9}'
                        .format(order=cfg.mgc_dim-1, mc2b=SPTK['MC2B'], fw=fw_coef, temp8=files['mgc'] + '_mgctemp8', temp9=files['mgc'] + '_mgctemp9'))
            run_process('{bcp} -n {order} -s 1 -e {order} < {temp9} > {temp10}'
                        .format(order=cfg.mgc_dim-1, bcp=SPTK['BCP'], temp9=files['mgc'] + '_mgctemp9', temp10=files['mgc'] + '_mgctemp10'))
            run_process('{merge} -n {order2} -s 0 -N 0 {base_p_b0} < {temp10} > {temp11}'
                        .format(merge=SPTK['MERGE'], order2=cfg.mgc_dim-2, base_p_b0=files['mgc']+'_p_b0', temp10=files['mgc'] + '_mgctemp10', temp11=files['mgc'] + '_mgctemp11'))
            run_process('{b2mc} -m {order} -a {fw} < {temp11} > {base_p_mgc}'
                        .format(order=cfg.mgc_dim-1, fw=fw_coef, b2mc=SPTK['B2MC'], base_p_mgc=files['mgc']+'_p_mgc', temp11=files['mgc'] + '_mgctemp11'))

            # synthesis below reads the post-filtered spectrum from here on
            mgc_file_name = files['mgc']+'_p_mgc'

        # global-variance enhancement of the generated spectrum (STRAIGHT only)
        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)
            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))
            local_gv = (ref_gv_std/gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean;
            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(gen_std, frame_number, 1).T * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T) + np.repeat(gen_mu, frame_number, 1).T;
            new_mgc_file_name = files['mgc']+'_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)
            mgc_file_name = files['mgc']+'_p_mgc'

        # the two enhancement paths write to the same *_p_mgc file, so they
        # are mutually exclusive
        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Both smoothing techniques together can\'t be applied!!\n' )
            raise

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            # lf0 -> linear f0, with the -1e10 unvoiced magic value mapped to 0
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))
            if cfg.use_cep_ap:
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim-1, fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'
                            .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))
            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 0.0 -cornf 400 -float -apfile {ap} {f0a} {sp} {wav}'
                        .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift, ap=files['ap'], f0a=files['f0']+'.a', sp=files['sp'], wav=files['wav']))
            # clean up intermediate synthesis inputs
            run_process('rm -f {sp} {f0} {f0a} {ap}'
                        .format(sp=files['sp'],f0=files['f0'],f0a=files['f0']+'.a',ap=files['ap']))
        elif cfg.vocoder_type == 'WORLD':
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {temp12}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], temp12=files['f0'] + '_temp12'))
            run_process('{x2x} +fd < {temp12} > {f0}'.format(x2x=SPTK['X2X'], f0=files['f0'], temp12=files['f0'] + '_temp12'))
            run_process('{sopr} -c 0 {bap} > {temp13}'.format(sopr=SPTK['SOPR'],bap=files['bap'],temp13=files['ap'] + '_temp13'))
            run_process('{x2x} +fd < {temp13} > {ap}'.format(x2x=SPTK['X2X'],ap=files['ap'], temp13=files['ap'] + '_temp13'))
            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {temp14}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, temp14=files['sp'] + '_temp14'))
            run_process('{sopr} -d 32768.0 -P < {temp14} > {temp15}'
                        .format(sopr=SPTK['SOPR'], temp14=files['sp'] + '_temp14', temp15=files['sp'] + '_temp15'))
            run_process('{x2x} +fd < {temp15} > {sp}'
                        .format(x2x=SPTK['X2X'], sp=files['sp'], temp15=files['sp'] + '_temp15'))
            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'
                        .format(synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr, f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))
            #run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],sp=files['sp'],f0=files['f0']))
        else:
            logger.critical('The vocoder %s is not supported yet!\n' % cfg.vocoder_type )
            raise

        os.chdir(cur_dir)
def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict, out_dimension_dict):
    """Compose per-utterance acoustic feature streams into one composite file.

    in_file_list_dict  : dict stream name -> list of per-utterance input files
    out_file_list      : list of composite output files (one per utterance)
    in_dimension_dict  : dict stream name -> input feature dimension
    out_dimension_dict : dict stream name -> output dimension (statics + dynamics)

    For each utterance, every stream is loaded, optionally f0-interpolated
    (with a V/UV column recorded), optionally extended with delta and
    acceleration features, and written as one (frames x out_dimension) matrix.

    NOTE(review): Python 2 only (`has_key`, `xrange`) -- predates the py3
    port visible elsewhere in this file.
    """
    logger = logging.getLogger("acoustic_comp")

    # map each output stream to its starting column in the composite matrix
    stream_start_index = {}
    stream_dim_index = 0
    for stream_name in out_dimension_dict.keys():
        if not stream_start_index.has_key(stream_name):
            stream_start_index[stream_name] = stream_dim_index
        stream_dim_index += out_dimension_dict[stream_name]

    io_funcs = BinaryIOCollection()

    for i in xrange(self.file_number):
        out_file_name = out_file_list[i]

        #if os.path.isfile(out_file_name):
        #    logger.info('processing file %4d of %4d : %s exists'
        #                % (i+1, self.file_number, out_file_name))
        #    continue

        logger.info('processing file %4d of %4d : %s' % (i+1,self.file_number,out_file_name))

        out_data_matrix = None
        out_frame_number = 0

        for k in xrange(self.data_stream_number):
            data_stream_name = self.data_stream_list[k]
            in_file_name = in_file_list_dict[data_stream_name][i]
            in_feature_dim = in_dimension_dict[data_stream_name]
            features, frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

            # the first stream fixes the frame count and allocates the output
            if k == 0:
                out_frame_number = frame_number
                out_data_matrix = numpy.zeros((out_frame_number, self.out_dimension))

            # longer streams are truncated to the reference frame count
            if frame_number > out_frame_number:
                features = features[0:out_frame_number, ]
                frame_number = out_frame_number

            try:
                assert out_frame_number == frame_number
            except AssertionError:
                logger.critical('the frame number of data stream %s is not consistent with others: current %d others %d' %(data_stream_name, out_frame_number, frame_number))
                raise

            dim_index = stream_start_index[data_stream_name]

            if data_stream_name in ['lf0', 'F0']:   ## F0 added for GlottHMM
                features, vuv_vector = self.interpolate_f0(features)

                ### if vuv information to be recorded, store it in corresponding column
                if self.record_vuv:
                    out_data_matrix[0:out_frame_number, stream_start_index['vuv']:stream_start_index['vuv']+1] = vuv_vector

            # static features, then (optionally) delta and acceleration
            out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = features
            dim_index = dim_index+in_feature_dim

            if self.compute_dynamic[data_stream_name]:
                delta_features = self.compute_dynamic_matrix(features, self.delta_win, frame_number, in_feature_dim)
                acc_features = self.compute_dynamic_matrix(features, self.acc_win, frame_number, in_feature_dim)

                out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = delta_features
                dim_index = dim_index+in_feature_dim
                out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = acc_features

        ### write data to file
        io_funcs.array_to_binary_file(out_data_matrix, out_file_name)
        logger.debug(' wrote %d frames of features',out_frame_number )
def wavgen_straight_type_vocoder(gen_dir, file_id_list, cfg, logger):
    ''' Waveform generation with STRAIGHT or WORLD vocoders.
    (whose acoustic parameters are: mgc, bap, and lf0)

    gen_dir      : directory holding generated parameter files (also cwd during synthesis)
    file_id_list : utterance base names to synthesise
    cfg          : configuration object (tool paths, feature dims, vocoder options)
    logger       : logger supplied by the caller
    '''
    SPTK = cfg.SPTK
    # NND = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            # NOTE(review): same helper as the 'Bark' branch -- looks like it
            # should be an ERB-specific alpha; confirm
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.'
            )
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    # optionally load global-variance statistics used for spectral enhancement
    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()
        logger.info('loading global variance stats from %s' % (cfg.GV_dir))
        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')
        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:
        logger.info('creating waveform for %4d of %4d: %s' % (counter, max_counter, filename))
        counter = counter + 1
        base = filename
        # per-utterance parameter/output file names (relative to gen_dir,
        # which we chdir into below)
        files = {
            'sp': base + cfg.sp_ext,
            'mgc': base + cfg.mgc_ext,
            'f0': base + '.f0',
            'lf0': base + cfg.lf0_ext,
            'ap': base + '.ap',
            'bap': base + cfg.bap_ext,
            'wav': base + '.wav'
        }

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:
            # synthesis below reads the post-filtered spectrum
            mgc_file_name = files['mgc'] + '_p_mgc'
            post_filter(files['mgc'], mgc_file_name, cfg.mgc_dim, pf_coef, fw_coef, co_coef, fl_coef, gen_dir, cfg)

        # global-variance enhancement of the generated spectrum (STRAIGHT only)
        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)
            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))
            local_gv = (ref_gv_std / gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean
            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(gen_std, frame_number, 1).T * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T) + np.repeat(gen_mu, frame_number, 1).T
            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)
            mgc_file_name = files['mgc'] + '_p_mgc'

        # the two enhancement paths write to the same *_p_mgc file, so they
        # are mutually exclusive
        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Both smoothing techniques together can\'t be applied!!\n')
            raise

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process(
                '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim - 1, fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            # lf0 -> linear f0, with the -1e10 unvoiced magic value mapped to 0
            run_process(
                '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(
                    sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))
            if cfg.use_cep_ap:
                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                    .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim - 1, fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'.format(
                    bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))
            run_process(
                '{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 1.2 -cornf 4000 -float -apfile {ap} {f0a} {sp} {wav}'
                .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift, ap=files['ap'], f0a=files['f0'] + '.a', sp=files['sp'], wav=files['wav']))
            # clean up intermediate synthesis inputs
            run_process('rm -f {sp} {f0} {f0a} {ap}'.format(sp=files['sp'], f0=files['f0'], f0a=files['f0'] + '.a', ap=files['ap']))
        elif cfg.vocoder_type == 'WORLD':
            run_process(
                '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'
                .format(sopr=SPTK['SOPR'], lf0=files['lf0'], x2x=SPTK['X2X'], f0=files['f0']))
            run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
                sopr=SPTK['SOPR'], bap=files['bap'], x2x=SPTK['X2X'], ap=files['ap']))
            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))
            run_process(
                '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim - 1, fl=cfg.fl, mgc=mgc_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], sp=files['sp']))
            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
                synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr, f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))
            # clean up intermediate synthesis inputs
            run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'], sp=files['sp'], f0=files['f0']))
        os.chdir(cur_dir)
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension,
                 silence_feature_index, percent_to_keep=0):
    '''
    Trim silence from binary label/speech files based on binary labels.

    in_list: list of binary label/speech files to trim
    out_list: trimmed files (written here, parallel to in_list)
    in_dimension: frame dimension of the data to trim
    label_list: list of binary label files which contain the trimming criterion
    label_dimension: frame dimension of the label files
    silence_feature_index: index of the label feature which flags silence:
        1 means silence (trim), 0 means leave.
    percent_to_keep: if non-zero (integer percentage), keep every
        (100 // percent_to_keep)-th silent frame, appended at the utterance end.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):
        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(audio_label_difference) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:
            ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:
            ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## if it's all 0s or all 1s, that's ok.
            ## FIX: previously `.all()` was applied to numpy.array([0]) /
            ## numpy.array([1]) themselves instead of to the comparison result,
            ## so this assertion could never check what it claimed to check.
            assert (numpy.unique(silence_flag) == numpy.array([0])).all() or \
                   (numpy.unique(silence_flag) == numpy.array([1])).all(), \
                'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... ' % (
            100 * numpy.sum(silence_flag / float(len(silence_flag))), int(numpy.sum(silence_flag))))

        ## get the indices where silence_flag == 0 is True (i.e. != 0)
        non_silence_indices = numpy.nonzero(silence_flag == 0)
        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of input array
            silence_indices = silence_indices[0]
            ## FIX: integer division -- a float step in a slice raises TypeError on Python 3
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as step value in slice
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1])  ## avoid errors in case there is no silence
            print(' Restore %s%% (every %sth frame: %s frames) of silent frames' % (
                percent_to_keep, every_nth, len(silence_indices_to_keep)))
            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = (numpy.hstack([non_silence_indices[0], silence_indices_to_keep]))
            ## ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
def wavgen_straight_type_vocoder(gen_dir, file_id_list, cfg, logger):
    '''
    Waveform generation with STRAIGHT or WORLD vocoders.
    (whose acoustic parameters are: mgc, bap, and lf0)

    gen_dir: directory containing the generated parameter files; wavs are
        written there too (the process chdirs into it per file and back out).
    file_id_list: list of file basenames (without extension) to synthesise.
    cfg: configuration object providing tool paths (SPTK/STRAIGHT/WORLD),
        extensions, dimensions and vocoder options.
    logger: logger used for progress/critical messages.
    '''
    SPTK = cfg.SPTK
    # NND = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            # NOTE(review): the ERB branch also calls bark_alpha -- presumably
            # an erb_alpha helper was intended; confirm before changing.
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError('cfg.fw_alpha='+cfg.fw_alpha+' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()
        logger.info('loading global variance stats from %s' % (cfg.GV_dir))
        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')
        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:
        logger.info('creating waveform for %4d of %4d: %s' % (counter, max_counter, filename))
        counter = counter + 1
        base = filename
        files = {'sp': base + cfg.sp_ext,
                 'mgc': base + cfg.mgc_ext,
                 'f0': base + '.f0',
                 'lf0': base + cfg.lf0_ext,
                 'ap': base + '.ap',
                 'bap': base + cfg.bap_ext,
                 'wav': base + '.wav'}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']  # only used by the commented-out WORLD v2 path below

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:
            mgc_file_name = files['mgc'] + '_p_mgc'
            post_filter(files['mgc'], mgc_file_name, cfg.mgc_dim, pf_coef, fw_coef, co_coef, fl_coef, gen_dir, cfg)

        ### global-variance scaling of the generated mgc (STRAIGHT only)
        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)
            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))
            local_gv = (ref_gv_std / gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean
            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(gen_std, frame_number, 1).T \
                           * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T) + np.repeat(gen_mu, frame_number, 1).T
            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)
            mgc_file_name = files['mgc'] + '_p_mgc'

        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Both smoothing techniques together can\'t be applied!!\n')
            ## FIX: a bare `raise` outside an except block only produces
            ## "RuntimeError: No active exception to re-raise"; raise explicitly.
            raise RuntimeError('Both smoothing techniques together can\'t be applied!!')

        ### mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))
            if cfg.use_cep_ap:
                ## cepstral representation of aperiodicity
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim-1, fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                ## band aperiodicity
                run_process('{bndap2ap} {bap} > {ap}'
                            .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))
            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 1.2 -cornf 4000 -float -apfile {ap} {f0a} {sp} {wav}'
                        .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift,
                                ap=files['ap'], f0a=files['f0']+'.a', sp=files['sp'], wav=files['wav']))
            ## clean up intermediate files
            run_process('rm -f {sp} {f0} {f0a} {ap}'
                        .format(sp=files['sp'], f0=files['f0'], f0a=files['f0']+'.a', ap=files['ap']))

        elif cfg.vocoder_type == 'WORLD':
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'
                        .format(sopr=SPTK['SOPR'], lf0=files['lf0'], x2x=SPTK['X2X'], f0=files['f0']))
            run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'
                        .format(sopr=SPTK['SOPR'], bap=files['bap'], x2x=SPTK['X2X'], ap=files['ap']))
            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl,
                                mgc=mgc_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], sp=files['sp']))
            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'
                        .format(synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr,
                                f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))
            run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'], sp=files['sp'], f0=files['f0']))

        os.chdir(cur_dir)
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension,
                 silence_feature_index, percent_to_keep=0):
    '''
    Trim silence from binary label/speech files based on binary labels.

    in_list: list of binary label/speech files to trim
    out_list: trimmed files (written here, parallel to in_list)
    in_dimension: frame dimension of the data to trim
    label_list: list of binary label files which contain the trimming criterion
    label_dimension: frame dimension of the label files
    silence_feature_index: index of the label feature which flags silence:
        1 means silence (trim), 0 means leave.
    percent_to_keep: if non-zero (integer percentage), keep every
        (100 // percent_to_keep)-th silent frame, appended at the utterance end.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):
        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(audio_label_difference) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:
            ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:
            ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## if it's all 0s or all 1s, that's ok.
            ## FIX: `.all()` was previously applied to the constant arrays rather
            ## than to the comparison result, making the assertion vacuous.
            assert (numpy.unique(silence_flag) == numpy.array([0])).all() or \
                   (numpy.unique(silence_flag) == numpy.array([1])).all(), \
                'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... ' % (
            100 * numpy.sum(silence_flag / float(len(silence_flag))),
            int(numpy.sum(silence_flag))))

        ## get the indices where silence_flag == 0 is True (i.e. != 0)
        non_silence_indices = numpy.nonzero(silence_flag == 0)
        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of input array
            silence_indices = silence_indices[0]
            ## FIX: integer division -- a float slice step raises TypeError on Python 3
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as step value in slice
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1])  ## avoid errors in case there is no silence
            print(' Restore %s%% (every %sth frame: %s frames) of silent frames' % (
                percent_to_keep, every_nth, len(silence_indices_to_keep)))
            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = (numpy.hstack([non_silence_indices[0], silence_indices_to_keep]))
            ## ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
def dnn_generation(valid_file_list, nnets_file_name, n_ins, n_outs, out_file_list, target_mean_vector, target_std_vector, out_dimension_dict, file_extension_dict, vocoder='straight'):
    '''
    Generate acoustic parameter streams from binary label files with a
    pickled mixture-density DNN, denormalise them with the target mean/std,
    run MLPG per stream, and write one binary file per stream per utterance.

    valid_file_list: binary label files (float32, n_ins per frame) to generate from
    nnets_file_name: path to the pickled DNN model
    n_ins / n_outs: input and (per-mixture) output dimensionality
    out_file_list: output paths; dir and basename are reused per stream
    target_mean_vector / target_std_vector: output denormalisation stats
    out_dimension_dict: per-stream output dimensions (order defines layout)
    file_extension_dict: per-stream file extensions
    vocoder: 'straight' or 'glotthmm' -- selects which streams are written
    '''
    logger = logging.getLogger("dnn_generation")
    logger.debug('Starting dnn_generation')

    inf_float = -1.0e+10  # value written into F0 for unvoiced frames

    plotlogger = logging.getLogger("plotting")

    ## (a stray no-op statement `cfg.gen_wav_features` was removed here)
    if vocoder == 'straight':
        gen_wav_features = ['mgc', 'lf0', 'bap']
    elif vocoder == 'glotthmm':
        gen_wav_features = ['F0', 'Gain', 'HNR', 'LSF', 'LSFsource']  ## TODO: take this from config
    else:
        sys.exit('unsupported vocoder %s !' % (vocoder))

    ## layout of the concatenated output vector: stream -> start column
    stream_start_index = {}
    dimension_index = 0
    for feature_name in list(out_dimension_dict.keys()):
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    ## FIX: close the model file after unpickling (was pickle.load(open(...)))
    with open(nnets_file_name, 'rb') as fid_model:
        dnn_model = pickle.load(fid_model)

    file_number = len(valid_file_list)
    io_funcs = BinaryIOCollection()
    mlpg = MLParameterGenerationFast()  # constructed once, reused for every file

    for i in range(file_number):
        logger.info('generating %4d of %4d: %s' % (i+1, file_number, valid_file_list[i]))

        ## FIX: use a context manager instead of manual open/close
        with open(valid_file_list[i], 'rb') as fid_lab:
            features = numpy.fromfile(fid_lab, dtype=numpy.float32)
        ## FIX: integer division -- a float slice bound is a TypeError on Python 3
        features = features[:(n_ins * (features.size // n_ins))]
        features = features.reshape((-1, n_ins))
        frame_number = features.shape[0]

        test_set_x = theano.shared(numpy.asarray(features, dtype=theano.config.floatX))
        mean_matrix = numpy.tile(target_mean_vector, (features.shape[0], 1))
        std_matrix = numpy.tile(target_std_vector, (features.shape[0], 1))

        ## pick, per frame, the mixture component with the highest weight
        predicted_mix = dnn_model.parameter_prediction_mix(test_set_x=test_set_x)
        max_index = numpy.argmax(predicted_mix, axis=1)
        temp_predicted_mu = dnn_model.parameter_prediction(test_set_x=test_set_x)
        temp_predicted_sigma = dnn_model.parameter_prediction_sigma(test_set_x=test_set_x)

        predicted_mu = numpy.zeros((temp_predicted_mu.shape[0], n_outs))
        predicted_sigma = numpy.zeros((temp_predicted_sigma.shape[0], n_outs))
        for kk in range(temp_predicted_mu.shape[0]):
            predicted_mu[kk, :] = temp_predicted_mu[kk, max_index[kk]*n_outs:(max_index[kk]+1)*n_outs]
            predicted_sigma[kk, :] = temp_predicted_sigma[kk, max_index[kk]*n_outs:(max_index[kk]+1)*n_outs]

        ## denormalise mean and variance back to the target feature space
        predicted_mu = predicted_mu * std_matrix + mean_matrix
        predicted_sigma = ((predicted_sigma ** 0.5) * std_matrix) ** 2

        dir_name = os.path.dirname(out_file_list[i])
        file_id = os.path.splitext(os.path.basename(out_file_list[i]))[0]

        for feature_name in gen_wav_features:
            current_features = predicted_mu[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]
            current_sigma = predicted_sigma[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]
            ## FIX: integer division for the static dimension (was /3, a float on Python 3)
            gen_features = mlpg.generation(current_features, current_sigma, out_dimension_dict[feature_name]//3)

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = predicted_mu[:, stream_start_index['vuv']:stream_start_index['vuv']+1]
                    ## FIX: loop variable renamed from `i`, which shadowed the file index
                    for t in range(frame_number):
                        if vuv_feature[t, 0] < 0.5:
                            gen_features[t, 0] = inf_float

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])
            io_funcs.array_to_binary_file(gen_features, new_file_name)
def generate_wav(gen_dir, file_id_list, cfg):
    '''
    Generate waveforms for a list of utterances with the STRAIGHT or WORLD
    vocoder, with optional SPTK-based post-filtering and optional global
    variance (GV) scaling of the generated mgc.

    gen_dir: directory containing generated parameter files; wavs written there.
    file_id_list: list of file basenames (without extension) to synthesise.
    cfg: configuration object providing tool paths, extensions and options.

    Raises RuntimeError for an unsupported vocoder type or when post-filtering
    and GV are requested together.
    '''
    logger = logging.getLogger("wav_generation")

    SPTK = cfg.SPTK
    # NND = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    ## FIX: `basestring` does not exist on Python 3; use `str`
    ## (consistent with wavgen_straight_type_vocoder elsewhere in this file).
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            # NOTE(review): the ERB branch also calls bark_alpha -- presumably
            # an erb_alpha helper was intended; confirm before changing.
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError('cfg.fw_alpha='+cfg.fw_alpha+' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()
        logger.info('loading global variance stats from %s' % (cfg.GV_dir))
        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')
        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:
        logger.info('creating waveform for %4d of %4d: %s' % (counter, max_counter, filename))
        counter = counter + 1
        base = filename
        files = {'sp': base + cfg.sp_ext,
                 'mgc': base + cfg.mgc_ext,
                 'f0': base + '.f0',
                 'lf0': base + cfg.lf0_ext,
                 'ap': base + '.ap',
                 'bap': base + cfg.bap_ext,
                 'wav': base + '.wav'}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']  # only used by the commented-out WORLD v2 path below

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering (inline SPTK pipeline)
        if cfg.do_post_filtering:
            ## build the per-coefficient post-filter weight vector: 1 1 pf pf ...
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "
            run_process('{line} | {x2x} +af > {weight}'
                        .format(line=line, x2x=SPTK['X2X'], weight=os.path.join(gen_dir, 'weight')))
            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | {c2acr} -m {co} -M 0 -l {fl} > {base_r0}'
                        .format(freqt=SPTK['FREQT'], order=cfg.mgc_dim-1, fw=fw_coef, co=co_coef,
                                mgc=files['mgc'], c2acr=SPTK['C2ACR'], fl=fl_coef, base_r0=files['mgc']+'_r0'))
            run_process('{vopr} -m -n {order} < {mgc} {weight} | {freqt} -m {order} -a {fw} -M {co} -A 0 | {c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'],
                                weight=os.path.join(gen_dir, 'weight'), freqt=SPTK['FREQT'], fw=fw_coef,
                                co=co_coef, c2acr=SPTK['C2ACR'], fl=fl_coef, base_p_r0=files['mgc']+'_p_r0'))
            run_process('{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 0 -e 0 > {base_b0}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'],
                                weight=os.path.join(gen_dir, 'weight'), mc2b=SPTK['MC2B'], fw=fw_coef,
                                bcp=SPTK['BCP'], base_b0=files['mgc']+'_b0'))
            run_process('{vopr} -d < {base_r0} {base_p_r0} | {sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'
                        .format(vopr=SPTK['VOPR'], base_r0=files['mgc']+'_r0', base_p_r0=files['mgc']+'_p_r0',
                                sopr=SPTK['SOPR'], base_b0=files['mgc']+'_b0', base_p_b0=files['mgc']+'_p_b0'))
            run_process('{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 1 -e {order} | {merge} -n {order2} -s 0 -N 0 {base_p_b0} | {b2mc} -m {order} -a {fw} > {base_p_mgc}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'],
                                weight=os.path.join(gen_dir, 'weight'), mc2b=SPTK['MC2B'], fw=fw_coef,
                                bcp=SPTK['BCP'], merge=SPTK['MERGE'], order2=cfg.mgc_dim-2,
                                base_p_b0=files['mgc']+'_p_b0', b2mc=SPTK['B2MC'], base_p_mgc=files['mgc']+'_p_mgc'))
            mgc_file_name = files['mgc']+'_p_mgc'

        ### global-variance scaling of the generated mgc (STRAIGHT only)
        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)
            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))
            local_gv = (ref_gv_std / gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean
            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(gen_std, frame_number, 1).T \
                           * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T) + np.repeat(gen_mu, frame_number, 1).T
            new_mgc_file_name = files['mgc']+'_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)
            mgc_file_name = files['mgc']+'_p_mgc'

        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Both smoothing techniques together can\'t be applied!!\n')
            ## FIX: a bare `raise` outside an except block only produces
            ## "RuntimeError: No active exception to re-raise"; raise explicitly.
            raise RuntimeError('Both smoothing techniques together can\'t be applied!!')

        ### mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))
            if cfg.use_cep_ap:
                ## cepstral representation of aperiodicity
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim-1, fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                ## band aperiodicity
                run_process('{bndap2ap} {bap} > {ap}'
                            .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))
            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 0.0 -cornf 400 -float -apfile {ap} {f0a} {sp} {wav}'
                        .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift,
                                ap=files['ap'], f0a=files['f0']+'.a', sp=files['sp'], wav=files['wav']))
            ## clean up intermediate files
            run_process('rm -f {sp} {f0} {f0a} {ap}'
                        .format(sp=files['sp'], f0=files['f0'], f0a=files['f0']+'.a', ap=files['ap']))

        elif cfg.vocoder_type == 'WORLD':
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'
                        .format(sopr=SPTK['SOPR'], lf0=files['lf0'], x2x=SPTK['X2X'], f0=files['f0']))
            run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'
                        .format(sopr=SPTK['SOPR'], bap=files['bap'], x2x=SPTK['X2X'], ap=files['ap']))
            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl,
                                mgc=mgc_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], sp=files['sp']))
            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'
                        .format(synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr,
                                f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))
            run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'], sp=files['sp'], f0=files['f0']))

        else:
            logger.critical('The vocoder %s is not supported yet!\n' % cfg.vocoder_type)
            ## FIX: bare `raise` replaced by an explicit RuntimeError (same type)
            raise RuntimeError('The vocoder %s is not supported yet!' % cfg.vocoder_type)

        os.chdir(cur_dir)
def make_labels(self,input_file_descriptors,out_file_name=None,\
                fill_missing_values=False,iterate_over_frames=False):
    '''
    Build numerical label features for one utterance from its open label file(s).

    input_file_descriptors: dict of open label files for the same utterance,
        e.g. {'xpath': <open XML file for reading>}; currently only the
        XPATH style is implemented (HTS entries are warned about and ignored).
    out_file_name: if given, the features are written there as a binary file
        and nothing is returned; otherwise the feature array is returned.
    fill_missing_values / iterate_over_frames: passed through to
        XMLLabelNormalisation.
    '''
    # an array in which to assemble all the features
    all_labels = None

    try:
        assert self.configuration
    except AssertionError:
        self.logger.critical('no label configuration loaded, so cannot make labels')
        raise

    # now iterate through the features, and create the features from the appropriate open label file
    xpath_list = []  ## gather all here and extract all features in one pass
    mapper_list = []
    for (item_number, feature_specification) in enumerate(self.configuration.labels):
        ## we'll append frame features to the data for the *LAST*
        ## feature_specification in our list
        add_frame_features = False
        if item_number+1 == len(self.configuration.labels):
            add_frame_features = True

        # which label file should we use?
        ## FIX: dict.has_key() does not exist on Python 3 -- use `in`
        if 'xpath' in feature_specification:
            # xpath and hts are mutually exclusive label styles
            assert 'hts' not in feature_specification
            # actually make the features from this open file and the current XPATH
            try:
                assert self.configuration.target_nodes
            except:
                self.logger.critical('When using XPATH features, "target_nodes" must be defined in the label config file')
                raise
            try:
                xpath_list.append(feature_specification['xpath'])
                if 'mapper' in feature_specification:
                    mapper_list.append(feature_specification['mapper'])
                else:
                    mapper_list.append(None)
            except:
                self.logger.critical('error creating XMLLabelNormalisation object for feature %s' % feature_specification)
                raise

        if 'hts' in feature_specification:
            assert 'xpath' not in feature_specification
            # not yet implemented !
            self.logger.warning('HTS features not implemented - ignoring them!')

    ## Now extract all feats in one go -- go straight to all_labels:
    label_normaliser = XMLLabelNormalisation(xpath=xpath_list,mapper=mapper_list,fill_missing_values=fill_missing_values,target_nodes=self.configuration.target_nodes,use_compiled_xpath=self.use_precompiled_xpaths,iterate_over_frames=iterate_over_frames)

    try:
        all_labels = label_normaliser.extract_linguistic_features(input_file_descriptors['xpath'], add_frame_features=add_frame_features)
    except KeyError:
        self.logger.critical('no open xpath label file available to create feature %s' % feature_specification)
        raise

    ## FIX: identity check -- `!= None` on a numpy array is an elementwise
    ## comparison whose truth value is ambiguous.
    if all_labels is not None:
        self.logger.debug(' composed features now have dimension %d' % all_labels.shape[1])
        self.logger.debug('first line of labels: ' + str(all_labels[0,:]))

    # finally, save the labels
    if out_file_name:
        io_funcs = BinaryIOCollection()
        io_funcs.array_to_binary_file(all_labels, out_file_name)
        ## osw: useful for debugging:
        ##numpy.savetxt(out_file_name + '.TXT', all_labels, delimiter='\t')
        self.logger.info('saved numerical features of shape %s to %s' % (all_labels.shape,out_file_name))
    else:
        ## FIX: was `return all_features` -- that name is never defined
        ## (NameError); the assembled array is `all_labels`.
        return all_labels
def acoustic_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict, var_file_dict, do_MLPG=True, cfg=None):
    '''
    Split concatenated acoustic feature files into individual streams,
    optionally run MLPG on each stream, and write one binary file per stream.

    in_file_list: concatenated feature files (dimension columns per frame)
    dimension: total frame width of the input files
    out_dimension_dict: per-stream dimensions (order defines the column layout)
    file_extension_dict: per-stream output file extensions
    var_file_dict: variance files, loaded via self.load_covariance
    do_MLPG: if False, streams are written without parameter generation
    cfg: needed only when self.enforce_silence is set (silence pattern and
        label alignment directory are read from it)
    '''
    logger = logging.getLogger('param_generation')
    logger.debug('acoustic_decomposition for %d files' % len(in_file_list))

    self.load_covariance(var_file_dict, out_dimension_dict)

    ## layout of the concatenated vector: stream -> start column
    stream_start_index = {}
    dimension_index = 0
    for feature_name in list(out_dimension_dict.keys()):
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    io_funcs = BinaryIOCollection()
    mlpg_algo = MLParameterGeneration()

    findex = 0
    flen = len(in_file_list)
    for file_name in in_file_list:
        findex = findex + 1
        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]

        features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)
        logger.info('processing %4d of %4d: %s' % (findex, flen, file_name))

        for feature_name in self.gen_wav_features:
            logger.debug(' feature: %s' % feature_name)
            current_features = features[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]

            if FAST_MLPG:
                ### fast version wants variance per frame, not single global one:
                var = self.var[feature_name]
                var = numpy.transpose(numpy.tile(var, frame_number))
            else:
                var = self.var[feature_name]

            if do_MLPG == False:
                gen_features = current_features
            else:
                gen_features = mlpg_algo.generation(current_features, var, out_dimension_dict[feature_name]//3)

            logger.debug(' feature dimensions: %d by %d' % (gen_features.shape[0], gen_features.shape[1]))

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = features[:, stream_start_index['vuv']:stream_start_index['vuv']+1]
                    for i in range(frame_number):
                        ## mark unvoiced frames (and implausibly low F0) with the magic value
                        if vuv_feature[i, 0] < 0.5 or gen_features[i, 0] < numpy.log(20):
                            gen_features[i, 0] = self.inf_float

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

            if self.enforce_silence:
                silence_pattern = cfg.silence_pattern
                label_align_dir = cfg.in_label_align_dir
                ## FIX: the label file was opened without ever being closed --
                ## use a context manager.
                with open(label_align_dir+'/'+file_id+'.lab', 'r') as in_f:
                    for line in in_f.readlines():
                        line = line.strip()
                        if len(line) < 1:
                            continue
                        ## FIX: raw string for the regex (invalid escape warning)
                        temp_list = re.split(r'\s+', line)
                        ## HTK-style times in 100ns units -> 5ms frame indices
                        start_time = int(int(temp_list[0])*(10**-4)/5)
                        end_time = int(int(temp_list[1])*(10**-4)/5)
                        full_label = temp_list[2]

                        label_binary_flag = self.check_silence_pattern(full_label, silence_pattern)
                        if label_binary_flag:
                            if feature_name in ['lf0', 'F0', 'mag']:
                                gen_features[start_time:end_time, :] = self.inf_float
                            else:
                                gen_features[start_time:end_time, :] = 0.0

            io_funcs.array_to_binary_file(gen_features, new_file_name)
            logger.debug(' wrote to file %s' % new_file_name)
def generate_wav(data, gen_dir, base, sptk_dir, world_dir, norm_info_file, do_post_filtering=True, mgc_dim=60, fl=1024, sr=16000):
    """Denormalise an acoustic parameter matrix and vocode it to a wav file.

    The normalised matrix ``data`` is de-normalised with the mean/std stored
    in ``norm_info_file``, split into its mgc/lf0/bap streams, optionally
    post-filtered with SPTK tools, and finally synthesised with the WORLD
    vocoder.  Adapted from Merlin.

    Args:
        data: normalised acoustic features, one row per frame.
        gen_dir: directory where intermediate and output files are written.
        base: utterance id used as the file-name stem.
        sptk_dir: directory containing the SPTK binaries (trailing slash).
        world_dir: directory containing the WORLD binaries (trailing slash).
        norm_info_file: binary float32 file, row 0 = means, row 1 = stds.
        do_post_filtering: apply SPTK spectral post-filtering when True.
        mgc_dim: dimensionality of the mel-generalised cepstrum stream.
        fl: FFT length used by WORLD.
        sr: sampling rate of the output waveform.
    """
    io_funcs = BinaryIOCollection()
    file_name = os.path.join(gen_dir, base + ".cmp")

    # fixed: close the stats file deterministically (was left open).
    with open(norm_info_file, 'rb') as fid:
        cmp_info = numpy.fromfile(fid, dtype=numpy.float32)
    cmp_info = cmp_info.reshape((2, -1))
    cmp_mean = cmp_info[0, ]
    cmp_std = cmp_info[1, ]

    data = data * cmp_std + cmp_mean
    io_funcs.array_to_binary_file(data, file_name)

    # This code was adapted from Merlin. I should add the license.
    out_dimension_dict = {'bap': 1, 'lf0': 1, 'mgc': 60, 'vuv': 1}
    stream_start_index = {}
    file_extension_dict = {
        'mgc': '.mgc', 'bap': '.bap', 'lf0': '.lf0',
        'dur': '.dur', 'cmp': '.cmp'}
    gen_wav_features = ['mgc', 'lf0', 'bap']

    dimension_index = 0
    for feature_name in out_dimension_dict.keys():
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    dir_name = os.path.dirname(file_name)
    file_id = os.path.splitext(os.path.basename(file_name))[0]

    # Total composed width derived from the stream dict (was hard-coded 63).
    features, frame_number = io_funcs.load_binary_file_frame(
        file_name, sum(out_dimension_dict.values()))

    for feature_name in gen_wav_features:
        current_features = features[
            :, stream_start_index[feature_name]:
            stream_start_index[feature_name] + out_dimension_dict[feature_name]]

        gen_features = current_features

        if feature_name in ['lf0', 'F0']:
            if 'vuv' in stream_start_index:
                vuv_feature = features[
                    :, stream_start_index['vuv']:stream_start_index['vuv'] + 1]
                # fixed: range, not Python-2 xrange (NameError on Python 3).
                for i in range(frame_number):
                    if vuv_feature[i, 0] < 0.5:
                        # "Unvoiced" magic value, matched by `sopr -magic` below.
                        gen_features[i, 0] = -1.0e+10

        new_file_name = os.path.join(
            dir_name, file_id + file_extension_dict[feature_name])
        io_funcs.array_to_binary_file(gen_features, new_file_name)

    pf_coef = 1.4
    fw_alpha = 0.58
    co_coef = 511

    sptk_path = {
        'SOPR': sptk_dir + 'sopr',
        'FREQT': sptk_dir + 'freqt',
        'VSTAT': sptk_dir + 'vstat',
        'MGC2SP': sptk_dir + 'mgc2sp',
        'MERGE': sptk_dir + 'merge',
        'BCP': sptk_dir + 'bcp',
        'MC2B': sptk_dir + 'mc2b',
        'C2ACR': sptk_dir + 'c2acr',
        'MLPG': sptk_dir + 'mlpg',
        'VOPR': sptk_dir + 'vopr',
        'B2MC': sptk_dir + 'b2mc',
        'X2X': sptk_dir + 'x2x',
        'VSUM': sptk_dir + 'vsum'}
    world_path = {
        'ANALYSIS': world_dir + 'analysis',
        'SYNTHESIS': world_dir + 'synth'}

    fw_coef = fw_alpha
    fl_coef = fl

    files = {'sp': base + '.sp',
             'mgc': base + '.mgc',
             'f0': base + '.f0',
             'lf0': base + '.lf0',
             'ap': base + '.ap',
             'bap': base + '.bap',
             'wav': base + '.wav'}

    mgc_file_name = files['mgc']

    cur_dir = os.getcwd()
    os.chdir(gen_dir)

    # post-filtering
    if do_post_filtering:
        line = "echo 1 1 "
        for i in range(2, mgc_dim):
            line = line + str(pf_coef) + " "

        run_process('{line} | {x2x} +af > {weight}'.format(
            line=line, x2x=sptk_path['X2X'],
            weight=os.path.join(gen_dir, 'weight')))

        run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_r0}'.format(
                        freqt=sptk_path['FREQT'], order=mgc_dim - 1,
                        fw=fw_coef, co=co_coef, mgc=files['mgc'],
                        c2acr=sptk_path['C2ACR'], fl=fl_coef,
                        base_r0=files['mgc'] + '_r0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{freqt} -m {order} -a {fw} -M {co} -A 0 | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'.format(
                        vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        freqt=sptk_path['FREQT'], fw=fw_coef, co=co_coef,
                        c2acr=sptk_path['C2ACR'], fl=fl_coef,
                        base_p_r0=files['mgc'] + '_p_r0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 0 -e 0 > {base_b0}'.format(
                        vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'], fw=fw_coef,
                        bcp=sptk_path['BCP'],
                        base_b0=files['mgc'] + '_b0'))

        run_process('{vopr} -d < {base_r0} {base_p_r0} | '
                    '{sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'.format(
                        vopr=sptk_path['VOPR'],
                        base_r0=files['mgc'] + '_r0',
                        base_p_r0=files['mgc'] + '_p_r0',
                        sopr=sptk_path['SOPR'],
                        base_b0=files['mgc'] + '_b0',
                        base_p_b0=files['mgc'] + '_p_b0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 1 -e {order} | '
                    '{merge} -n {order2} -s 0 -N 0 {base_p_b0} | '
                    '{b2mc} -m {order} -a {fw} > {base_p_mgc}'.format(
                        vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'], fw=fw_coef,
                        bcp=sptk_path['BCP'], merge=sptk_path['MERGE'],
                        order2=mgc_dim - 2,
                        base_p_b0=files['mgc'] + '_p_b0',
                        b2mc=sptk_path['B2MC'],
                        base_p_mgc=files['mgc'] + '_p_mgc'))

        mgc_file_name = files['mgc'] + '_p_mgc'

    # Vocoder WORLD: lf0 -> f0, bap -> ap, mgc -> sp, then synthesise.
    run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | '
                '{x2x} +fd > {f0}'.format(
                    sopr=sptk_path['SOPR'], lf0=files['lf0'],
                    x2x=sptk_path['X2X'], f0=files['f0']))

    run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
        sopr=sptk_path['SOPR'], bap=files['bap'],
        x2x=sptk_path['X2X'], ap=files['ap']))

    run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | '
                '{sopr} -d 32768.0 -P | {x2x} +fd > {sp}'.format(
                    mgc2sp=sptk_path['MGC2SP'], alpha=fw_alpha,
                    order=mgc_dim - 1, fl=fl, mgc=mgc_file_name,
                    sopr=sptk_path['SOPR'], x2x=sptk_path['X2X'],
                    sp=files['sp']))

    run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
        synworld=world_path['SYNTHESIS'], fl=fl, sr=sr,
        f0=files['f0'], sp=files['sp'], ap=files['ap'],
        wav=files['wav']))

    run_process('rm -f {ap} {sp} {f0} {bap} {lf0} {mgc} {mgc}_b0 {mgc}_p_b0 '
                '{mgc}_p_mgc {mgc}_p_r0 {mgc}_r0 {cmp} weight'.format(
                    ap=files['ap'], sp=files['sp'], f0=files['f0'],
                    bap=files['bap'], lf0=files['lf0'], mgc=files['mgc'],
                    cmp=base + '.cmp'))

    os.chdir(cur_dir)
max_d=max_d, # plot a horizontal cut-off line ) plt.show() clusters_contours = [] for xi in range(k): ind_cnt = np.where(clusters == xi+1) clusters_contours = np.concatenate((clusters_contours, np.mean(Y1[ind_cnt], axis=0)), axis=0) final_clusters = clusters_contours.reshape(k,coef_size-1) if k%2==0: plot_templates(final_clusters) io_funcs.array_to_binary_file(final_clusters, clusters_file) # ## comment below line to run full list of files # break; ### breaks after processing one file - to check errors train_clusters = [] dev_clusters = [] test_clusters = [] train_utt = 3850; valid_utt = 116; test_utt=271; if templatefeats: stat_fname = feat_dir_path + '.txt' stats_template_file = os.path.join(work_dir, 'Data/inter-module/'+speaker+'/misc/', stat_fname) filelist = os.path.join(work_dir, 'Data/fileList/'+speaker+'.scp') list_arr = io_funcs.load_file_list(filelist) prosody_feats = []; flens = [];syl_dur_lens=[]
def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict, out_dimension_dict):
    """Compose per-stream acoustic features into one matrix per utterance.

    For each utterance, loads every data stream (mgc/lf0/bap/vuv/spk or
    dur/spk), interpolates f0 streams over unvoiced regions (recording the
    voiced/unvoiced flag when self.record_vuv is set), appends delta and
    acceleration windows for streams flagged in self.compute_dynamic, and
    writes the composed matrix to the corresponding output file.

    Args:
        in_file_list_dict: stream name -> list of per-utterance input files.
        out_file_list: composed output file per utterance.
        in_dimension_dict: stream name -> input (static) dimension.
        out_dimension_dict: stream name -> output dimension (incl. dynamics).
    """
    logger = logging.getLogger("acoustic_comp")

    stream_start_index = {}
    stream_dim_index = 0

    # Lay streams out in a canonical order so column positions are stable
    # across runs regardless of the incoming dict's key order.
    if 'dur' not in out_dimension_dict:
        sort_order = {'bap': 1, 'lf0': 2, 'vuv': 3, 'mgc': 0, 'spk': 4}
    else:
        sort_order = {'dur': 0, 'spk': 1}
    # fixed: replaced a single-element `source` loop plus an unused
    # `odrded_keys` computation with one direct sort.
    out_dimension_dict = OrderedDict(
        sorted(out_dimension_dict.items(), key=lambda item: sort_order[item[0]]))

    for stream_name in list(out_dimension_dict.keys()):
        if stream_name not in stream_start_index:
            stream_start_index[stream_name] = stream_dim_index
            stream_dim_index += out_dimension_dict[stream_name]

    io_funcs = BinaryIOCollection()

    for i in range(self.file_number):
        out_file_name = out_file_list[i]

        logger.info('processing file %4d of %4d : %s' % (i + 1, self.file_number, out_file_name))

        out_data_matrix = None
        out_frame_number = 0

        for k in range(self.data_stream_number):
            data_stream_name = self.data_stream_list[k]
            in_file_name = in_file_list_dict[data_stream_name][i]
            in_feature_dim = in_dimension_dict[data_stream_name]
            features, frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

            # fixed: leftover debug print() replaced with the module logger.
            logger.debug('prepare data from the acoustic composition %d in_file_name %s in_feature_dim %d frame_number %d',
                         k, in_file_name, in_feature_dim, frame_number)

            if k == 0:
                # First stream defines the frame count of the whole utterance.
                out_frame_number = frame_number
                out_data_matrix = numpy.zeros((out_frame_number, self.out_dimension))

            if frame_number > out_frame_number:
                # Tolerate slightly longer streams by truncating.
                features = features[0:out_frame_number, ]
                frame_number = out_frame_number

            try:
                assert out_frame_number == frame_number
            except AssertionError:
                logger.critical('the frame number of data stream %s is not consistent with others: current %d others %d' % (data_stream_name, out_frame_number, frame_number))
                raise

            dim_index = stream_start_index[data_stream_name]

            if data_stream_name in ['lf0', 'F0']:  ## F0 added for GlottHMM
                features, vuv_vector = self.interpolate_f0(features)

                ### if vuv information to be recorded, store it in corresponding column
                if self.record_vuv:
                    out_data_matrix[0:out_frame_number, stream_start_index['vuv']:stream_start_index['vuv'] + 1] = vuv_vector

            out_data_matrix[0:out_frame_number, dim_index:dim_index + in_feature_dim] = features
            dim_index = dim_index + in_feature_dim

            if self.compute_dynamic[data_stream_name]:
                delta_features = self.compute_dynamic_matrix(features, self.delta_win, frame_number, in_feature_dim)
                acc_features = self.compute_dynamic_matrix(features, self.acc_win, frame_number, in_feature_dim)

                out_data_matrix[0:out_frame_number, dim_index:dim_index + in_feature_dim] = delta_features
                dim_index = dim_index + in_feature_dim
                out_data_matrix[0:out_frame_number, dim_index:dim_index + in_feature_dim] = acc_features

        ### write data to file
        io_funcs.array_to_binary_file(out_data_matrix, out_file_name)
        logger.debug(' wrote %d frames of features', out_frame_number)
def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict, out_dimension_dict):
    """Compose per-stream acoustic features into one matrix per utterance.

    For each utterance, loads every data stream, interpolates f0 streams
    over unvoiced regions (recording the voiced/unvoiced flag when
    self.record_vuv is set), appends delta and acceleration windows for
    streams flagged in self.compute_dynamic, and writes the composed
    matrix to the corresponding output file.

    Args:
        in_file_list_dict: stream name -> list of per-utterance input files.
        out_file_list: composed output file per utterance.
        in_dimension_dict: stream name -> input (static) dimension.
        out_dimension_dict: stream name -> output dimension (incl. dynamics).
    """
    logger = logging.getLogger("acoustic_comp")

    # Map each stream name to its starting column in the composed matrix.
    stream_start_index = {}
    stream_dim_index = 0
    for stream_name in out_dimension_dict.keys():
        # fixed: dict.has_key() was removed in Python 3; use `in`.
        if stream_name not in stream_start_index:
            stream_start_index[stream_name] = stream_dim_index
            stream_dim_index += out_dimension_dict[stream_name]

    io_funcs = BinaryIOCollection()

    # fixed: range, not Python-2 xrange (NameError on Python 3).
    for i in range(self.file_number):
        out_file_name = out_file_list[i]

        logger.info('processing file %4d of %4d : %s' % (i + 1, self.file_number, out_file_name))

        out_data_matrix = None
        out_frame_number = 0

        for k in range(self.data_stream_number):
            data_stream_name = self.data_stream_list[k]
            in_file_name = in_file_list_dict[data_stream_name][i]
            in_feature_dim = in_dimension_dict[data_stream_name]
            features, frame_number = io_funcs.load_binary_file_frame(
                in_file_name, in_feature_dim)

            if k == 0:
                # First stream defines the frame count of the whole utterance.
                out_frame_number = frame_number
                out_data_matrix = numpy.zeros(
                    (out_frame_number, self.out_dimension))

            if frame_number > out_frame_number:
                # Tolerate slightly longer streams by truncating.
                features = features[0:out_frame_number, ]
                frame_number = out_frame_number

            try:
                assert out_frame_number == frame_number
            except AssertionError:
                logger.critical(
                    'the frame number of data stream %s is not consistent with others: current %d others %d'
                    % (data_stream_name, out_frame_number, frame_number))
                raise

            dim_index = stream_start_index[data_stream_name]

            if data_stream_name in ['lf0', 'F0']:  ## F0 added for GlottHMM
                features, vuv_vector = self.interpolate_f0(features)

                ### if vuv information to be recorded, store it in corresponding column
                if self.record_vuv:
                    out_data_matrix[0:out_frame_number,
                                    stream_start_index['vuv']:
                                    stream_start_index['vuv'] + 1] = vuv_vector

            out_data_matrix[0:out_frame_number,
                            dim_index:dim_index + in_feature_dim] = features
            dim_index = dim_index + in_feature_dim

            if self.compute_dynamic[data_stream_name]:
                # fixed: Python-2 print statement (SyntaxError on Python 3)
                # replaced with a logger call.
                logger.debug('%s %s', features.shape, out_file_name)
                delta_features = self.compute_dynamic_matrix(
                    features, self.delta_win, frame_number, in_feature_dim)
                acc_features = self.compute_dynamic_matrix(
                    features, self.acc_win, frame_number, in_feature_dim)

                out_data_matrix[0:out_frame_number,
                                dim_index:dim_index + in_feature_dim] = delta_features
                dim_index = dim_index + in_feature_dim
                out_data_matrix[0:out_frame_number,
                                dim_index:dim_index + in_feature_dim] = acc_features

        ### write data to file
        io_funcs.array_to_binary_file(out_data_matrix, out_file_name)
        logger.debug(' wrote %d frames of features', out_frame_number)
def generate_wav(
        data, gen_dir, base, sptk_dir, world_dir, norm_info_file,
        do_post_filtering=True, mgc_dim=60, fl=1024, sr=16000):
    """Denormalise an acoustic parameter matrix and vocode it to a wav file.

    The normalised matrix ``data`` is de-normalised with the mean/std stored
    in ``norm_info_file``, split into its mgc/lf0/bap streams, optionally
    post-filtered with SPTK tools, and finally synthesised with the WORLD
    vocoder.  Adapted from Merlin.

    Args:
        data: normalised acoustic features, one row per frame.
        gen_dir: directory where intermediate and output files are written.
        base: utterance id used as the file-name stem.
        sptk_dir: directory containing the SPTK binaries (trailing slash).
        world_dir: directory containing the WORLD binaries (trailing slash).
        norm_info_file: binary float32 file, row 0 = means, row 1 = stds.
        do_post_filtering: apply SPTK spectral post-filtering when True.
        mgc_dim: dimensionality of the mel-generalised cepstrum stream.
        fl: FFT length used by WORLD.
        sr: sampling rate of the output waveform.
    """
    io_funcs = BinaryIOCollection()
    file_name = os.path.join(gen_dir, base + ".cmp")

    # fixed: close the stats file deterministically (was left open).
    with open(norm_info_file, 'rb') as fid:
        cmp_info = numpy.fromfile(fid, dtype=numpy.float32)
    cmp_info = cmp_info.reshape((2, -1))
    cmp_mean = cmp_info[0, ]
    cmp_std = cmp_info[1, ]

    data = data * cmp_std + cmp_mean
    io_funcs.array_to_binary_file(data, file_name)

    # This code was adapted from Merlin. I should add the license.
    out_dimension_dict = {'bap': 1, 'lf0': 1, 'mgc': 60, 'vuv': 1}
    stream_start_index = {}
    file_extension_dict = {
        'mgc': '.mgc', 'bap': '.bap', 'lf0': '.lf0',
        'dur': '.dur', 'cmp': '.cmp'}
    gen_wav_features = ['mgc', 'lf0', 'bap']

    dimension_index = 0
    for feature_name in out_dimension_dict.keys():
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    dir_name = os.path.dirname(file_name)
    file_id = os.path.splitext(os.path.basename(file_name))[0]

    # Total composed width derived from the stream dict (was hard-coded 63).
    features, frame_number = io_funcs.load_binary_file_frame(
        file_name, sum(out_dimension_dict.values()))

    for feature_name in gen_wav_features:
        current_features = features[
            :, stream_start_index[feature_name]:
            stream_start_index[feature_name] + out_dimension_dict[feature_name]]

        gen_features = current_features

        if feature_name in ['lf0', 'F0']:
            if 'vuv' in stream_start_index:
                vuv_feature = features[
                    :, stream_start_index['vuv']:stream_start_index['vuv'] + 1]
                # fixed: range, not Python-2 xrange (NameError on Python 3).
                for i in range(frame_number):
                    if vuv_feature[i, 0] < 0.5:
                        # "Unvoiced" magic value, matched by `sopr -magic` below.
                        gen_features[i, 0] = -1.0e+10

        new_file_name = os.path.join(
            dir_name, file_id + file_extension_dict[feature_name])
        io_funcs.array_to_binary_file(gen_features, new_file_name)

    pf_coef = 1.4
    fw_alpha = 0.58
    co_coef = 511

    sptk_path = {
        'SOPR': sptk_dir + 'sopr',
        'FREQT': sptk_dir + 'freqt',
        'VSTAT': sptk_dir + 'vstat',
        'MGC2SP': sptk_dir + 'mgc2sp',
        'MERGE': sptk_dir + 'merge',
        'BCP': sptk_dir + 'bcp',
        'MC2B': sptk_dir + 'mc2b',
        'C2ACR': sptk_dir + 'c2acr',
        'MLPG': sptk_dir + 'mlpg',
        'VOPR': sptk_dir + 'vopr',
        'B2MC': sptk_dir + 'b2mc',
        'X2X': sptk_dir + 'x2x',
        'VSUM': sptk_dir + 'vsum'}
    world_path = {
        'ANALYSIS': world_dir + 'analysis',
        'SYNTHESIS': world_dir + 'synth'}

    fw_coef = fw_alpha
    fl_coef = fl

    files = {'sp': base + '.sp',
             'mgc': base + '.mgc',
             'f0': base + '.f0',
             'lf0': base + '.lf0',
             'ap': base + '.ap',
             'bap': base + '.bap',
             'wav': base + '.wav'}

    mgc_file_name = files['mgc']

    cur_dir = os.getcwd()
    os.chdir(gen_dir)

    # post-filtering
    if do_post_filtering:
        line = "echo 1 1 "
        for i in range(2, mgc_dim):
            line = line + str(pf_coef) + " "

        run_process(
            '{line} | {x2x} +af > {weight}'.format(
                line=line, x2x=sptk_path['X2X'],
                weight=os.path.join(gen_dir, 'weight')))

        run_process(
            '{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | '
            '{c2acr} -m {co} -M 0 -l {fl} > {base_r0}'.format(
                freqt=sptk_path['FREQT'], order=mgc_dim - 1, fw=fw_coef,
                co=co_coef, mgc=files['mgc'], c2acr=sptk_path['C2ACR'],
                fl=fl_coef, base_r0=files['mgc'] + '_r0'))

        run_process(
            '{vopr} -m -n {order} < {mgc} {weight} | '
            '{freqt} -m {order} -a {fw} -M {co} -A 0 | '
            '{c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'.format(
                vopr=sptk_path['VOPR'], order=mgc_dim - 1, mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight'),
                freqt=sptk_path['FREQT'], fw=fw_coef, co=co_coef,
                c2acr=sptk_path['C2ACR'], fl=fl_coef,
                base_p_r0=files['mgc'] + '_p_r0'))

        run_process(
            '{vopr} -m -n {order} < {mgc} {weight} | '
            '{mc2b} -m {order} -a {fw} | '
            '{bcp} -n {order} -s 0 -e 0 > {base_b0}'.format(
                vopr=sptk_path['VOPR'], order=mgc_dim - 1, mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight'),
                mc2b=sptk_path['MC2B'], fw=fw_coef, bcp=sptk_path['BCP'],
                base_b0=files['mgc'] + '_b0'))

        run_process(
            '{vopr} -d < {base_r0} {base_p_r0} | '
            '{sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'.format(
                vopr=sptk_path['VOPR'], base_r0=files['mgc'] + '_r0',
                base_p_r0=files['mgc'] + '_p_r0', sopr=sptk_path['SOPR'],
                base_b0=files['mgc'] + '_b0',
                base_p_b0=files['mgc'] + '_p_b0'))

        run_process(
            '{vopr} -m -n {order} < {mgc} {weight} | '
            '{mc2b} -m {order} -a {fw} | '
            '{bcp} -n {order} -s 1 -e {order} | '
            '{merge} -n {order2} -s 0 -N 0 {base_p_b0} | '
            '{b2mc} -m {order} -a {fw} > {base_p_mgc}'.format(
                vopr=sptk_path['VOPR'], order=mgc_dim - 1, mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight'),
                mc2b=sptk_path['MC2B'], fw=fw_coef, bcp=sptk_path['BCP'],
                merge=sptk_path['MERGE'], order2=mgc_dim - 2,
                base_p_b0=files['mgc'] + '_p_b0', b2mc=sptk_path['B2MC'],
                base_p_mgc=files['mgc'] + '_p_mgc'))

        mgc_file_name = files['mgc'] + '_p_mgc'

    # Vocoder WORLD: lf0 -> f0, bap -> ap, mgc -> sp, then synthesise.
    run_process(
        '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | '
        '{x2x} +fd > {f0}'.format(
            sopr=sptk_path['SOPR'], lf0=files['lf0'],
            x2x=sptk_path['X2X'], f0=files['f0']))

    run_process(
        '{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
            sopr=sptk_path['SOPR'], bap=files['bap'],
            x2x=sptk_path['X2X'], ap=files['ap']))

    run_process(
        '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | '
        '{sopr} -d 32768.0 -P | {x2x} +fd > {sp}'.format(
            mgc2sp=sptk_path['MGC2SP'], alpha=fw_alpha, order=mgc_dim - 1,
            fl=fl, mgc=mgc_file_name, sopr=sptk_path['SOPR'],
            x2x=sptk_path['X2X'], sp=files['sp']))

    run_process(
        '{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
            synworld=world_path['SYNTHESIS'], fl=fl, sr=sr,
            f0=files['f0'], sp=files['sp'], ap=files['ap'],
            wav=files['wav']))

    run_process(
        'rm -f {ap} {sp} {f0} {bap} {lf0} {mgc} {mgc}_b0 {mgc}_p_b0 '
        '{mgc}_p_mgc {mgc}_p_r0 {mgc}_r0 {cmp} weight'.format(
            ap=files['ap'], sp=files['sp'], f0=files['f0'],
            bap=files['bap'], lf0=files['lf0'], mgc=files['mgc'],
            cmp=base + '.cmp'))

    os.chdir(cur_dir)