def interpolate_f0(self, f0_file):
    """Linearly interpolate unvoiced (<= 0) regions of an F0 track loaded from file.

    :param f0_file: path to a binary float file with one F0 value per frame
    :return: sequence of the same length with unvoiced gaps filled by linear
             interpolation between surrounding voiced frames; trailing
             unvoiced frames are held at the last voiced value
    """
    io_funcs = BinaryIOCollection()
    data = io_funcs.load_float_file(f0_file)
    # The loaded buffer is filled in place: only unvoiced entries are
    # overwritten, each derived from voiced (untouched) entries.
    ip_data = data

    frame_number = len(data)
    last_value = 0.0
    # BUG FIX: xrange is Python-2 only; use range.
    for i in range(frame_number):
        if data[i] <= 0.0:
            # find the next voiced frame j (if any)
            j = i + 1
            for j in range(i + 1, frame_number):
                if data[j] > 0.0:
                    break
            if j < frame_number - 1:
                if last_value > 0.0:
                    # voiced on both sides: linear ramp from data[i-1] to data[j]
                    step = (data[j] - data[i - 1]) / float(j - i)
                    for k in range(i, j):
                        ip_data[k] = data[i - 1] + step * (k - i + 1)
                else:
                    # leading unvoiced region: back-fill with first voiced value
                    for k in range(i, j):
                        ip_data[k] = data[j]
            else:
                # trailing unvoiced region: hold the last voiced value
                for k in range(i, frame_number):
                    ip_data[k] = last_value
        else:
            ip_data[i] = data[i]
            last_value = data[i]

    return ip_data
def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
    """Force every file in in_file_list to have the same number of frames
    as the corresponding file in ref_file_list, truncating or zero-padding
    at the end as needed. Adjusted files are overwritten in place.

    :param in_file_list: binary feature files to adjust
    :param ref_file_list: reference files providing the target frame counts
    :param in_dimension_dict: maps a stream name (file extension) to its
                              feature dimensionality
    """
    logger = logging.getLogger("acoustic_comp")
    logger.info('making equal number of lines...')

    io_funcs = BinaryIOCollection()

    utt_number = len(in_file_list)
    # BUG FIX: xrange is Python-2 only; use range.
    for i in range(utt_number):
        in_file_name = in_file_list[i]
        in_data_stream_name = in_file_name.split('.')[-1]
        in_feature_dim = in_dimension_dict[in_data_stream_name]
        in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

        ref_file_name = ref_file_list[i]
        ref_data_stream_name = ref_file_name.split('.')[-1]
        ref_feature_dim = in_dimension_dict[ref_data_stream_name]
        ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

        target_features = numpy.zeros((ref_frame_number, in_feature_dim))

        if in_frame_number == ref_frame_number:
            # already aligned: leave the file untouched
            continue
        elif in_frame_number > ref_frame_number:
            # too long: truncate extra frames
            target_features[0:ref_frame_number, ] = in_features[0:ref_frame_number, ]
        else:
            # too short: keep all frames, remainder stays zero-padded
            target_features[0:in_frame_number, ] = in_features[0:in_frame_number, ]
        io_funcs.array_to_binary_file(target_features, in_file_name)

    logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' %(in_data_stream_name, ref_data_stream_name))
def produce_nn_cmp(self, in_file_list, out_file_list):
    """Build composite acoustic (.cmp) files for NN training.

    For each input file: load mgc/bap/lf0, interpolate F0 through unvoiced
    regions, compute delta and acceleration of the interpolated lf0, and
    write the concatenation [mgc | lf0 | delta | acc | vuv | bap] to the
    matching output file.
    """
    logger = logging.getLogger("acoustic_norm")

    # standard delta / acceleration regression windows
    delta_win = [-0.5, 0.0, 0.5]
    acc_win = [1.0, -2.0, 1.0]

    file_number = len(in_file_list)
    logger.info('starting creation of %d files' % file_number)

    # hoisted out of the loop: no need to recreate the IO helper per file
    io_funcs = BinaryIOCollection()
    # BUG FIX: xrange is Python-2 only; use range.
    for i in range(file_number):
        mgc_data, bap_data, lf0_data = self.load_cmp_file(in_file_list[i])
        ip_lf0, vuv_vector = self.interpolate_f0(lf0_data)

        delta_lf0 = self.compute_delta(ip_lf0, delta_win)
        acc_lf0 = self.compute_delta(ip_lf0, acc_win)

        cmp_data = numpy.concatenate((mgc_data, ip_lf0, delta_lf0, acc_lf0, vuv_vector, bap_data), axis=1)
        io_funcs.array_to_binary_file(cmp_data, out_file_list[i])

    logger.info('finished creation of %d binary files' % file_number)
def process_utterance(self, utt):
    """Generate acoustic parameter streams for one utterance, dump each
    stream to a binary file under <voice>/output/cmp, and resynthesise a
    waveform with WORLD.
    """
    if not self.trained:
        print('WARNING: Cannot apply processor %s till model is trained' % (self.processor_name))
        return

    label = utt.get_filename(self.input_label_filetype)
    owave = utt.get_filename(self.output_filetype)

    # streams is a dict of acoustic parameter matrices, e.g. {bap, lf0, mgc, vuv}
    streams = self.model.generate(label, variance_expansion=self.variance_expansion, \
                         fill_unvoiced_gaps=self.fill_unvoiced_gaps)

    directory = os.path.join(self.voice_resources.path['voice'], 'output', 'cmp')
    if not os.path.exists(directory):
        os.makedirs(directory)

    io = BinaryIOCollection()
    for name, data in streams.items():
        # BUG FIX: local renamed from 'file' to avoid shadowing the builtin
        out_path = os.path.join(directory, utt.data.attrib['utterance_name'] + '.' + name)
        io.array_to_binary_file(data, out_path)

    self.world_resynth(streams, owave)
def compute_global_variance(self, file_list, feat_dim, save_dir):
    """Compute per-dimension global variance (GV) statistics over a corpus.

    For each file the per-dimension variance over its frames is computed;
    the mean and variance of those per-file variances are then written as
    raw binary files 'gv_mean' and 'gv_var' in save_dir.
    """
    logger = logging.getLogger("compute gv")
    # BUG FIX: the original message contained %d but supplied no argument
    logger.info('computed global variance of length %d' % feat_dim)

    all_std_vector = numpy.zeros((len(file_list), feat_dim))

    io_funcs = BinaryIOCollection()
    for filenum, file_name in enumerate(file_list):
        features = io_funcs.load_binary_file(file_name, feat_dim)
        # per-file, per-dimension variance
        all_std_vector[filenum, :] = numpy.var(features, axis=0)

    # mean and variance across files of the per-file variances
    # BUG FIX: Python-3 print functions (were py2 print statements)
    print(all_std_vector.shape)
    global_mean = numpy.mean(all_std_vector, axis=0)
    global_var = numpy.var(all_std_vector, axis=0)

    gv_mean_name = os.path.join(save_dir, 'gv_mean')
    # context managers guarantee the files are closed even on error
    with open(gv_mean_name, 'wb') as fid:
        global_mean.tofile(fid)

    gv_var_name = os.path.join(save_dir, 'gv_var')
    with open(gv_var_name, 'wb') as fid:
        global_var.tofile(fid)

    print(global_mean)
    print(global_var)
def compute_std(self, file_list, mean_vector, start_index, end_index):
    """Compute the per-dimension standard deviation over the column range
    [start_index, end_index) of all frames in file_list, given mean_vector.

    :return: array of shape (1, end_index - start_index)
    """
    local_feature_dimension = end_index - start_index

    # BUG FIX: the accumulator must be as wide as the requested sub-range,
    # not self.feature_dimension, otherwise the += below fails whenever a
    # proper sub-range is requested.
    std_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)

        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

        std_vector += numpy.reshape(numpy.sum((features[:, start_index:end_index] - mean_matrix) ** 2, axis=0), (1, local_feature_dimension))
        all_frame_number += current_frame_number

    std_vector /= float(all_frame_number)
    std_vector = std_vector ** 0.5

    self.logger.info('computed std vector of length %d' % std_vector.shape[1] )
    self.logger.info(' std: %s' % std_vector)

    return std_vector
def find_min_max_values(self, in_file_list, start_index, end_index):
    """Find per-dimension min/max over columns [start_index, end_index)
    across all files; results are stored on self.min_vector and
    self.max_vector with shape (1, end_index - start_index).
    """
    local_feature_dimension = end_index - start_index

    file_number = len(in_file_list)
    min_value_matrix = numpy.zeros((file_number, local_feature_dimension))
    max_value_matrix = numpy.zeros((file_number, local_feature_dimension))

    io_funcs = BinaryIOCollection()
    # BUG FIX: xrange is Python-2 only; use range.
    for i in range(file_number):
        features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)
        temp_min = numpy.amin(features[:, start_index:end_index], axis = 0)
        temp_max = numpy.amax(features[:, start_index:end_index], axis = 0)

        min_value_matrix[i, ] = temp_min
        max_value_matrix[i, ] = temp_max

    self.min_vector = numpy.amin(min_value_matrix, axis = 0)
    self.max_vector = numpy.amax(max_value_matrix, axis = 0)
    self.min_vector = numpy.reshape(self.min_vector, (1, local_feature_dimension))
    self.max_vector = numpy.reshape(self.max_vector, (1, local_feature_dimension))

    self.logger.info('found min/max values of length %d:' % local_feature_dimension)
    self.logger.info(' min: %s' % self.min_vector)
    self.logger.info(' max: %s' % self.max_vector)
def load_next_utterance_S2SML(self):
    """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

    Returns ((shared_x, shared_y, shared_d), temp_set_x, temp_set_y,
    temp_set_d, temp_set_af): word-level input features, output frames,
    integer durations, and an additional syllable+phone feature matrix.
    """

    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
    dur_features, dur_frame_number = io_fun.load_binary_file_frame(self.dur_files_list[self.file_index], 1)

    ### MLU features sub-division ###
    # Label frames carry concatenated multi-level linguistic features;
    # self.MLU_div holds the column ranges (two slices per level, joined).
    temp_set_MLU = in_features[0:lab_frame_number, ]
    temp_set_y = out_features[0:out_frame_number, ]

    temp_set_phone = numpy.concatenate([temp_set_MLU[:, self.MLU_div['phone'][0]: self.MLU_div['phone'][1]], temp_set_MLU[:, self.MLU_div['phone'][2]: self.MLU_div['phone'][3]]], axis = 1)
    temp_set_syl = numpy.concatenate([temp_set_MLU[:, self.MLU_div['syl'][0]: self.MLU_div['syl'][1]], temp_set_MLU[:, self.MLU_div['syl'][2]: self.MLU_div['syl'][3]]], axis = 1)
    temp_set_word = numpy.concatenate([temp_set_MLU[:, self.MLU_div['word'][0]: self.MLU_div['word'][1]], temp_set_MLU[:, self.MLU_div['word'][2]: self.MLU_div['word'][3] ]], axis = 1)

    ### duration array sub-division ###
    # Assumed layout of the duration vector (TODO confirm against writer):
    # [word durations..., syllable durations..., phone durations...], where
    # the last lab_frame_number entries are the phone-level durations.
    dur_features = numpy.reshape(dur_features, (-1, ))
    temp_set_d = dur_features.astype(int)
    dur_word_syl = temp_set_d[0: -lab_frame_number]

    num_ph = lab_frame_number
    # Syllable durations (in phones) must sum to the phone count: scanning
    # the reversed array for that cumulative sum locates the word/syllable
    # boundary.
    num_syl = (numpy.where(numpy.cumsum(dur_word_syl[::-1])==lab_frame_number)[0][0] + 1)
    num_words = len(dur_word_syl) - num_syl

    temp_set_dur_phone = temp_set_d[-num_ph:]
    temp_set_dur_word = dur_word_syl[0: num_words]
    temp_set_dur_syl = dur_word_syl[num_words: ]

    ### additional feature matrix (syllable+phone+frame=432) ###
    num_frames = sum(temp_set_dur_phone)
    temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))

    # syllable-level block: the last phone row of each syllable
    temp_set_af[0: num_syl, self.MLU_div['length'][0]: self.MLU_div['length'][1] ] = temp_set_syl[numpy.cumsum(temp_set_dur_syl)-1]
    # phone-level block
    temp_set_af[0: num_ph, self.MLU_div['length'][1]: self.MLU_div['length'][2]] = temp_set_phone

    ### input word feature matrix ###
    # For each word, the cumulative syllable-duration sum up to its final
    # syllable gives its boundary in phone frames.
    temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
    syl_bound = numpy.cumsum(temp_set_dur_word)
    for indx in range(num_words):
        temp_set_dur_word_segments[indx] = int(sum(temp_set_dur_syl[0: syl_bound[indx]]))
    temp_set_x = temp_set_word[temp_set_dur_word_segments-1]

    ### rest of the code similar to S2S ###
    self.file_index += 1

    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

    return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d, temp_set_af
def read_and_transform_data_from_file_list(in_file_list, dim, seq_length=200, merge_size=1):
    """Read binary feature files into one buffer and reshape the result into
    fixed-length rows of seq_length frames.

    :param in_file_list: list of binary feature file paths
    :param dim: feature dimensionality of each file
    :param seq_length: frames per output sequence row
    :param merge_size: number of consecutive files packed together before
        the write index is padded up to the next seq_length boundary
    :return: matrix of shape (num_of_samples, seq_length)

    NOTE(review): the final reshape to (num_of_samples, seq_length) only
    succeeds when dim == 1 -- confirm expected dim at call sites.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(in_file_list)

    # FRAME_BUFFER_SIZE is a module-level capacity constant; assumes the
    # total number of frames fits in the buffer -- TODO confirm
    temp_set = np.zeros((FRAME_BUFFER_SIZE, dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        in_file_name = in_file_list[i]
        in_features, frame_number = io_funcs.load_binary_file_frame( in_file_name, dim)
        base_file_name = os.path.basename(in_file_name).split(".")[0]

        temp_set[current_index:current_index + frame_number, ] = in_features
        current_index += frame_number

        # after every merge_size files, round the write index up to a
        # multiple of seq_length (the skipped rows remain zero padding)
        if (i + 1) % merge_size == 0:
            current_index = seq_length * (int( np.ceil(float(current_index) / float(seq_length))))

        drawProgressBar(i + 1, num_of_utt)

    sys.stdout.write("\n")

    num_of_samples = int(np.ceil(float(current_index) / float(seq_length)))

    temp_set = temp_set[0:num_of_samples * seq_length, ]
    temp_set = temp_set.reshape(num_of_samples, seq_length)

    return temp_set
def load_next_utterance_CTC(self):
    """Load a single utterance for CTC training.

    The input is the label feature matrix; the target is the per-frame
    argmax label sequence with the blank symbol (index n_outs) inserted
    before, between and after every label.
    """
    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(
        self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(
        self.y_files_list[self.file_index], self.n_outs)

    temp_set_x = in_features[0:lab_frame_number, ]

    # build the blank-interleaved target sequence piecewise, then join once
    pieces = [numpy.array([self.n_outs])]
    for lab in numpy.argmax(out_features, axis=1):
        pieces.append(numpy.array([lab, self.n_outs]))
    temp_set_y = numpy.concatenate(pieces, axis=0)

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = theano.shared(numpy.asarray(temp_set_y, dtype='int32'), name='y', borrow=True)

    shared_set_xy = (shared_set_x, shared_set_y)
    return shared_set_xy, temp_set_x, temp_set_y
def normal_standardization(self, in_file_list, out_file_list, feature_dimension):
    """Mean/variance-normalise every input file and write the result to the
    corresponding output path.

    :return: (mean_vector, std_vector) computed over the whole file list
    """
    self.feature_dimension = feature_dimension

    mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
    std_vector = self.compute_std(in_file_list, mean_vector, 0, feature_dimension)

    io_funcs = BinaryIOCollection()
    for idx in range(len(in_file_list)):
        features, frame_count = io_funcs.load_binary_file_frame(
            in_file_list[idx], self.feature_dimension)

        # broadcast the statistics over every frame of this file
        mean_matrix = numpy.tile(mean_vector, (frame_count, 1))
        std_matrix = numpy.tile(std_vector, (frame_count, 1))

        normalised = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(normalised, out_file_list[idx])

    return mean_vector, std_vector
def find_min_max_values(self, in_file_list, start_index, end_index):
    """Scan all files and record the per-dimension min/max over the column
    range [start_index, end_index); results are stored on self.min_vector
    and self.max_vector with shape (1, end_index - start_index).
    """
    sub_dim = end_index - start_index
    n_files = len(in_file_list)

    per_file_min = numpy.zeros((n_files, sub_dim))
    per_file_max = numpy.zeros((n_files, sub_dim))

    io_funcs = BinaryIOCollection()
    for idx, file_name in enumerate(in_file_list):
        features = io_funcs.load_binary_file(file_name, self.feature_dimension)
        sub_block = features[:, start_index:end_index]
        per_file_min[idx, ] = numpy.amin(sub_block, axis=0)
        per_file_max[idx, ] = numpy.amax(sub_block, axis=0)

    # reduce across files, then shape as row vectors
    self.min_vector = numpy.reshape(numpy.amin(per_file_min, axis=0), (1, sub_dim))
    self.max_vector = numpy.reshape(numpy.amax(per_file_max, axis=0), (1, sub_dim))

    self.logger.info('found min/max values of length %d:' % sub_dim)
    self.logger.info(' min: %s' % self.min_vector)
    self.logger.info(' max: %s' % self.max_vector)
def compute_mean(self, file_list, start_index, end_index):
    """Return the per-dimension mean over columns [start_index, end_index)
    of all frames in file_list, as an array of shape (1, end - start)."""
    sub_dim = end_index - start_index
    running_total = numpy.zeros((1, sub_dim))
    frames_seen = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, n_frames = io_funcs.load_binary_file_frame(
            file_name, self.feature_dimension)
        running_total += numpy.sum(features[:, start_index:end_index], axis=0).reshape((1, sub_dim))
        frames_seen += n_frames

    mean_vector = running_total / float(frames_seen)

    self.logger.info('computed mean vector of length %d :' % mean_vector.shape[1])
    self.logger.info(' mean: %s' % mean_vector)

    return mean_vector
def compute_std(self, file_list, mean_vector, start_index, end_index):
    """Compute the per-dimension standard deviation over the column range
    [start_index, end_index) of all frames in file_list, given mean_vector.

    :return: array of shape (1, end_index - start_index)
    """
    local_feature_dimension = end_index - start_index

    # BUG FIX: the accumulator must match the sub-range width, not
    # self.feature_dimension, or the += below breaks for real sub-ranges.
    std_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(
            file_name, self.feature_dimension)

        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

        std_vector += numpy.reshape(
            numpy.sum(
                (features[:, start_index:end_index] - mean_matrix)**2,
                axis=0), (1, local_feature_dimension))
        all_frame_number += current_frame_number

    std_vector /= float(all_frame_number)
    std_vector = std_vector**0.5

    self.logger.info('computed std vector of length %d' % std_vector.shape[1])
    self.logger.info(' std: %s' % std_vector)

    return std_vector
def feature_denormalisation(self, in_file_list, out_file_list, mean_vector, std_vector):
    """Undo mean/variance normalisation: write features * std + mean for
    each input file to the matching output file.

    :raises AssertionError: if the file lists differ in length or the
        statistic vectors do not match self.feature_dimension
    """
    # BUG FIX: the original referenced an undefined name 'logger'
    logger = logging.getLogger('feature_normalisation')

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical(
            'The input and output file numbers are not the same! %d vs %d' %
            (len(in_file_list), len(out_file_list)))
        raise
    try:
        assert mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
    except AssertionError:
        logger.critical(
            'the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature'
        )
        raise

    # BUG FIX: xrange is Python-2 only; use range.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(
            in_file_list[i], self.feature_dimension)

        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

        norm_features = features * std_matrix + mean_matrix

        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def load_next_utterance(self):
    """Load the input/output frames for one utterance (utterance-by-utterance
    loading, e.g. for sequential training).

    :return: (temp_set_x, temp_set_y) truncated to the common frame count
    :raises ValueError: if label and acoustic frame counts differ by >= 5
    """
    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

    frame_number = lab_frame_number
    if abs(lab_frame_number - out_frame_number) < 5:
        ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
        if lab_frame_number > out_frame_number:
            frame_number = out_frame_number
    else:
        base_file_name = self.x_files_list[self.file_index].split('/')[-1].split('.')[0]
        message = "the number of frames in label and acoustic features are different: %d vs %d (%s)" % (
            lab_frame_number, out_frame_number, base_file_name)
        logging.info(message)
        # BUG FIX: a bare `raise` here had no active exception and itself
        # raised RuntimeError; raise an informative exception instead.
        raise ValueError(message)

    temp_set_y = out_features[0:frame_number, ]
    temp_set_x = in_features[0:frame_number, ]

    self.file_index += 1

    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    return temp_set_x, temp_set_y
def find_min_max_values(self, in_file_list):
    """Scan every file and store the global per-dimension min/max on
    self.min_vector / self.max_vector, each shaped (1, feature_dimension)."""
    logger = logging.getLogger("acoustic_norm")

    dim = self.feature_dimension
    n_files = len(in_file_list)
    per_file_min = numpy.zeros((n_files, dim))
    per_file_max = numpy.zeros((n_files, dim))

    io_funcs = BinaryIOCollection()
    for idx, file_name in enumerate(in_file_list):
        features = io_funcs.load_binary_file(file_name, dim)
        per_file_min[idx, ] = numpy.amin(features, axis=0)
        per_file_max[idx, ] = numpy.amax(features, axis=0)

    # reduce across files, then shape as row vectors
    self.min_vector = numpy.reshape(numpy.amin(per_file_min, axis=0), (1, dim))
    self.max_vector = numpy.reshape(numpy.amax(per_file_max, axis=0), (1, dim))

    logger.info('across %d files found min/max values of length %d:' % (n_files, dim))
    logger.info(' min: %s' % self.min_vector)
    logger.info(' max: %s' % self.max_vector)
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Load paired input/output binary feature files.

    :return: (temp_set_x, temp_set_y, file_length_dict). The sets are dicts
        keyed by base filename when sequential_training is True, otherwise
        stacked matrices; file_length_dict maps frame count -> list of base
        filenames.
    """
    io_funcs = BinaryIOCollection()

    utt_len = len(inp_file_list)
    file_length_dict = {}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        temp_set_x = np.empty((BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(utt_len):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]

        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(
            inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(
            out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        if abs(inp_frame_number - out_frame_number) > 5:
            # BUG FIX: Python-3 print function (was a py2 print statement)
            # and a non-zero exit status for this fatal mismatch.
            print('the number of frames in input and output features are different: %d vs %d (%s)' % (
                inp_frame_number, out_frame_number, base_file_name))
            sys.exit(1)
        else:
            frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features
            temp_set_y[base_file_name] = out_features
        else:
            temp_set_x[current_index:current_index + frame_number, ] = inp_features
            temp_set_y[current_index:current_index + frame_number, ] = out_features
            current_index += frame_number

        if frame_number not in file_length_dict:
            file_length_dict[frame_number] = [base_file_name]
        else:
            file_length_dict[frame_number].append(base_file_name)

        print_status(i, utt_len)

    sys.stdout.write("\n")

    if not sequential_training:
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
def compute_std(self, file_list, mean_vector):
    """Per-dimension standard deviation across all frames in file_list,
    given the precomputed mean_vector; returns shape (1, feature_dimension)."""
    logger = logging.getLogger("acoustic_norm")

    dim = self.feature_dimension
    sq_accum = numpy.zeros((1, dim))
    frames_seen = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features = io_funcs.load_binary_file(file_name, dim)
        n_frames = features.size // dim
        # accumulate squared deviation from the corpus mean
        centred = features - numpy.tile(mean_vector, (n_frames, 1))
        sq_accum += numpy.sum(centred ** 2, axis=0).reshape((1, dim))
        frames_seen += n_frames

    std_vector = (sq_accum / float(frames_seen)) ** 0.5

    logger.info('computed std vector of length %d' % std_vector.shape[1])
    logger.info(' std: %s' % std_vector)

    return std_vector
def compute_std(self, file_list, mean_vector):
    """Return the (1, feature_dimension) standard-deviation vector of all
    frames across file_list, using the supplied mean_vector."""
    logger = logging.getLogger("acoustic_norm")

    std_vector = numpy.zeros((1, self.feature_dimension))
    all_frame_number = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features = io_funcs.load_binary_file(file_name, self.feature_dimension)
        current_frame_number = features.size // self.feature_dimension
        deviation = features - numpy.tile(mean_vector, (current_frame_number, 1))
        std_vector = std_vector + numpy.reshape(
            numpy.sum(deviation * deviation, axis=0),
            (1, self.feature_dimension))
        all_frame_number += current_frame_number

    # mean of squared deviations, then square root
    std_vector = numpy.sqrt(std_vector / float(all_frame_number))

    logger.info('computed std vector of length %d' % std_vector.shape[1])
    logger.info(' std: %s' % std_vector)

    return std_vector
def denormalise_data(self, in_file_list, out_file_list):
    """Map features back from the normalised target range
    [target_min_value, target_max_value] to their original min/max range,
    writing one output file per input file."""
    logger = logging.getLogger("acoustic_norm")

    file_number = len(in_file_list)
    logger.info('MinMaxNormalisation.denormalise_data for %d files' % file_number)

    dim = self.feature_dimension

    fea_max_min_diff = numpy.reshape(self.max_vector - self.min_vector, (1, dim))

    target_max_min_diff = numpy.zeros((1, dim))
    target_max_min_diff.fill(self.target_max_value - self.target_min_value)

    # guard constant (zero-range) dimensions against divide-by-zero
    degenerate = fea_max_min_diff <= 0.0
    target_max_min_diff[degenerate] = 1.0
    fea_max_min_diff[degenerate] = 1.0

    io_funcs = BinaryIOCollection()
    for i in range(file_number):
        features = io_funcs.load_binary_file(in_file_list[i], dim)
        frame_number = features.size // dim

        fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
        target_min_matrix = numpy.tile(self.target_min_value, (frame_number, dim))

        scale = numpy.tile(fea_max_min_diff, (frame_number, 1)) / numpy.tile(target_max_min_diff, (frame_number, 1))

        denorm_features = scale * (features - target_min_matrix) + fea_min_matrix
        io_funcs.array_to_binary_file(denorm_features, out_file_list[i])
def extract_dur_features(self, orig_file, output_file):
    """Extract the duration columns from a label matrix file.

    Reads orig_file as a matrix, records the label dimensionality (all
    columns except the final five duration columns) on
    self.label_dimension, and writes those last five columns to output_file.
    """
    io_funcs = BinaryIOCollection()
    totalMat = io_funcs.file2matrix(orig_file)

    # the final 5 columns hold durations; everything before is the label
    self.label_dimension = totalMat.shape[1] - 5

    io_funcs.array_to_binary_file(totalMat[:, -5:], output_file)
def compute_std(self, file_list, mean_vector, start_index, end_index):
    """Compute the per-dimension standard deviation over the column range
    [start_index, end_index) of all frames in file_list.

    Side effect: stores the result on self.std_vector.
    :return: array of shape (1, end_index - start_index)
    """
    logger = logging.getLogger('feature_normalisation')

    local_feature_dimension = end_index - start_index

    # BUG FIX: the accumulator must be as wide as the requested sub-range,
    # not self.feature_dimension, or the += below breaks for sub-ranges.
    std_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)

        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

        std_vector += numpy.reshape(numpy.sum((features[:, start_index:end_index] - mean_matrix) ** 2, axis=0), (1, local_feature_dimension))
        all_frame_number += current_frame_number

    std_vector /= float(all_frame_number)
    std_vector = std_vector ** 0.5

    logger.info('computed std vector of length %d' % std_vector.shape[1] )
    logger.info(' std: %s' % std_vector)

    self.std_vector = std_vector
    return std_vector
def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
    """Run the trained model over held-out test inputs and write the
    denormalised predictions to the corresponding generation files."""
    io_funcs = BinaryIOCollection()

    test_file_number = len(gen_test_file_list)
    print("generating features on held-out test data...")

    for idx, out_path in enumerate(gen_test_file_list):
        test_id = os.path.splitext(os.path.basename(out_path))[0]

        model_input = test_x[test_id]
        n_rows = model_input.shape[0]

        # shape the input for the model variant in use
        if stateful:
            model_input = data_utils.get_stateful_input(
                model_input, self.seq_length, self.batch_size)
        elif sequential_training:
            model_input = np.reshape(model_input, (1, n_rows, self.n_in))

        predictions = self.model.predict(model_input)
        if sequential_training:
            predictions = np.reshape(predictions, (n_rows, self.n_out))

        data_utils.denorm_data(predictions, out_scaler)
        io_funcs.array_to_binary_file(predictions, out_path)
        data_utils.drawProgressBar(idx + 1, test_file_number)

    sys.stdout.write("\n")
def compute_mean(self, file_list, start_index, end_index):
    """Per-dimension mean over columns [start_index, end_index) of every
    frame in file_list; also stored on self.mean_vector.

    :return: array of shape (1, end_index - start_index)
    """
    logger = logging.getLogger('feature_normalisation')

    sub_dim = end_index - start_index
    running_total = numpy.zeros((1, sub_dim))
    frames_seen = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, n_frames = io_funcs.load_binary_file_frame(
            file_name, self.feature_dimension)
        running_total += numpy.sum(features[:, start_index:end_index], axis=0).reshape((1, sub_dim))
        frames_seen += n_frames

    mean_vector = running_total / float(frames_seen)

    logger.info('computed mean vector of length %d :' % mean_vector.shape[1])
    logger.info(' mean: %s' % mean_vector)

    self.mean_vector = mean_vector
    return mean_vector
def compute_mean(self, file_list):
    """Return the (1, feature_dimension) mean vector over all frames of all
    files in file_list."""
    logger = logging.getLogger("acoustic_norm")

    dim = self.feature_dimension
    running_sum = numpy.zeros((1, dim))
    total_frames = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features = io_funcs.load_binary_file(file_name, dim)
        n_frames = features.size // dim
        running_sum += numpy.reshape(numpy.sum(features, axis=0), (1, dim))
        total_frames += n_frames

    mean_vector = running_sum / float(total_frames)

    logger.info('computed mean vector of length %d :' % mean_vector.shape[1])
    logger.info(' mean: %s' % mean_vector)

    return mean_vector
def load_next_utterance_S2SML(self):
    """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

    Returns ((shared_x, shared_y, shared_d), temp_set_x, temp_set_y,
    temp_set_d, temp_set_af): word-level input features, output frames,
    integer durations, and an additional syllable+phone feature matrix.
    """

    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
    dur_features, dur_frame_number = io_fun.load_binary_file_frame(self.dur_files_list[self.file_index], 1)

    ### MLU features sub-division ###
    # Label frames carry concatenated multi-level linguistic features;
    # self.MLU_div holds the column ranges (two slices per level, joined).
    temp_set_MLU = in_features[0:lab_frame_number, ]
    temp_set_y = out_features[0:out_frame_number, ]

    temp_set_phone = numpy.concatenate([temp_set_MLU[:, self.MLU_div['phone'][0]: self.MLU_div['phone'][1]], temp_set_MLU[:, self.MLU_div['phone'][2]: self.MLU_div['phone'][3]]], axis = 1)
    temp_set_syl = numpy.concatenate([temp_set_MLU[:, self.MLU_div['syl'][0]: self.MLU_div['syl'][1]], temp_set_MLU[:, self.MLU_div['syl'][2]: self.MLU_div['syl'][3]]], axis = 1)
    temp_set_word = numpy.concatenate([temp_set_MLU[:, self.MLU_div['word'][0]: self.MLU_div['word'][1]], temp_set_MLU[:, self.MLU_div['word'][2]: self.MLU_div['word'][3] ]], axis = 1)

    ### duration array sub-division ###
    # Assumed layout of the duration vector (TODO confirm against writer):
    # [word durations..., syllable durations..., phone durations...], where
    # the last lab_frame_number entries are the phone-level durations.
    dur_features = numpy.reshape(dur_features, (-1, ))
    temp_set_d = dur_features.astype(int)
    dur_word_syl = temp_set_d[0: -lab_frame_number]

    num_ph = lab_frame_number
    # Syllable durations (in phones) must sum to the phone count: scanning
    # the reversed array for that cumulative sum locates the word/syllable
    # boundary.
    num_syl = (numpy.where(numpy.cumsum(dur_word_syl[::-1])==lab_frame_number)[0][0] + 1)
    num_words = len(dur_word_syl) - num_syl

    temp_set_dur_phone = temp_set_d[-num_ph:]
    temp_set_dur_word = dur_word_syl[0: num_words]
    temp_set_dur_syl = dur_word_syl[num_words: ]

    ### additional feature matrix (syllable+phone+frame=432) ###
    num_frames = sum(temp_set_dur_phone)
    temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))

    # syllable-level block: the last phone row of each syllable
    temp_set_af[0: num_syl, self.MLU_div['length'][0]: self.MLU_div['length'][1] ] = temp_set_syl[numpy.cumsum(temp_set_dur_syl)-1]
    # phone-level block
    temp_set_af[0: num_ph, self.MLU_div['length'][1]: self.MLU_div['length'][2]] = temp_set_phone

    ### input word feature matrix ###
    # For each word, the cumulative syllable-duration sum up to its final
    # syllable gives its boundary in phone frames.
    temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
    syl_bound = numpy.cumsum(temp_set_dur_word)
    # BUG FIX: xrange is Python-2 only; use range.
    for indx in range(num_words):
        temp_set_dur_word_segments[indx] = int(sum(temp_set_dur_syl[0: syl_bound[indx]]))
    temp_set_x = temp_set_word[temp_set_dur_word_segments-1]

    ### rest of the code similar to S2S ###
    self.file_index += 1

    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

    return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d, temp_set_af
def load_next_utterance_CTC(self):
    """Load one utterance for CTC training: the label frames as input and a
    blank-interleaved target label sequence (blank index == n_outs)."""
    reader = BinaryIOCollection()

    in_features, lab_frame_number = reader.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = reader.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

    temp_set_x = in_features[0:lab_frame_number, ]

    # blank symbol before, between and after every argmax label
    blank = self.n_outs
    interleaved = [blank]
    for label in numpy.argmax(out_features, axis=1):
        interleaved.extend([label, blank])
    temp_set_y = numpy.array(interleaved)

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = theano.shared(numpy.asarray(temp_set_y, dtype='int32'), name='y', borrow=True)

    shared_set_xy = (shared_set_x, shared_set_y)
    return shared_set_xy, temp_set_x, temp_set_y
def extract_durational_features(self, dur_file_name=None, dur_data=None):
    """Expand phone-level durations into frame-level duration features.

    Either dur_file_name (a binary file of one duration value per phone) or
    dur_data (a preloaded sequence of durations) must be supplied; the file
    takes precedence when both are given.

    :return: array of shape (total_num_of_frames, self.frame_feature_size).
        Rows are populated only when self.subphone_feats == "coarse_coding";
        otherwise the zero-initialised array is returned unchanged.
    """
    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1 ## hard coded for now
        dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    ph_count = len(dur_data)
    total_num_of_frames = int(sum(dur_data))

    duration_feature_array = numpy.zeros(
        (total_num_of_frames, self.frame_feature_size))

    frame_index = 0
    for i in range(ph_count):
        frame_number = int(dur_data[i])
        if self.subphone_feats == "coarse_coding":
            # columns 0-2: coarse-coding position tracks for this phone;
            # column 3: the phone's total frame count
            cc_feat_matrix = self.extract_coarse_coding_features_relative(
                frame_number)

            for j in range(frame_number):
                duration_feature_array[frame_index, 0] = cc_feat_matrix[j, 0]
                duration_feature_array[frame_index, 1] = cc_feat_matrix[j, 1]
                duration_feature_array[frame_index, 2] = cc_feat_matrix[j, 2]
                duration_feature_array[frame_index, 3] = float(frame_number)
                frame_index += 1

    return duration_feature_array
def get_file_lengths(self):
    """Scan every (label, acoustic) file pair and record frame counts.

    Populates ``self.file_length_dict`` with three views:
    'framenum2utt' (frame count -> list of utt ids), 'utt2framenum'
    (utt id -> frame count) and 'utt2index' (utt id -> list position).
    """
    io_funcs = BinaryIOCollection()
    self.file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}, 'utt2index':{}}

    ### read file by file ###
    while True:
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0
            break
        in_features, lab_frame_number = io_funcs.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
        base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
        if abs(lab_frame_number - out_frame_number) < 5:
            ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            frame_number = min(lab_frame_number, out_frame_number)
        else:
            self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
            # NOTE(review): bare `raise` outside an except block raises
            # RuntimeError('No active exception to re-raise') — still fatal,
            # but the message is unhelpful; consider raising a real exception.
            raise
        if frame_number not in self.file_length_dict['framenum2utt']:
            self.file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            self.file_length_dict['framenum2utt'][frame_number].append(base_file_name)
        self.file_length_dict['utt2framenum'][base_file_name] = frame_number
        self.file_length_dict['utt2index'][base_file_name] = self.file_index
        self.file_index += 1
    self.reset()
def compute_distortion(self, file_id_list, reference_dir, generation_dir, file_ext, feature_dim):
    """Compute an objective distortion between reference and generated files.

    Behaviour depends on ``file_ext``:
      '.lf0' -> returns (RMSE over voiced frames, F0 correlation, V/UV error rate)
      '.dur' -> returns (duration RMSE, duration correlation)
      '.mgc' -> MSE excluding the 0th (energy) coefficient
      other  -> plain MSE over all dimensions
    Reference and generated files must have identical frame counts.
    """
    total_voiced_frame_number = 0
    distortion = 0.0
    vuv_error = 0
    total_frame_number = 0

    io_funcs = BinaryIOCollection()
    # Accumulators used for the corpus-level correlation measures.
    ref_all_files_data = numpy.reshape(numpy.array([]), (-1,1))
    gen_all_files_data = numpy.reshape(numpy.array([]), (-1,1))

    for file_id in file_id_list:
        ref_file_name = reference_dir + '/' + file_id + file_ext
        gen_file_name = generation_dir + '/' + file_id + file_ext

        ref_data, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, feature_dim)
        gen_data, gen_frame_number = io_funcs.load_binary_file_frame(gen_file_name, feature_dim)

        if ref_frame_number != gen_frame_number:
            self.logger.critical("The number of frames is not the same: %d vs %d. Error in compute_distortion.py\n." %(ref_frame_number, gen_frame_number))
            # NOTE(review): bare `raise` outside except -> RuntimeError.
            raise

        if file_ext == '.lf0':
            ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
            gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
            temp_distortion, temp_vuv_error, voiced_frame_number = self.compute_f0_mse(ref_data, gen_data)
            vuv_error += temp_vuv_error
            total_voiced_frame_number += voiced_frame_number
        elif file_ext == '.dur':
            # Durations: compare total (summed over states) duration per phone;
            # per-file distortion is skipped, only the corpus stats are used.
            ref_data = numpy.reshape(numpy.sum(ref_data, axis=1), (-1, 1))
            gen_data = numpy.reshape(numpy.sum(gen_data, axis=1), (-1, 1))
            ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
            gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
            continue;
        elif file_ext == '.mgc':
            # Skip dimension 0 (energy) when scoring spectral features.
            temp_distortion = self.compute_mse(ref_data[:, 1:feature_dim], gen_data[:, 1:feature_dim])
        else:
            temp_distortion = self.compute_mse(ref_data, gen_data)

        distortion += temp_distortion
        total_frame_number += ref_frame_number

    if file_ext == '.dur':
        dur_rmse = self.compute_rmse(ref_all_files_data, gen_all_files_data)
        dur_corr = self.compute_corr(ref_all_files_data, gen_all_files_data)
        return dur_rmse, dur_corr
    elif file_ext == '.lf0':
        # RMSE over voiced frames only; V/UV error rate over all frames.
        distortion /= float(total_voiced_frame_number)
        vuv_error /= float(total_frame_number)
        distortion = numpy.sqrt(distortion)
        f0_corr = self.compute_f0_corr(ref_all_files_data, gen_all_files_data)
        return distortion, f0_corr, vuv_error
    else:
        distortion /= float(total_frame_number)
        return distortion
def read_and_transform_data_from_file_list(in_file_list, dim, seq_length=200, merge_size=1):
    """Concatenate features from a file list and chop into fixed-length sequences.

    Every ``merge_size`` utterances the write cursor is padded up to the next
    multiple of ``seq_length`` so that merged groups do not straddle a
    sequence boundary.  Returns an array of shape (num_sequences, seq_length).

    NOTE(review): the final ``reshape(num_of_samples, seq_length)`` is only
    valid when ``dim == 1`` — confirm callers never pass dim > 1.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(in_file_list)
    # FRAME_BUFFER_SIZE is a module-level capacity bound for the scratch buffer.
    temp_set = np.zeros((FRAME_BUFFER_SIZE, dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        in_file_name = in_file_list[i]
        in_features, frame_number = io_funcs.load_binary_file_frame(in_file_name, dim)
        base_file_name = os.path.basename(in_file_name).split(".")[0]
        temp_set[current_index:current_index+frame_number, ] = in_features
        current_index += frame_number
        if (i+1)%merge_size == 0:
            # Pad the cursor to a sequence boundary; the gap stays zero-filled.
            current_index = seq_length * (int(np.ceil(float(current_index)/float(seq_length))))
        drawProgressBar(i+1, num_of_utt)
    sys.stdout.write("\n")

    num_of_samples = int(np.ceil(float(current_index)/float(seq_length)))
    temp_set = temp_set[0: num_of_samples*seq_length, ]
    temp_set = temp_set.reshape(num_of_samples, seq_length)
    return temp_set
def normalise_data(self, in_file_list, out_file_list):
    """Min-max normalise each file into [target_min_value, target_max_value].

    Uses the previously computed ``self.min_vector`` / ``self.max_vector``.
    Dimensions with zero (or negative) range are left effectively unscaled,
    and columns listed in ``self.exclude_columns`` keep their raw values.
    """
    file_number = len(in_file_list)

    fea_max_min_diff = self.max_vector - self.min_vector
    diff_value = self.target_max_value - self.target_min_value
    fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

    target_max_min_diff = numpy.zeros((1, self.feature_dimension))
    target_max_min_diff.fill(diff_value)

    # Guard degenerate dimensions: where the data range is <= 0 the scale
    # factor becomes 1.0/1.0 instead of dividing by zero.
    target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
    fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

    io_funcs = BinaryIOCollection()
    for i in range(file_number):
        features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)
        frame_number = features.size // self.feature_dimension
        fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
        target_min_matrix = numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))

        fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
        diff_norm_matrix = numpy.tile(target_max_min_diff, (frame_number, 1)) / fea_diff_matrix

        norm_features = diff_norm_matrix * (features - fea_min_matrix) + target_min_matrix

        ## If we are to keep some columns unnormalised, use advanced indexing to
        ## reinstate original values:
        m,n = numpy.shape(features)
        for col in self.exclude_columns:
            norm_features[list(range(m)),[col]*m] = features[list(range(m)),[col]*m]

        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def feature_normalisation(self, in_file_list, out_file_list):
    """Mean/variance-normalise each input file and write the result.

    Computes (and caches on ``self``) the mean and std vectors over the
    input list if they are not already set, then writes the normalised
    features file-by-file.  Returns (mean_vector, std_vector).
    """
    logger = logging.getLogger('feature_normalisation')

    # self.feature_dimension = feature_dimension
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical('The input and output file numbers are not the same! %d vs %d' % (len(in_file_list), len(out_file_list)))
        raise

    # BUG FIX: use `is None`, not `== None`.  Once these attributes hold
    # numpy arrays, `== None` produces an elementwise boolean array whose
    # truth value is ambiguous and raises ValueError.  (Matches the other
    # feature_normalisation variant in this file.)
    if self.mean_vector is None:
        self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
    if self.std_vector is None:
        self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    # range() instead of Python-2-only xrange(), consistent with the rest
    # of the file.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
    return self.mean_vector, self.std_vector
def merge_data(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    ''' Merge new features with normalised label features.

    For each utterance, writes a matrix of shape
    (lab_frame_number, lab_dim + feat_dim): label features on the left,
    new features on the right.  Frame-count mismatches of up to 5 frames
    are tolerated.
    '''
    utt_number = len(new_feat_file_list)
    if utt_number != len(binary_label_file_list):
        print("the number of new feature input files and label files should be the same!\n");
        sys.exit(1)

    # Extension of the new feature stream, used only in the log message.
    new_feat_ext = new_feat_file_list[0].split('/')[-1].split('.')[1]

    io_funcs = BinaryIOCollection()
    for i in range(utt_number):
        lab_file_name = binary_label_file_list[i]
        new_feat_file_name = new_feat_file_list[i]
        out_feat_file_name = out_feat_file_list[i]

        lab_features, lab_frame_number = io_funcs.load_binary_file_frame(lab_file_name, self.lab_dim)
        new_features, feat_frame_number = io_funcs.load_binary_file_frame(new_feat_file_name, self.feat_dim)

        if (lab_frame_number - feat_frame_number)>5:
            base_file_name = new_feat_file_list[i].split('/')[-1].split('.')[0]
            self.logger.critical("the number of frames in label and new features are different: %d vs %d (%s)" %(lab_frame_number, feat_frame_number, base_file_name))
            raise

        merged_features = numpy.zeros((lab_frame_number, self.lab_dim+self.feat_dim))

        # numpy slice assignment clamps both sides to the shorter of the two
        # frame counts, so small mismatches simply leave trailing feature
        # rows zero-filled.
        merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
        merged_features[0:feat_frame_number, self.lab_dim:self.lab_dim+self.feat_dim] = new_features[0:lab_frame_number, ]

        io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
        self.logger.debug('merged new feature %s of %d frames with %d label features' % (new_feat_ext, feat_frame_number,lab_frame_number) )
def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
    """Run the trained model over held-out utterances and write predictions.

    ``test_x`` maps utterance ids to input matrices; one binary output file
    is written per entry of ``gen_test_file_list`` after de-normalisation.
    """
    writer = BinaryIOCollection()
    total = len(gen_test_file_list)

    print("generating features on held-out test data...")
    for idx, out_path in enumerate(gen_test_file_list):
        # The utterance id is the output filename without its extension.
        utt_id = os.path.splitext(os.path.basename(out_path))[0]
        inputs = test_x[utt_id]
        n_rows = inputs.shape[0]

        if stateful:
            inputs = data_utils.get_stateful_input(inputs, self.seq_length, self.batch_size)
        elif sequential_training:
            # Sequential models expect a batch axis: (1, frames, n_in).
            inputs = np.reshape(inputs, (1, n_rows, self.n_in))

        outputs = self.model.predict(inputs)
        if sequential_training:
            outputs = np.reshape(outputs, (n_rows, self.n_out))

        data_utils.denorm_data(outputs, out_scaler)
        writer.array_to_binary_file(outputs, out_path)
        data_utils.drawProgressBar(idx + 1, total)
    sys.stdout.write("\n")
def find_min_max_values(self, in_file_list):
    """Scan a list of binary feature files and record global min/max vectors.

    Sets ``self.min_vector`` and ``self.max_vector`` to (1, feature_dimension)
    arrays holding the per-dimension extrema across all files.
    """
    logger = logging.getLogger("acoustic_norm")

    n_files = len(in_file_list)
    per_file_min = numpy.zeros((n_files, self.feature_dimension))
    per_file_max = numpy.zeros((n_files, self.feature_dimension))

    reader = BinaryIOCollection()
    for idx, file_name in enumerate(in_file_list):
        features = reader.load_binary_file(file_name, self.feature_dimension)
        # Per-file extrema first; reduced across files below.
        per_file_min[idx, ] = numpy.amin(features, axis=0)
        per_file_max[idx, ] = numpy.amax(features, axis=0)

    self.min_vector = numpy.reshape(numpy.amin(per_file_min, axis=0), (1, self.feature_dimension))
    self.max_vector = numpy.reshape(numpy.amax(per_file_max, axis=0), (1, self.feature_dimension))

    logger.info('across %d files found min/max values of length %d:' % (n_files, self.feature_dimension))
    logger.info(' min: %s' % self.min_vector)
    logger.info(' max: %s' % self.max_vector)
def feature_normalisation(self, in_file_list, out_file_list): logger = logging.getLogger('feature_normalisation') # self.feature_dimension = feature_dimension try: assert len(in_file_list) == len(out_file_list) except AssertionError: logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list))) raise if self.mean_vector is None: self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension) if self.std_vector is None: self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension) io_funcs = BinaryIOCollection() file_number = len(in_file_list) for i in range(file_number): features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension) mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1)) std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1)) norm_features = (features - mean_matrix) / std_matrix print(current_frame_number,in_file_list[i]) norm_features=numpy.concatenate([norm_features[:,:self.feature_dimension-2],features[:,self.feature_dimension-2:]],axis=-1) # in fact the problem is that I normalized the out put xvector pvector and onehotvector.... so we have to in formalized this print(' normalized vector :{}'.format(norm_features[1,:])) io_funcs.array_to_binary_file(norm_features, out_file_list[i]) return self.mean_vector, self.std_vector
def get_file_lengths(self):
    """Scan every (label, acoustic) file pair and record frame counts.

    Populates ``self.file_length_dict`` with three views:
    'framenum2utt' (frame count -> list of utt ids), 'utt2framenum'
    (utt id -> frame count) and 'utt2index' (utt id -> list position).
    """
    io_funcs = BinaryIOCollection()
    self.file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}, 'utt2index':{}}

    ### read file by file ###
    while True:
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0
            break
        in_features, lab_frame_number = io_funcs.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
        base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
        if abs(lab_frame_number - out_frame_number) < 5:
            ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            frame_number = min(lab_frame_number, out_frame_number)
        else:
            self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
            # NOTE(review): bare `raise` outside an except block raises
            # RuntimeError('No active exception to re-raise') — still fatal,
            # but the message is unhelpful; consider raising a real exception.
            raise
        if frame_number not in self.file_length_dict['framenum2utt']:
            self.file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            self.file_length_dict['framenum2utt'][frame_number].append(base_file_name)
        self.file_length_dict['utt2framenum'][base_file_name] = frame_number
        self.file_length_dict['utt2index'][base_file_name] = self.file_index
        self.file_index += 1
    self.reset()
def feature_normalisation(self, in_file_list, out_file_list):
    """Mean/variance-normalise each input file and write the result.

    Computes (and caches on ``self``) the mean and std vectors over the
    input list if they are not already set, then writes the normalised
    features file-by-file.  Returns (mean_vector, std_vector).
    """
    logger = logging.getLogger('feature_normalisation')

    # self.feature_dimension = feature_dimension
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
        raise

    # BUG FIX: use `is None`, not `== None`.  Once these attributes hold
    # numpy arrays, `== None` produces an elementwise boolean array whose
    # truth value is ambiguous and raises ValueError.  (Matches the other
    # feature_normalisation variant in this file.)
    if self.mean_vector is None:
        self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
    if self.std_vector is None:
        self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    # range() instead of Python-2-only xrange(), consistent with the rest
    # of the file.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
    return self.mean_vector, self.std_vector
def extract_label_features(self, orig_file, output_file):
    """Drop the trailing 5 columns of a label matrix and write the rest.

    Also records the resulting width in ``self.label_dimension``.
    """
    io = BinaryIOCollection()
    full_matrix = io.file2matrix(orig_file)
    # The last 5 columns are not label features.
    self.label_dimension = full_matrix.shape[1] - 5
    io.array_to_binary_file(full_matrix[:, :-5], output_file)
def load_covariance(var_file_dict, out_dimension_dict):
    """Load per-stream variance files into a dict of column vectors.

    Each file is read as a flat float sequence and reshaped to
    (out_dimension_dict[stream], 1).  Returns {stream_name: variance_vector}.
    """
    covariance = {}
    reader = BinaryIOCollection()
    for stream_name in var_file_dict.keys():
        raw_values, _dim = reader.load_binary_file_frame(var_file_dict[stream_name], 1)
        covariance[stream_name] = numpy.reshape(raw_values, (out_dimension_dict[stream_name], 1))
    return covariance
def load_covariance(self, var_file_dict, out_dimension_dict):
    """Populate ``self.var`` with per-stream variance column vectors.

    Each variance file is read as a flat float sequence and reshaped to
    (out_dimension_dict[stream], 1).
    """
    reader = BinaryIOCollection()
    for stream_name in list(var_file_dict.keys()):
        raw_values, _dim = reader.load_binary_file_frame(var_file_dict[stream_name], 1)
        self.var[stream_name] = numpy.reshape(raw_values, (out_dimension_dict[stream_name], 1))
def load_next_utterance(self):
    """Load the data for one utterance.

    This function will be called when utterance-by-utterance loading is
    required (e.g., sequential training).  Returns
    ((shared_x, shared_y), temp_set_x, temp_set_y) and advances
    ``self.file_index`` (wrapping, and setting ``self.end_reading``, at the
    end of the list).
    """
    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty((self.buffer_size, self.n_outs))

    io_fun = BinaryIOCollection()
    in_features, lab_frame_number = io_fun.load_binary_file_frame(
        self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(
        self.y_files_list[self.file_index], self.n_outs)

    # BUG FIX: base_file_name was only assigned inside the error branch
    # below, yet the debug print uses it on every call — every invocation
    # raised NameError.  Compute it once, up front.
    base_file_name = os.path.basename(
        self.x_files_list[self.file_index]).split('.')[0]

    frame_number = lab_frame_number
    print(' %%%%% {} : {} / {} '.format(base_file_name, self.n_ins, self.n_outs))

    if abs(lab_frame_number - out_frame_number) < 5:
        ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
        if lab_frame_number > out_frame_number:
            frame_number = out_frame_number
    else:
        self.logger.critical(
            "the number of frames in label and acoustic features are different: %d vs %d (%s)"
            % (lab_frame_number, out_frame_number, base_file_name))
        # NOTE(review): bare `raise` outside except -> RuntimeError; kept
        # for behavioural compatibility (still fatal).
        raise

    temp_set_y = out_features[0:frame_number, ]
    temp_set_x = in_features[0:frame_number, ]

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    # reshape input-output to (1, frames, dim) float32 when requested
    if self.reshape_io:
        temp_set_x = numpy.reshape(temp_set_x, (1, temp_set_x.shape[0], self.n_ins))
        temp_set_y = numpy.reshape(temp_set_y, (1, temp_set_y.shape[0], self.n_outs))
        temp_set_x = numpy.array(temp_set_x, 'float32')
        temp_set_y = numpy.array(temp_set_y, 'float32')

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_xy = (shared_set_x, shared_set_y)
    return shared_set_xy, temp_set_x, temp_set_y
def modify_dur_from_phone_alignment_labels(self, label_file_name, gen_dur_file_name, gen_lab_file_name):
    """Rewrite a phone-alignment label file using predicted durations.

    Silence phones keep their original aligned duration; every other phone
    gets the next predicted duration (frames converted to time units via
    * 5 * 10000 — presumably 5 ms frames in HTK 100 ns units, TODO confirm).
    """
    logger = logging.getLogger("dur")

    dur_dim = 1
    io_funcs = BinaryIOCollection()
    dur_features, frame_number = io_funcs.load_binary_file_frame(
        gen_dur_file_name, dur_dim)

    # `with` guarantees the handles are closed even if parsing fails
    # (the original left both files open on error).
    with open(label_file_name) as fid:
        utt_labels = fid.readlines()

    label_number = len(utt_labels)
    logger.info('loaded %s, %3d labels' % (label_file_name, label_number))

    current_index = 0
    prev_end_time = 0
    with open(gen_lab_file_name, 'w') as out_fid:
        for line in utt_labels:
            line = line.strip()
            if len(line) < 1:
                continue
            # raw string avoids the invalid-escape DeprecationWarning for \s
            temp_list = re.split(r'\s+', line)
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]

            if self.check_silence_pattern(full_label) == 1:
                # Silence: keep the original (aligned) duration unchanged.
                current_phone_dur = end_time - start_time
                out_fid.write(
                    str(prev_end_time) + ' ' +
                    str(prev_end_time + current_phone_dur) + ' ' +
                    full_label + '\n')
                prev_end_time = prev_end_time + current_phone_dur
            else:
                # Non-silence: consume the next predicted duration.
                phone_dur = int(dur_features[current_index]) * 5 * 10000
                out_fid.write(
                    str(prev_end_time) + ' ' +
                    str(prev_end_time + phone_dur) + ' ' +
                    full_label + '\n')
                prev_end_time = prev_end_time + phone_dur
                current_index += 1

    logger.debug(
        'modifed label with predicted duration of %d frames x %d features'
        % dur_features.shape)
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Load paired input/output feature files into memory.

    With ``sequential_training`` the data is kept per-utterance as
    {utt_id: matrix} dicts; otherwise all utterances are concatenated into
    two big (frames, dim) arrays.  Frame-count mismatches over 5 frames
    abort the run.  Returns (inputs, outputs, file_length_dict).
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)
    file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        # FRAME_BUFFER_SIZE is a module-level capacity bound for the buffers.
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)
        base_file_name = os.path.basename(inp_file_name).split(".")[0]
        if abs(inp_frame_number-out_frame_number)>5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        else:
            # Small mismatches are tolerated by truncating to the shorter file.
            frame_number = min(inp_frame_number, out_frame_number)
        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            temp_set_x[current_index:current_index+frame_number, ] = inp_features[0:frame_number]
            temp_set_y[current_index:current_index+frame_number, ] = out_features[0:frame_number]
            current_index += frame_number
        if frame_number not in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            file_length_dict['framenum2utt'][frame_number].append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number
        drawProgressBar(i+1, num_of_utt)
    sys.stdout.write("\n")

    if not sequential_training:
        # Trim the preallocated buffers to the frames actually filled.
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]
    return temp_set_x, temp_set_y, file_length_dict
def load_min_max_values(self, label_norm_file):
    """Restore ``self.min_vector`` / ``self.max_vector`` from a saved file.

    The file stores the two vectors stacked end-to-end: first half is the
    minimum vector, second half the maximum.
    """
    logger = logging.getLogger("acoustic_norm")

    reader = BinaryIOCollection()
    stacked, frame_number = reader.load_binary_file_frame(label_norm_file, 1)
    stacked = numpy.reshape(stacked, (-1, ))

    half = frame_number // 2
    self.min_vector = stacked[0:half]
    self.max_vector = stacked[half:]

    logger.info('Loaded min max values from the trained data for feature dimension of %d' % self.feature_dimension)
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Load paired input/output feature files into memory.

    With ``sequential_training`` the data is kept per-utterance as
    {utt_id: matrix} dicts; otherwise all utterances are concatenated into
    two big (frames, dim) arrays.  Frame-count mismatches over 5 frames
    abort the run.  Returns (inputs, outputs, file_length_dict).
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)
    file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        # FRAME_BUFFER_SIZE is a module-level capacity bound for the buffers.
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)
        base_file_name = os.path.basename(inp_file_name).split(".")[0]
        if abs(inp_frame_number-out_frame_number)>5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        else:
            # Small mismatches are tolerated by truncating to the shorter file.
            frame_number = min(inp_frame_number, out_frame_number)
        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            temp_set_x[current_index:current_index+frame_number, ] = inp_features[0:frame_number]
            temp_set_y[current_index:current_index+frame_number, ] = out_features[0:frame_number]
            current_index += frame_number
        if frame_number not in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            file_length_dict['framenum2utt'][frame_number].append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number
        drawProgressBar(i+1, num_of_utt)
    sys.stdout.write("\n")

    if not sequential_training:
        # Trim the preallocated buffers to the frames actually filled.
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]
    return temp_set_x, temp_set_y, file_length_dict
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict, out_dimension_dict, file_id_list):
    """Variance-scale selected streams over a continuum of GV weights.

    For each utterance and each global-variance weight in {0.0, 0.1, ..., 1.0},
    writes a copy of every stream named ``<utt>_gv<weight>.<stream>``; streams
    in ``streams_to_scale`` are rescaled towards the global standard
    deviation, all others are copied verbatim.  Returns the list of extended
    utterance ids written.
    """
    all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name, size) in list(out_dimension_dict.items()):
        # BUG FIX: integer division.  Under Python 3 (this function already
        # uses print()/list()), `size / 3` is a float, and the value is used
        # below as a slice bound and a feature dimension -> TypeError.
        # "/3" strips the delta and delta-delta parts, keeping the statics.
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in list(var_file_dict.keys()):
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # Keep only the static part of the variance vector.
        static_variances[feature_name] = var_values[:static_dimension_dict[feature_name], :]

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            local_weight = 1.0 - gv_weight
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                extended_uttname = uttname + '_gv' + str(gv_weight)
                print(extended_uttname)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std = numpy.std(speech, axis=0)
                    global_std = numpy.transpose((static_variances[stream]))
                    # Interpolate between the corpus-level and utterance-level
                    # standard deviations, then rescale around the utterance mean.
                    weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                    std_ratio = weighted_global_std / utt_std
                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))
                    scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)
                else:
                    os.system('cp %s %s'%(infile, outfile))
            file_id_list_out.append(extended_uttname)
    return file_id_list_out
def load_mean_std_values(self, acoustic_norm_file):
    """Restore saved mean/std vectors from a normalisation-stats file.

    The file stores the two vectors stacked end-to-end: first half is the
    mean vector, second half the standard-deviation vector.
    Returns (mean_vector, std_vector).
    """
    logger = logging.getLogger('feature_normalisation')

    reader = BinaryIOCollection()
    stacked, frame_number = reader.load_binary_file_frame(acoustic_norm_file, 1)
    stacked = numpy.reshape(stacked, (-1, ))

    half = frame_number // 2
    self.mean_vector = stacked[0:half]
    self.std_vector = stacked[half:]

    logger.info('Loaded mean std values from the trained data for feature dimension of %d' % self.feature_dimension)
    return self.mean_vector, self.std_vector
def load_mean_std_values(self, acoustic_norm_file):
    """Restore saved mean/std vectors from a normalisation-stats file.

    The file stores both vectors stacked end-to-end: first half mean,
    second half standard deviation.  Returns (mean_vector, std_vector).
    """
    logger = logging.getLogger('feature_normalisation')

    io_funcs = BinaryIOCollection()
    mean_std_vector, frame_number = io_funcs.load_binary_file_frame(acoustic_norm_file, 1)
    mean_std_vector = numpy.reshape(mean_std_vector, (-1, ))
    # Split the flat vector in half: [mean | std].
    self.mean_vector = mean_std_vector[0:frame_number//2]
    self.std_vector = mean_std_vector[frame_number//2:]

    logger.info('Loaded mean std values from the trained data for feature dimension of %d' % self.feature_dimension)
    return self.mean_vector, self.std_vector
def extract_linguistic_features(self, in_file_name, out_file_name=None, label_type="state_align", dur_file_name=None):
    """Convert a label file into a linguistic feature matrix.

    Dispatches on ``label_type`` ('phone_align' or 'state_align').  When
    ``out_file_name`` is given the matrix is written to disk; otherwise it
    is returned.
    """
    # BUG FIX: `logger` was never defined in this function, so the
    # unsupported-label branch crashed with NameError (and left `A`
    # unbound).  Define the logger and exit explicitly, mirroring
    # extract_dur_features.
    logger = logging.getLogger("labels")
    if label_type=="phone_align":
        A = self.load_labels_with_phone_alignment(in_file_name, dur_file_name)
    elif label_type=="state_align":
        A = self.load_labels_with_state_alignment(in_file_name)
    else:
        logger.critical("we don't support %s labels as of now!!" % (label_type))
        sys.exit(1)

    if out_file_name:
        io_funcs = BinaryIOCollection()
        io_funcs.array_to_binary_file(A, out_file_name)
    else:
        return A
def simple_scale_variance(indir, outdir, var_file_dict, out_dimension_dict, file_id_list, gv_weight=1.0):
    """Simple variance scaling (Silen et al. 2012, paragraph 3.1).

    Rescales the static part of the streams in ``streams_to_scale`` towards
    the corpus-level standard deviation (interpolated with the utterance-level
    one by ``gv_weight``); every other stream is copied verbatim.
    """
    ## TODO: Lots of things like stream names hardcoded here; 3 for delta + delta-delta; ...
    # all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
    # streams_to_scale = ['LSF']
    all_streams = ['cmp','mgc','lf0','bap']
    streams_to_scale = ['mgc']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # BUG FIX: integer division.  The value is used as a slice bound and
        # a feature dimension below; under Python 3 `size / 3` is a float and
        # raises TypeError.  `//` is identical for ints under Python 2.
        # "/3" strips delta and delta-delta, keeping the static coefficients.
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # Keep only the static part of the variance vector.
        static_variances[feature_name] = var_values[:static_dimension_dict[feature_name], :]

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std = numpy.std(speech, axis=0)
                global_std = numpy.transpose((static_variances[stream]))
                # Interpolate corpus-level vs utterance-level std, then
                # rescale each frame around the utterance mean.
                weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std
                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))
                scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)
            else:
                os.system('cp %s %s'%(infile, outfile))
def modify_dur_from_state_alignment_labels(self, label_file_name, gen_dur_file_name, gen_lab_file_name):
    """Rewrite a state-alignment label file using predicted state durations.

    Silence states keep their original aligned duration; other states take
    the predicted duration for their state index.  ``current_index``
    advances to the next phone's duration row once the last state of a
    phone has been written.
    """
    logger = logging.getLogger("dur")

    state_number = self.state_number
    dur_dim = state_number

    io_funcs = BinaryIOCollection()
    # One row of state durations per phone.
    dur_features, frame_number = io_funcs.load_binary_file_frame(gen_dur_file_name, dur_dim)

    fid = open(label_file_name)
    utt_labels = fid.readlines()
    fid.close()

    label_number = len(utt_labels)
    logger.info('loaded %s, %3d labels' % (label_file_name, label_number) )

    out_fid = open(gen_lab_file_name, 'w')

    current_index = 0
    prev_end_time = 0
    for line in utt_labels:
        line = line.strip()
        if len(line) < 1:
            continue
        temp_list = re.split('\s+', line)
        start_time = int(temp_list[0])
        end_time = int(temp_list[1])
        full_label = temp_list[2]

        full_label_length = len(full_label) - 3  # remove state information [k]
        # The digit between the trailing brackets, shifted down by one
        # (HTK-style state labels [2]..[6] map to indices 1..state_number).
        state_index = full_label[full_label_length + 1]
        state_index = int(state_index) - 1

        label_binary_flag = self.check_silence_pattern(full_label)
        if label_binary_flag == 1:
            # Silence: keep the original (aligned) state duration.
            current_state_dur = end_time - start_time
            out_fid.write(str(prev_end_time)+' '+str(prev_end_time+current_state_dur)+' '+full_label+'\n')
            prev_end_time = prev_end_time+current_state_dur
            continue;
        else:
            # Predicted duration for this state; frames converted to time
            # units via * 5 * 10000 (presumably 5 ms frames in HTK 100 ns
            # units -- TODO confirm).
            state_dur = dur_features[current_index, state_index-1]
            state_dur = int(state_dur)*5*10000
            out_fid.write(str(prev_end_time)+' '+str(prev_end_time+state_dur)+' '+full_label+'\n')
            prev_end_time = prev_end_time+state_dur
        # Last state of the phone -> move to the next duration row.
        if state_index == state_number:
            current_index += 1

    logger.debug('modifed label with predicted duration of %d frames x %d features' % dur_features.shape )
def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
    """Restore the saved TensorFlow graph and generate features for test data.

    NOTE(review): this function is Python-2-only (print statements, xrange,
    list.sort() on dict.keys()) and uses TF1 session/collection APIs.
    """
    #### compute predictions ####
    io_funcs = BinaryIOCollection()

    test_id_list = test_x.keys()
    test_id_list.sort()
    test_file_number = len(test_id_list)

    print("generating features on held-out test data...")
    with tf.Session() as sess:
        # Rebuild the graph from the checkpoint's meta file, then restore weights.
        new_saver=tf.train.import_meta_graph(os.path.join(self.ckpt_dir,"mymodel.ckpt.meta"))
        print "loading the model parameters..."
        output_layer=tf.get_collection("output_layer")[0]
        input_layer=tf.get_collection("input_layer")[0]
        new_saver.restore(sess,os.path.join(self.ckpt_dir,"mymodel.ckpt"))
        print "The model parameters are successfully restored"
        for utt_index in xrange(test_file_number):
            gen_test_file_name = gen_test_file_list[utt_index]
            temp_test_x = test_x[test_id_list[utt_index]]
            num_of_rows = temp_test_x.shape[0]

            if not sequential_training:
                # Feed-forward graph: only batch-norm / dropout flags needed.
                is_training_batch=tf.get_collection("is_training_batch")[0]
                if self.dropout_rate!=0.0:
                    is_training_drop=tf.get_collection("is_training_drop")[0]
                    y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,is_training_drop:False,is_training_batch:False})
                else:
                    y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,is_training_batch:False})
            else:
                # Recurrent graph: add a batch axis and the utterance length.
                temp_test_x=np.reshape(temp_test_x,[1,num_of_rows,self.n_in])
                hybrid=0
                utt_length_placeholder=tf.get_collection("utt_length")[0]
                # "hybrid" graphs (with tanh feed-forward layers) also carry
                # a batch-norm training flag.
                if "tanh" in self.hidden_layer_type:
                    hybrid=1
                    is_training_batch=tf.get_collection("is_training_batch")[0]
                if self.dropout_rate!=0.0:
                    is_training_drop=tf.get_collection("is_training_drop")[0]
                    if hybrid:
                        y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows],is_training_drop:False,is_training_batch:False})
                    else:
                        y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows],is_training_drop:False})
                elif hybrid:
                    y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows],is_training_batch:False})
                else:
                    y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows]})

            data_utils.denorm_data(y_predict, out_scaler)
            io_funcs.array_to_binary_file(y_predict, gen_test_file_name)
            data_utils.drawProgressBar(utt_index+1, test_file_number)
def extract_dur_features(self, in_file_name, out_file_name=None, label_type="state_align", feature_type=None, unit_size=None, feat_size=None):
    """Extract duration features from an alignment label file.

    Dispatches on ``label_type`` ("phone_align" or "state_align") to the
    matching extractor; any other label type is fatal. The resulting array
    is written to ``out_file_name`` when one is given, otherwise returned.
    """
    logger = logging.getLogger("dur")
    # Map each supported label type to its extractor method.
    extractors = {
        "phone_align": self.extract_dur_from_phone_alignment_labels,
        "state_align": self.extract_dur_from_state_alignment_labels,
    }
    extractor = extractors.get(label_type)
    if extractor is None:
        logger.critical("we don't support %s labels as of now!!" % (label_type))
        sys.exit(1)
    dur_features = extractor(in_file_name, feature_type, unit_size, feat_size)
    if not out_file_name:
        return dur_features
    io_funcs = BinaryIOCollection()
    io_funcs.array_to_binary_file(dur_features, out_file_name)
def compute_norm_stats(data, stats_file, method="MVN"):
    #### normalize training data ####
    """Fit a normalisation scaler on training data and persist its statistics.

    Parameters
    ----------
    data : array-like, shape (frames, dim)
        Training data to fit the scaler on.
    stats_file : str
        Path where the 2 x dim stats matrix is written as a binary file.
    method : str
        "MVN" for mean/variance normalisation (rows: mean_, scale_), or
        "MINMAX" for min-max scaling to [0.01, 0.99] (rows: min_, scale_).

    Returns
    -------
    A fitted sklearn scaler (StandardScaler or MinMaxScaler).

    Raises
    ------
    ValueError
        If ``method`` is not one of "MVN" / "MINMAX".  (Previously an
        unsupported method fell through and crashed with an unbound-local
        NameError on ``norm_matrix``.)
    """
    io_funcs = BinaryIOCollection()
    if method == "MVN":
        scaler = preprocessing.StandardScaler().fit(data)
        norm_matrix = np.vstack((scaler.mean_, scaler.scale_))
    elif method == "MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99)).fit(data)
        norm_matrix = np.vstack((scaler.min_, scaler.scale_))
    else:
        raise ValueError("unsupported normalisation method: %s" % method)
    print(norm_matrix.shape)
    io_funcs.array_to_binary_file(norm_matrix, stats_file)
    return scaler
def extract_durational_features(self, dur_file_name=None, dur_data=None):
    # Build per-frame duration features from phone/state durations.
    #
    # Either a duration file name or an in-memory duration array must be
    # supplied; when a file name is given it takes precedence and is loaded
    # as a 1-dimensional stream of per-phone durations.
    #
    # Returns a (total_num_of_frames, self.frame_feature_size) array whose
    # layout depends on self.subphone_feats:
    #   "coarse_coding": 4 columns -- 3 coarse-coding values + phone length.
    #   "full":          9 columns of within-state/within-phone positions.
    # Rows for any other subphone_feats setting are left as zeros.
    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1 ## hard coded for now
        dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)
    ph_count = len(dur_data)
    total_num_of_frames = int(sum(dur_data))
    duration_feature_array = numpy.zeros((total_num_of_frames, self.frame_feature_size))
    frame_index=0
    for i in range(ph_count):
        frame_number = int(dur_data[i])
        if self.subphone_feats == "coarse_coding":
            # One row per frame: 3 coarse-coded position features plus the
            # phone duration in frames.
            cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)
            for j in range(frame_number):
                duration_feature_array[frame_index, 0] = cc_feat_matrix[j, 0]
                duration_feature_array[frame_index, 1] = cc_feat_matrix[j, 1]
                duration_feature_array[frame_index, 2] = cc_feat_matrix[j, 2]
                duration_feature_array[frame_index, 3] = float(frame_number)
                frame_index+=1
        elif self.subphone_feats == 'full':
            # NOTE(review): this branch indexes dur_data as 2-D
            # (dur_data[i, :] / dur_data[i][state_index-1]), i.e. one row of
            # 5 state durations per phone -- which conflicts with the 1-D
            # dur_dim=1 file load above. Presumably callers pass dur_data
            # directly for "full"; confirm against call sites.
            state_number = 5 # hard coded here
            phone_duration = sum(dur_data[i, :])
            state_duration_base = 0
            for state_index in xrange(1, state_number+1):
                state_index_backward = (state_number - state_index) + 1
                frame_number = int(dur_data[i][state_index-1])
                for j in xrange(frame_number):
                    duration_feature_array[frame_index, 0] = float(j+1) / float(frame_number)  ## fraction through state (forwards)
                    duration_feature_array[frame_index, 1] = float(frame_number - j) / float(frame_number)  ## fraction through state (backwards)
                    duration_feature_array[frame_index, 2] = float(frame_number)  ## length of state in frames
                    duration_feature_array[frame_index, 3] = float(state_index)  ## state index (counting forwards)
                    duration_feature_array[frame_index, 4] = float(state_index_backward)  ## state index (counting backwards)
                    duration_feature_array[frame_index, 5] = float(phone_duration)  ## length of phone in frames
                    duration_feature_array[frame_index, 6] = float(frame_number) / float(phone_duration)  ## fraction of the phone made up by current state
                    duration_feature_array[frame_index, 7] = float(phone_duration - j - state_duration_base) / float(phone_duration)  ## fraction through phone (forwards)
                    duration_feature_array[frame_index, 8] = float(state_duration_base + j + 1) / float(phone_duration)  ## fraction through phone (backwards)
                    frame_index+=1
                state_duration_base += frame_number
    return duration_feature_array
def read_data_from_file_list(in_file_list, dim):
    """Load every binary feature file in a list and stack their frames.

    Parameters
    ----------
    in_file_list : list of str
        Paths to binary feature files, each with ``dim`` features per frame.
    dim : int
        Feature dimensionality (number of columns) of every file.

    Returns
    -------
    numpy.ndarray, shape (total_frames, dim)
        All files' frames concatenated in list order.

    Fixes a hard-coded 500000-row pre-allocated buffer: the original crashed
    with a broadcast error once the corpus exceeded that many frames. Frames
    are now accumulated per file and concatenated once at the end.
    """
    io_funcs = BinaryIOCollection()
    feature_chunks = []
    ### read file by file ###
    for in_file_name in tqdm.tqdm(in_file_list):
        in_features, frame_number = io_funcs.load_binary_file_frame(in_file_name, dim)
        feature_chunks.append(in_features)
    if not feature_chunks:
        # Preserve the original empty-input result: a (0, dim) array.
        return np.empty((0, dim))
    return np.concatenate(feature_chunks, axis=0)