def load_next_utterance_S2SML(self):
    """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

    Splits the multi-level-unit (MLU) linguistic input into phone-, syllable-
    and word-level matrices using the column ranges in self.MLU_div, and splits
    the duration stream into per-word / per-syllable / per-phone counts.
    Returns Theano shared variables plus the raw numpy arrays.
    """
    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
    dur_features, dur_frame_number = io_fun.load_binary_file_frame(self.dur_files_list[self.file_index], 1)

    ### MLU features sub-division ###
    temp_set_MLU = in_features[0:lab_frame_number, ]
    temp_set_y = out_features[0:out_frame_number, ]

    # Each linguistic level is described by two disjoint column ranges in
    # MLU_div, concatenated side by side.
    temp_set_phone = numpy.concatenate([temp_set_MLU[:, self.MLU_div['phone'][0]: self.MLU_div['phone'][1]], temp_set_MLU[:, self.MLU_div['phone'][2]: self.MLU_div['phone'][3]]], axis = 1)
    temp_set_syl = numpy.concatenate([temp_set_MLU[:, self.MLU_div['syl'][0]: self.MLU_div['syl'][1]], temp_set_MLU[:, self.MLU_div['syl'][2]: self.MLU_div['syl'][3]]], axis = 1)
    temp_set_word = numpy.concatenate([temp_set_MLU[:, self.MLU_div['word'][0]: self.MLU_div['word'][1]], temp_set_MLU[:, self.MLU_div['word'][2]: self.MLU_div['word'][3] ]], axis = 1)

    ### duration array sub-division ###
    # Duration stream layout (assumed from the slicing below): word durations,
    # then syllable durations, then one entry per phone (the last
    # lab_frame_number entries) — TODO confirm against the dur-file writer.
    dur_features = numpy.reshape(dur_features, (-1, ))
    temp_set_d = dur_features.astype(int)
    dur_word_syl = temp_set_d[0: -lab_frame_number]

    num_ph = lab_frame_number
    # Syllable durations sum to the phone count, so scanning the reversed
    # word+syl array finds how many trailing entries are syllable durations.
    num_syl = (numpy.where(numpy.cumsum(dur_word_syl[::-1])==lab_frame_number)[0][0] + 1)
    num_words = len(dur_word_syl) - num_syl

    temp_set_dur_phone = temp_set_d[-num_ph:]
    temp_set_dur_word = dur_word_syl[0: num_words]
    temp_set_dur_syl = dur_word_syl[num_words: ]

    ### additional feature matrix (syllable+phone+frame=432) ###
    num_frames = sum(temp_set_dur_phone)
    temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))
    # Syllable features are sampled at each syllable's last phone row.
    temp_set_af[0: num_syl, self.MLU_div['length'][0]: self.MLU_div['length'][1] ] = temp_set_syl[numpy.cumsum(temp_set_dur_syl)-1]
    temp_set_af[0: num_ph, self.MLU_div['length'][1]: self.MLU_div['length'][2]] = temp_set_phone
    # NOTE(review): rows/columns of temp_set_af not assigned above stay
    # uninitialised (numpy.empty) — presumably filled or masked downstream.

    ### input word feature matrix ###
    # Word boundaries in phone units: cumulative syllable durations taken at
    # each word's last syllable.
    temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
    syl_bound = numpy.cumsum(temp_set_dur_word)
    for indx in xrange(num_words):
        temp_set_dur_word_segments[indx] = int(sum(temp_set_dur_syl[0: syl_bound[indx]]))
    temp_set_x = temp_set_word[temp_set_dur_word_segments-1]

    ### rest of the code similar to S2S ###
    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

    return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d, temp_set_af
def load_next_utterance_CTC(self):
    """Load one utterance and build its CTC-style target sequence.

    The target interleaves the per-frame argmax class indices with the blank
    symbol (self.n_outs): blank, c0, blank, c1, ..., blank.
    """
    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty(self.buffer_size)

    io_fun = BinaryIOCollection()
    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

    temp_set_x = in_features[0:lab_frame_number, ]

    # Interleave the frame-wise class labels with the blank symbol.
    label_sequence = [self.n_outs]
    for class_index in numpy.argmax(out_features, axis=1):
        label_sequence.append(class_index)
        label_sequence.append(self.n_outs)
    temp_set_y = numpy.array(label_sequence)

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = theano.shared(numpy.asarray(temp_set_y, dtype='int32'), name='y', borrow=True)

    return (shared_set_x, shared_set_y), temp_set_x, temp_set_y
def load_next_utterance(self):
    """Load the data for one utterance. This function will be called when
    utterance-by-utterance loading is required (e.g., sequential training).

    Returns the trimmed (frame_number, n_ins) inputs and
    (frame_number, n_outs) outputs for the current file index.
    """
    temp_set_x = np.empty((self.buffer_size, self.n_ins))
    temp_set_y = np.empty((self.buffer_size, self.n_outs))

    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

    # A few frames of mismatch between the label and acoustic streams is
    # tolerated; anything larger is treated as a data error.
    if abs(lab_frame_number - out_frame_number) >= 5:
        base_file_name = self.x_files_list[self.file_index].split('/')[-1].split('.')[0]
        logging.info("the number of frames in label and acoustic features are different: %d vs %d (%s)" % (
            lab_frame_number, out_frame_number, base_file_name))
        raise
    frame_number = min(lab_frame_number, out_frame_number)

    temp_set_x = in_features[0:frame_number, ]
    temp_set_y = out_features[0:frame_number, ]

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    return temp_set_x, temp_set_y
def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
    """Rewrite each input file so its row count matches its reference file.

    Files that already match are left untouched; otherwise the common rows
    are copied and any extra reference rows stay zero-padded.
    """
    logger = logging.getLogger("acoustic_comp")
    logger.info('making equal number of lines...')

    io_funcs = BinaryIOCollection()

    for in_file_name, ref_file_name in zip(in_file_list, ref_file_list):
        in_data_stream_name = in_file_name.split('.')[-1]
        in_feature_dim = in_dimension_dict[in_data_stream_name]
        in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

        ref_data_stream_name = ref_file_name.split('.')[-1]
        ref_feature_dim = in_dimension_dict[ref_data_stream_name]
        ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

        # Already aligned: skip the rewrite entirely.
        if in_frame_number == ref_frame_number:
            continue

        target_features = numpy.zeros((ref_frame_number, in_feature_dim))
        common_frames = min(in_frame_number, ref_frame_number)
        target_features[0:common_frames, ] = in_features[0:common_frames, ]
        io_funcs.array_to_binary_file(target_features, in_file_name)

    logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' %(in_data_stream_name, ref_data_stream_name))
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Read paired input/output feature files into training buffers.

    Parameters
    ----------
    inp_file_list, out_file_list : paired lists of binary feature file paths
    inp_dim, out_dim : feature dimensionality of each stream
    sequential_training : if True, return dicts keyed by base file name;
        otherwise return flat (frames, dim) arrays trimmed to the data read.

    Returns
    -------
    (temp_set_x, temp_set_y, file_length_dict) where file_length_dict maps
    frame count -> list of base file names with that length.
    """
    io_funcs = BinaryIOCollection()

    utt_len = len(inp_file_list)
    file_length_dict = {}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        temp_set_x = np.empty((BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(utt_len):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]

        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        if abs(inp_frame_number - out_frame_number) > 5:
            # Fixed: Python-2-only `print` statement -> print() call, and exit
            # with a non-zero status since this is an error, not success.
            print('the number of frames in input and output features are different: %d vs %d (%s)' % (
                inp_frame_number, out_frame_number, base_file_name))
            sys.exit(1)
        frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features
            temp_set_y[base_file_name] = out_features
        else:
            # Fixed: slice both sides to frame_number — assigning the untrimmed
            # array raised a shape mismatch whenever the (tolerated, <=5 frame)
            # length difference was non-zero.
            temp_set_x[current_index:current_index + frame_number, ] = inp_features[0:frame_number, ]
            temp_set_y[current_index:current_index + frame_number, ] = out_features[0:frame_number, ]
            current_index += frame_number

        file_length_dict.setdefault(frame_number, []).append(base_file_name)

        print_status(i, utt_len)

    sys.stdout.write("\n")

    if not sequential_training:
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict, out_dimension_dict, file_id_list):
    """Write copies of each utterance scaled with a range of GV weights.

    Tries interpolation weights 0.0 .. 1.0 (step 0.1) for combining global and
    local (per-utterance) standard deviation on the streams in
    streams_to_scale; other streams are copied unchanged.  Returns the list of
    extended utterance ids (uttname + '_gv' + weight).
    """
    all_streams = ['cmp', 'HNR', 'F0', 'LSF', 'Gain', 'LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # Fixed: use floor division so the result stays an integer under
        # Python 3 (`size / 3` would be a float and break the slice below).
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # keep only the static part (first third; rest is delta/delta-delta)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            local_weight = 1.0 - gv_weight
            extended_uttname = uttname + '_gv' + str(gv_weight)
            # Fixed: Python-2-only `print` statement -> print() call.
            print(extended_uttname)
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std = numpy.std(speech, axis=0)
                    # NOTE(review): static_variances is used directly as a
                    # standard deviation — confirm the var file stores std.
                    global_std = numpy.transpose((static_variances[stream]))
                    weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                    std_ratio = weighted_global_std / utt_std
                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))
                    scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)
                else:
                    os.system('cp %s %s' % (infile, outfile))
            # one id per (utterance, weight) combination — NOTE(review): the
            # mangled original's placement was ambiguous; confirm downstream
            # consumers expect no per-stream duplicates.
            file_id_list_out.append(extended_uttname)

    return file_id_list_out
def merge_data(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    '''
    merging new features with normalised label features
    '''
    utt_number = len(new_feat_file_list)
    if utt_number != len(binary_label_file_list):
        print("the number of new feature input files and label files should be the same!\n");
        sys.exit(1)

    # feature extension taken from the first new-feature file name
    new_feat_ext = new_feat_file_list[0].split('/')[-1].split('.')[1]

    io_funcs = BinaryIOCollection()
    for utt_idx in range(utt_number):
        lab_file_name = binary_label_file_list[utt_idx]
        new_feat_file_name = new_feat_file_list[utt_idx]
        out_feat_file_name = out_feat_file_list[utt_idx]

        lab_features, lab_frame_number = io_funcs.load_binary_file_frame(lab_file_name, self.lab_dim)
        new_features, feat_frame_number = io_funcs.load_binary_file_frame(new_feat_file_name, self.feat_dim)

        # tolerate the new features being up to 5 frames shorter than labels
        if (lab_frame_number - feat_frame_number) > 5:
            base_file_name = new_feat_file_list[utt_idx].split('/')[-1].split('.')[0]
            self.logger.critical("the number of frames in label and new features are different: %d vs %d (%s)" %(lab_frame_number, feat_frame_number, base_file_name))
            raise

    # label columns first, new-feature columns after; rows missing from the
        # (possibly shorter) new features stay zero
        merged_features = numpy.zeros((lab_frame_number, self.lab_dim + self.feat_dim))
        merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
        merged_features[0:feat_frame_number, self.lab_dim:self.lab_dim + self.feat_dim] = new_features[0:lab_frame_number, ]

        io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
        self.logger.debug('merged new feature %s of %d frames with %d label features' % (new_feat_ext, feat_frame_number,lab_frame_number) )
def load_next_utterance_CTC(self):
    """Load one utterance and produce its blank-interleaved CTC targets."""
    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty(self.buffer_size)

    io_fun = BinaryIOCollection()
    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

    temp_set_x = in_features[0:lab_frame_number, ]

    # Build [blank, c0, blank, c1, ..., blank] by strided assignment:
    # blanks (self.n_outs) at even positions, class labels at odd positions.
    class_sequence = numpy.argmax(out_features, axis=1)
    temp_set_y = numpy.empty(2 * len(class_sequence) + 1, dtype=numpy.array([self.n_outs]).dtype)
    temp_set_y[0::2] = self.n_outs
    temp_set_y[1::2] = class_sequence

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = theano.shared(numpy.asarray(temp_set_y, dtype='int32'), name='y', borrow=True)

    shared_set_xy = (shared_set_x, shared_set_y)

    return shared_set_xy, temp_set_x, temp_set_y
def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
    """Rewrite each input file so its row count matches its reference file.

    Parameters
    ----------
    in_file_list, ref_file_list : paired lists of binary feature file paths;
        stream name (and thus dimension) is taken from the file extension.
    in_dimension_dict : maps stream name -> feature dimension.

    Files whose frame counts already match are left untouched (not rewritten);
    otherwise the common rows are copied in place and any extra reference rows
    are zero-padded.
    """
    logger = logging.getLogger("acoustic_comp")
    logger.info('making equal number of lines...')

    io_funcs = BinaryIOCollection()

    utt_number = len(in_file_list)
    # Fixed: xrange -> range for Python 3 compatibility (and consistency with
    # the sibling implementation of this method).
    for i in range(utt_number):
        in_file_name = in_file_list[i]
        in_data_stream_name = in_file_name.split('.')[-1]
        in_feature_dim = in_dimension_dict[in_data_stream_name]
        in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

        ref_file_name = ref_file_list[i]
        ref_data_stream_name = ref_file_name.split('.')[-1]
        ref_feature_dim = in_dimension_dict[ref_data_stream_name]
        ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

        if in_frame_number == ref_frame_number:
            continue  # already aligned: skip rewrite

        target_features = numpy.zeros((ref_frame_number, in_feature_dim))
        common_frames = min(in_frame_number, ref_frame_number)
        target_features[0:common_frames, ] = in_features[0:common_frames, ]
        io_funcs.array_to_binary_file(target_features, in_file_name)

    logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' %(in_data_stream_name, ref_data_stream_name))
def load_next_utterance_S2SML(self):
    """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

    Sub-divides the multi-level-unit (MLU) linguistic input into phone,
    syllable and word matrices (column ranges from self.MLU_div) and the
    duration stream into per-word / per-syllable / per-phone counts.
    Returns Theano shared variables plus the raw numpy arrays.
    """
    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
    dur_features, dur_frame_number = io_fun.load_binary_file_frame(self.dur_files_list[self.file_index], 1)

    ### MLU features sub-division ###
    temp_set_MLU = in_features[0:lab_frame_number, ]
    temp_set_y = out_features[0:out_frame_number, ]

    # Each level consists of two disjoint column ranges, joined side by side.
    temp_set_phone = numpy.concatenate([temp_set_MLU[:, self.MLU_div['phone'][0]: self.MLU_div['phone'][1]], temp_set_MLU[:, self.MLU_div['phone'][2]: self.MLU_div['phone'][3]]], axis = 1)
    temp_set_syl = numpy.concatenate([temp_set_MLU[:, self.MLU_div['syl'][0]: self.MLU_div['syl'][1]], temp_set_MLU[:, self.MLU_div['syl'][2]: self.MLU_div['syl'][3]]], axis = 1)
    temp_set_word = numpy.concatenate([temp_set_MLU[:, self.MLU_div['word'][0]: self.MLU_div['word'][1]], temp_set_MLU[:, self.MLU_div['word'][2]: self.MLU_div['word'][3] ]], axis = 1)

    ### duration array sub-division ###
    # Assumed layout: word durations, syllable durations, then one entry per
    # phone (the last lab_frame_number entries) — TODO confirm with writer.
    dur_features = numpy.reshape(dur_features, (-1, ))
    temp_set_d = dur_features.astype(int)
    dur_word_syl = temp_set_d[0: -lab_frame_number]

    num_ph = lab_frame_number
    # Syllable durations sum to the phone count; the reversed cumulative sum
    # locates the word/syllable split inside dur_word_syl.
    num_syl = (numpy.where(numpy.cumsum(dur_word_syl[::-1])==lab_frame_number)[0][0] + 1)
    num_words = len(dur_word_syl) - num_syl

    temp_set_dur_phone = temp_set_d[-num_ph:]
    temp_set_dur_word = dur_word_syl[0: num_words]
    temp_set_dur_syl = dur_word_syl[num_words: ]

    ### additional feature matrix (syllable+phone+frame=432) ###
    num_frames = sum(temp_set_dur_phone)
    temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))
    # Syllable features sampled at each syllable's last phone row.
    temp_set_af[0: num_syl, self.MLU_div['length'][0]: self.MLU_div['length'][1] ] = temp_set_syl[numpy.cumsum(temp_set_dur_syl)-1]
    temp_set_af[0: num_ph, self.MLU_div['length'][1]: self.MLU_div['length'][2]] = temp_set_phone
    # NOTE(review): unassigned regions of temp_set_af remain uninitialised
    # (numpy.empty) — presumably handled downstream.

    ### input word feature matrix ###
    # Word boundaries in phone units: cumulative syllable durations taken at
    # each word's last syllable.
    temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
    syl_bound = numpy.cumsum(temp_set_dur_word)
    for indx in range(num_words):
        temp_set_dur_word_segments[indx] = int(sum(temp_set_dur_syl[0: syl_bound[indx]]))
    temp_set_x = temp_set_word[temp_set_dur_word_segments-1]

    ### rest of the code similar to S2S ###
    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

    return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d, temp_set_af
def get_file_lengths(self):
    """Scan all remaining files and record per-utterance frame counts.

    Populates self.file_length_dict with 'framenum2utt', 'utt2framenum' and
    'utt2index' mappings, then resets the reader state.
    """
    io_funcs = BinaryIOCollection()

    self.file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}, 'utt2index':{}}

    ### read file by file ###
    while self.file_index < self.list_size:
        x_path = self.x_files_list[self.file_index]
        in_features, lab_frame_number = io_funcs.load_binary_file_frame(x_path, self.n_ins)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
        base_file_name = os.path.basename(x_path).split('.')[0]

        # tolerate a few frames of mismatch between label and acoustic streams
        if abs(lab_frame_number - out_frame_number) >= 5:
            self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
            raise
        frame_number = min(lab_frame_number, out_frame_number)

        self.file_length_dict['framenum2utt'].setdefault(frame_number, []).append(base_file_name)
        self.file_length_dict['utt2framenum'][base_file_name] = frame_number
        self.file_length_dict['utt2index'][base_file_name] = self.file_index

        self.file_index += 1

    self.end_reading = True
    self.file_index = 0
    self.reset()
def get_file_lengths(self):
    """Record per-utterance frame counts for every remaining file.

    Fills self.file_length_dict ('framenum2utt', 'utt2framenum', 'utt2index')
    and resets the reader state when done.
    """
    io_funcs = BinaryIOCollection()

    self.file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}, 'utt2index':{}}

    ### read file by file ###
    for index in range(self.file_index, self.list_size):
        self.file_index = index

        in_features, lab_frame_number = io_funcs.load_binary_file_frame(self.x_files_list[index], self.n_ins)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[index], self.n_outs)
        base_file_name = os.path.basename(self.x_files_list[index]).split('.')[0]

        if abs(lab_frame_number - out_frame_number) < 5:
            ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            frame_number = min(lab_frame_number, out_frame_number)
        else:
            self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
            raise

        if frame_number in self.file_length_dict['framenum2utt']:
            self.file_length_dict['framenum2utt'][frame_number].append(base_file_name)
        else:
            self.file_length_dict['framenum2utt'][frame_number] = [base_file_name]

        self.file_length_dict['utt2framenum'][base_file_name] = frame_number
        self.file_length_dict['utt2index'][base_file_name] = index

    self.end_reading = True
    self.file_index = 0
    self.reset()
def simple_scale_variance(indir, outdir, var_file_dict, out_dimension_dict, file_id_list, gv_weight=1.0):
    """Scale per-utterance variance of selected streams towards the global variance.

    Simple variance scaling (Silen et al. 2012, paragraph 3.1): the utterance
    standard deviation is interpolated with the global one using gv_weight
    (must be in [0, 1]) and features are rescaled around the utterance mean.
    Streams not in streams_to_scale are copied unchanged.
    """
    ## TODO: Lots of things like stream names hardcoded here; 3 for delta + delta-delta; ...
    all_streams = ['cmp', 'mgc', 'lf0', 'bap']
    streams_to_scale = ['mgc']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # Fixed: floor division keeps the static dimension an integer under
        # Python 3 (`size / 3` would be a float and break the slicing below).
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(
            var_file_dict[feature_name], 1)
        # keep only the static coefficients (first third of the vector)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(
                    infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std = numpy.std(speech, axis=0)

                # NOTE(review): static_variances is used directly as a std —
                # confirm the var file stores standard deviations.
                global_std = numpy.transpose((static_variances[stream]))
                weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std

                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))

                scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)
            else:
                os.system('cp %s %s' % (infile, outfile))
def load_next_utterance(self):
    """Load the data for one utterance. This function will be called when
    utterance-by-utterance loading is required (e.g., sequential training).

    Returns ((shared_x, shared_y), temp_set_x, temp_set_y); when
    self.reshape_io is set, the arrays are reshaped to (1, T, dim) float32.
    """
    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty((self.buffer_size, self.n_outs))

    io_fun = BinaryIOCollection()

    in_features, lab_frame_number = io_fun.load_binary_file_frame(
        self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(
        self.y_files_list[self.file_index], self.n_outs)

    # Bug fix: base_file_name was referenced by the debug print below before
    # being assigned (it was only defined inside the error branch), which
    # raised NameError on every call.  Compute it up front.
    base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]

    print(' %%%%% {} : {} / {} '.format(base_file_name, self.n_ins, self.n_outs))

    frame_number = lab_frame_number
    if abs(lab_frame_number - out_frame_number) < 5:
        ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
        if lab_frame_number > out_frame_number:
            frame_number = out_frame_number
    else:
        self.logger.critical(
            "the number of frames in label and acoustic features are different: %d vs %d (%s)"
            % (lab_frame_number, out_frame_number, base_file_name))
        raise

    temp_set_y = out_features[0:frame_number, ]
    temp_set_x = in_features[0:frame_number, ]

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    # reshape input-output to (1, T, dim) float32 for batched consumers
    if self.reshape_io:
        temp_set_x = numpy.reshape(temp_set_x, (1, temp_set_x.shape[0], self.n_ins))
        temp_set_y = numpy.reshape(temp_set_y, (1, temp_set_y.shape[0], self.n_outs))

        temp_set_x = numpy.array(temp_set_x, 'float32')
        temp_set_y = numpy.array(temp_set_y, 'float32')

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')

    shared_set_xy = (shared_set_x, shared_set_y)

    return shared_set_xy, temp_set_x, temp_set_y
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict, out_dimension_dict, file_id_list):
    """Write copies of each utterance scaled with a range of GV weights.

    Tries interpolation weights 0.0 .. 1.0 (step 0.1) for combining global and
    per-utterance standard deviation on the streams in streams_to_scale; other
    streams are copied unchanged.  Returns the extended file id list.
    """
    all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name,size) in list(out_dimension_dict.items()):
        # Bug fix: under Python 3 `size/3` is a float and the slice below
        # (`var_values[:<float>, :]`) raises; floor division keeps it an int.
        static_dimension_dict[feature_name] = size//3

    io_funcs = BinaryIOCollection()
    for feature_name in list(var_file_dict.keys()):
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # keep only the static part (first third; rest is delta/delta-delta)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            local_weight = 1.0 - gv_weight
            extended_uttname = uttname + '_gv' + str(gv_weight)
            print(extended_uttname)
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std = numpy.std(speech, axis=0)
                    # NOTE(review): static_variances is used directly as a
                    # standard deviation — confirm the var file stores std.
                    global_std = numpy.transpose((static_variances[stream]))
                    weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                    std_ratio = weighted_global_std / utt_std
                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes,1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes,1))
                    scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)
                else:
                    os.system('cp %s %s'%(infile, outfile))
            # one id per (utterance, weight) combination — NOTE(review): the
            # mangled original's placement was ambiguous; confirm downstream
            # consumers expect no per-stream duplicates.
            file_id_list_out.append(extended_uttname)

    return file_id_list_out
def load_next_utterance(self):
    """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

    Batched variant: loads 32 consecutive utterances per call and returns
    (shared, temp_x, temp_y) as parallel lists of length 32.
    """
    shared = []
    temp_x = []
    temp_y = []
    # NOTE(review): the hard-coded batch of 32 here must agree with the
    # "+ 31" end-of-list check after the loop — confirm list_size is always
    # a multiple of 32, otherwise the final indices can run past the list.
    for i in xrange(32):
        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))

        # a fresh IO helper per utterance (could be hoisted; kept as-is)
        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(
            self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(
            self.y_files_list[self.file_index], self.n_outs)

        frame_number = lab_frame_number
        if abs(
                lab_frame_number - out_frame_number
        ) < 5:  ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            if lab_frame_number > out_frame_number:
                frame_number = out_frame_number
        else:
            base_file_name = self.x_files_list[self.file_index].split(
                '/')[-1].split('.')[0]
            self.logger.critical(
                "the number of frames in label and acoustic features are different: %d vs %d (%s)"
                % (lab_frame_number, out_frame_number, base_file_name))
            raise

        # trim both streams to the common frame count
        temp_set_y = out_features[0:frame_number, ]
        temp_set_x = in_features[0:frame_number, ]

        shared_set_x = temp_set_x
        shared_set_y = temp_set_y

        shared_set_xy = (shared_set_x, shared_set_y)
        shared.append(shared_set_xy)
        temp_x.append(numpy.asarray(shared_set_x, dtype=numpy.float32))
        temp_y.append(numpy.asarray(shared_set_y, dtype=numpy.float32))
        self.file_index += 1

    # wrap around when fewer than a full batch of 32 remains
    if self.file_index + 31 >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    return shared, temp_x, temp_y
def simple_scale_variance(indir, outdir, var_file_dict, out_dimension_dict, file_id_list, gv_weight=1.0):
    """Scale per-utterance variance of selected streams towards the global variance.

    Simple variance scaling (Silen et al. 2012, paragraph 3.1): each stream in
    streams_to_scale has its standard deviation interpolated with the global
    one using gv_weight (must be in [0, 1]) and is rescaled around the
    utterance mean.  Other streams are copied unchanged.
    """
    ## TODO: Lots of things like stream names hardcoded here; 3 for delta + delta-delta; ...
    all_streams = ['cmp','mgc','lf0','bap']
    streams_to_scale = ['mgc']

    static_variances = {}
    static_dimension_dict = {}
    for (feature_name,size) in out_dimension_dict.items():
        # Fixed: floor division — identical to `/` on Python 2 ints, and keeps
        # the slice index an integer under Python 3.
        static_dimension_dict[feature_name] = size//3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # keep only the static part (first third; rest is delta/delta-delta)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std = numpy.std(speech, axis=0)

                # NOTE(review): static_variances is used directly as a std —
                # confirm the var file stores standard deviations.
                global_std = numpy.transpose((static_variances[stream]))
                weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std

                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes,1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes,1))

                scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)
            else:
                os.system('cp %s %s'%(infile, outfile))
def compute_mean(self, file_list, start_index, end_index):
    """Compute the mean over columns [start_index, end_index) across all files.

    Returns a (1, end_index - start_index) vector averaged over every frame
    of every file in file_list.
    """
    local_feature_dimension = end_index - start_index

    mean_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)
        # accumulate the column-wise sum of the selected feature range
        mean_vector += features[:, start_index:end_index].sum(axis=0).reshape(1, local_feature_dimension)
        all_frame_number += current_frame_number

    mean_vector /= float(all_frame_number)

    self.logger.info('computed mean vector of length %d :' % mean_vector.shape[1])
    self.logger.info(' mean: %s' % mean_vector)

    return mean_vector
def merge_label(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    """Append a per-utterance (1, feat_dim) feature to every frame of each label file."""
    utt_number = len(new_feat_file_list)
    if utt_number != len(binary_label_file_list):
        print("the number of new feature input files and label files should be the same!\n")
        sys.exit(1)

    io_funcs = BinaryIOCollection()
    for i in range(utt_number):
        lab_features, lab_frame_number = io_funcs.load_binary_file_frame(binary_label_file_list[i], self.lab_dim)

        # single-row utterance-level feature, repeated for every frame
        utt_feature = io_funcs.load_binary_file(new_feat_file_list[i], self.feat_dim)
        frame_features = numpy.tile(utt_feature, (lab_frame_number, 1))

        # label columns first, then the broadcast feature columns
        merged_features = numpy.zeros((lab_frame_number, self.lab_dim + self.feat_dim))
        merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
        merged_features[0:lab_frame_number, self.lab_dim:self.lab_dim + self.feat_dim] = frame_features[0:lab_frame_number, ]

        io_funcs.array_to_binary_file(merged_features, out_feat_file_list[i])
def compute_std(self, file_list, mean_vector, start_index, end_index):
    """Compute the standard deviation over columns [start_index, end_index).

    Parameters
    ----------
    file_list : binary feature files to scan
    mean_vector : (1, end_index - start_index) mean from compute_mean
    start_index, end_index : column range to analyse

    Returns a (1, end_index - start_index) std vector.
    """
    local_feature_dimension = end_index - start_index

    # Bug fix: the accumulator was allocated as (1, self.feature_dimension),
    # which only broadcast correctly when the requested range covered the
    # whole feature vector; any narrower range raised a shape-mismatch on +=.
    std_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

        std_vector += numpy.reshape(numpy.sum((features[:, start_index:end_index] - mean_matrix) ** 2, axis=0), (1, local_feature_dimension))
        all_frame_number += current_frame_number

    std_vector /= float(all_frame_number)
    std_vector = std_vector ** 0.5

    self.logger.info('computed std vector of length %d' % std_vector.shape[1] )
    self.logger.info(' std: %s' % std_vector)

    return std_vector
def read_and_transform_data_from_file_list(in_file_list, dim, seq_length=200, merge_size=1):
    """Read feature files into one flat buffer and slice it into fixed-length sequences.

    After every `merge_size` files the write position is rounded up to a
    multiple of `seq_length`, so sequences never span a merge boundary.
    Returns an array of shape (num_of_samples, seq_length).

    NOTE(review): the final reshape to (num_of_samples, seq_length) is only
    valid when dim == 1 — for dim > 1 reshape would raise. TODO confirm
    callers never pass dim > 1.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(in_file_list)
    temp_set = np.zeros((FRAME_BUFFER_SIZE, dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        in_file_name = in_file_list[i]
        in_features, frame_number = io_funcs.load_binary_file_frame(
            in_file_name, dim)
        base_file_name = os.path.basename(in_file_name).split(".")[0]

        temp_set[current_index:current_index + frame_number, ] = in_features
        current_index += frame_number

        # pad up to the next seq_length boundary at each merge point
        if (i + 1) % merge_size == 0:
            current_index = seq_length * (int(
                np.ceil(float(current_index) / float(seq_length))))

        drawProgressBar(i + 1, num_of_utt)

    sys.stdout.write("\n")

    num_of_samples = int(np.ceil(float(current_index) / float(seq_length)))

    temp_set = temp_set[0:num_of_samples * seq_length, ]
    temp_set = temp_set.reshape(num_of_samples, seq_length)

    return temp_set
def feature_denormalisation(self, in_file_list, out_file_list, mean_vector, std_vector):
    """Invert mean/variance normalisation (x * std + mean) file by file.

    Args:
        in_file_list, out_file_list: parallel lists of input/output paths.
        mean_vector, std_vector: vectors of size ``self.feature_dimension``.

    Raises:
        AssertionError: if list lengths or vector sizes are inconsistent.
    """
    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical(
            'The input and output file numbers are not the same! %d vs %d'
            % (len(in_file_list), len(out_file_list)))
        raise
    try:
        assert mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
    except AssertionError:
        logger.critical(
            'the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature'
        )
        raise
    # FIX: xrange is Python 2 only — use range (consistent with siblings).
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(
            in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
        norm_features = features * std_matrix + mean_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def feature_normalisation(self, in_file_list, out_file_list):
    """Mean/variance normalise all files, computing statistics lazily.

    Returns:
        (mean_vector, std_vector) used for the normalisation.
    """
    logger = logging.getLogger('feature_normalisation')
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
        raise
    # FIX: `== None` on a numpy array is elementwise and raises/ambiguates in
    # a boolean context; identity must be tested with `is None` (the sibling
    # implementation in this file already does so).
    if self.mean_vector is None:
        self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
    if self.std_vector is None:
        self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)
    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    # FIX: xrange -> range for Python 3.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
    return self.mean_vector, self.std_vector
def compute_std(self, file_list, mean_vector, start_index, end_index):
    """Compute per-dimension standard deviation over columns
    [start_index, end_index) of all files and cache it on ``self.std_vector``.

    Returns:
        numpy array of shape (1, end_index - start_index).
    """
    logger = logging.getLogger('feature_normalisation')
    local_feature_dimension = end_index - start_index
    # BUG FIX: accumulator width must equal the selected sub-range; the
    # original (1, self.feature_dimension) shape broke broadcasting whenever
    # a partial column range was requested.
    std_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0
    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_vector += numpy.reshape(numpy.sum((features[:, start_index:end_index] - mean_matrix) ** 2, axis=0), (1, local_feature_dimension))
        all_frame_number += current_frame_number
    std_vector /= float(all_frame_number)
    std_vector = std_vector ** 0.5
    logger.info('computed std vector of length %d' % std_vector.shape[1])
    logger.info(' std: %s' % std_vector)
    self.std_vector = std_vector
    return std_vector
def feature_normalisation(self, in_file_list, out_file_list):
    """Mean/variance normalise all files, computing statistics lazily.

    Returns:
        (mean_vector, std_vector) used for the normalisation.
    """
    logger = logging.getLogger('feature_normalisation')
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        logger.critical(
            'The input and output file numbers are not the same! %d vs %d' %
            (len(in_file_list), len(out_file_list)))
        raise
    # FIX: `== None` is elementwise for numpy arrays; use identity (`is None`)
    # as the sibling implementation in this file does.
    if self.mean_vector is None:
        self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
    if self.std_vector is None:
        self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)
    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    # FIX: xrange -> range for Python 3.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(
            in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
    return self.mean_vector, self.std_vector
def compute_mean(self, file_list, start_index, end_index):
    """Compute the mean of feature columns [start_index, end_index) over all
    files, cache it on ``self.mean_vector``, and return it as (1, sub_dim)."""
    logger = logging.getLogger('feature_normalisation')
    sub_dim = end_index - start_index
    accumulator = numpy.zeros((1, sub_dim))
    frame_count = 0
    io_funcs = BinaryIOCollection()
    for path in file_list:
        features, n_frames = io_funcs.load_binary_file_frame(path, self.feature_dimension)
        column_sums = numpy.sum(features[:, start_index:end_index], axis=0)
        accumulator += numpy.reshape(column_sums, (1, sub_dim))
        frame_count += n_frames
    accumulator /= float(frame_count)
    logger.info('computed mean vector of length %d :' % accumulator.shape[1])
    logger.info(' mean: %s' % accumulator)
    self.mean_vector = accumulator
    return accumulator
def read_and_transform_data_from_file_list(in_file_list, dim, seq_length=200, merge_size=1):
    """Concatenate binary feature files and slice the result into fixed-length
    sequences of ``seq_length`` frames.

    The cursor is rounded up to a sequence boundary after every
    ``merge_size`` utterances so merge groups never share a sequence.
    """
    io_funcs = BinaryIOCollection()
    n_files = len(in_file_list)
    frame_store = np.zeros((FRAME_BUFFER_SIZE, dim))
    cursor = 0
    for file_idx, file_path in enumerate(in_file_list, start=1):
        feats, n_frames = io_funcs.load_binary_file_frame(file_path, dim)
        base_file_name = os.path.basename(file_path).split(".")[0]
        frame_store[cursor:cursor + n_frames, ] = feats
        cursor += n_frames
        if file_idx % merge_size == 0:
            cursor = seq_length * int(np.ceil(float(cursor) / float(seq_length)))
        drawProgressBar(file_idx, n_files)
    sys.stdout.write("\n")
    n_sequences = int(np.ceil(float(cursor) / float(seq_length)))
    frame_store = frame_store[0:n_sequences * seq_length, ]
    # NOTE(review): the reshape drops `dim`; presumably dim == 1 — confirm.
    frame_store = frame_store.reshape(n_sequences, seq_length)
    return frame_store
def normal_standardization(self, in_file_list, out_file_list, feature_dimension):
    """Z-score normalise every file: compute corpus mean/std over the full
    feature range, write normalised copies, and return (mean, std)."""
    self.feature_dimension = feature_dimension
    mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
    std_vector = self.compute_std(in_file_list, mean_vector, 0, feature_dimension)
    io_funcs = BinaryIOCollection()
    for src_path, dst_path in zip(in_file_list, out_file_list):
        features, n_frames = io_funcs.load_binary_file_frame(src_path, self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (n_frames, 1))
        std_matrix = numpy.tile(std_vector, (n_frames, 1))
        normalised = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(normalised, dst_path)
    return mean_vector, std_vector
def compute_std(self, file_list, mean_vector, start_index, end_index):
    """Compute per-dimension standard deviation over columns
    [start_index, end_index) across all files.

    Returns:
        numpy array of shape (1, end_index - start_index).
    """
    local_feature_dimension = end_index - start_index
    # BUG FIX: accumulator width must equal the selected sub-range; the
    # original used (1, self.feature_dimension), which breaks broadcasting
    # whenever a partial column range is requested.
    std_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0
    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(
            file_name, self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_vector += numpy.reshape(
            numpy.sum(
                (features[:, start_index:end_index] - mean_matrix)**2,
                axis=0), (1, local_feature_dimension))
        all_frame_number += current_frame_number
    std_vector /= float(all_frame_number)
    std_vector = std_vector**0.5
    self.logger.info('computed std vector of length %d' % std_vector.shape[1])
    self.logger.info(' std: %s' % std_vector)
    return std_vector
def compute_distortion(self, file_id_list, reference_dir, generation_dir, file_ext, feature_dim):
    """Compute an objective error between reference and generated features.

    Dispatches on ``file_ext``:
      '.lf0' -> returns (RMSE of voiced F0, correlation, V/UV error rate)
      '.dur' -> returns (duration RMSE, correlation)
      '.mgc' -> accumulates MSE excluding dimension 0 (returned as plain MSE)
      other  -> returns plain MSE over all dimensions.
    """
    total_voiced_frame_number = 0
    distortion = 0.0
    vuv_error = 0
    total_frame_number = 0
    io_funcs = BinaryIOCollection()
    # Corpus-level accumulators; only used for the lf0/dur correlation stats.
    ref_all_files_data = numpy.reshape(numpy.array([]), (-1,1))
    gen_all_files_data = numpy.reshape(numpy.array([]), (-1,1))
    for file_id in file_id_list:
        ref_file_name = reference_dir + '/' + file_id + file_ext
        gen_file_name = generation_dir + '/' + file_id + file_ext
        ref_data, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, feature_dim)
        gen_data, gen_frame_number = io_funcs.load_binary_file_frame(gen_file_name, feature_dim)
        if ref_frame_number != gen_frame_number:
            self.logger.critical("The number of frames is not the same: %d vs %d. Error in compute_distortion.py\n." %(ref_frame_number, gen_frame_number))
            # NOTE(review): a bare `raise` outside an except block raises
            # RuntimeError("No active exception to re-raise") — probably
            # intended to be a real exception type; confirm before changing.
            raise
        if file_ext == '.lf0':
            ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
            gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
            temp_distortion, temp_vuv_error, voiced_frame_number = self.compute_f0_mse(ref_data, gen_data)
            vuv_error += temp_vuv_error
            total_voiced_frame_number += voiced_frame_number
        elif file_ext == '.dur':
            # Sum state durations to one total per phone before comparing.
            ref_data = numpy.reshape(numpy.sum(ref_data, axis=1), (-1, 1))
            gen_data = numpy.reshape(numpy.sum(gen_data, axis=1), (-1, 1))
            ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
            gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
            continue;
        elif file_ext == '.mgc':
            # Skip dimension 0 for spectral distortion.
            temp_distortion = self.compute_mse(ref_data[:, 1:feature_dim], gen_data[:, 1:feature_dim])
        else:
            temp_distortion = self.compute_mse(ref_data, gen_data)
        distortion += temp_distortion
        total_frame_number += ref_frame_number
    if file_ext == '.dur':
        dur_rmse = self.compute_rmse(ref_all_files_data, gen_all_files_data)
        dur_corr = self.compute_corr(ref_all_files_data, gen_all_files_data)
        return dur_rmse, dur_corr
    elif file_ext == '.lf0':
        # Average over voiced frames only, then convert MSE -> RMSE.
        distortion /= float(total_voiced_frame_number)
        vuv_error /= float(total_frame_number)
        distortion = numpy.sqrt(distortion)
        f0_corr = self.compute_f0_corr(ref_all_files_data, gen_all_files_data)
        return distortion, f0_corr, vuv_error
    else:
        distortion /= float(total_frame_number)
        return distortion
def feature_normalisation(self, in_file_list, out_file_list): logger = logging.getLogger('feature_normalisation') # self.feature_dimension = feature_dimension try: assert len(in_file_list) == len(out_file_list) except AssertionError: logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list))) raise if self.mean_vector is None: self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension) if self.std_vector is None: self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension) io_funcs = BinaryIOCollection() file_number = len(in_file_list) for i in range(file_number): features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension) mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1)) std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1)) norm_features = (features - mean_matrix) / std_matrix print(current_frame_number,in_file_list[i]) norm_features=numpy.concatenate([norm_features[:,:self.feature_dimension-2],features[:,self.feature_dimension-2:]],axis=-1) # in fact the problem is that I normalized the out put xvector pvector and onehotvector.... so we have to in formalized this print(' normalized vector :{}'.format(norm_features[1,:])) io_funcs.array_to_binary_file(norm_features, out_file_list[i]) return self.mean_vector, self.std_vector
def load_covariance(var_file_dict, out_dimension_dict):
    """Load per-stream variance files into a dict of (dim, 1) column vectors.

    ``var_file_dict`` maps feature name -> binary file path; the expected
    dimensionality of each stream comes from ``out_dimension_dict``.
    """
    io_funcs = BinaryIOCollection()
    var = {}
    for feature_name in var_file_dict.keys():
        raw_values, _n_rows = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        var[feature_name] = numpy.reshape(raw_values, (out_dimension_dict[feature_name], 1))
    return var
def load_next_utterance(self):
    """Load the data for one utterance. This function will be called when
    utterance-by-utterance loading is required (e.g., sequential training).

    Returns:
        ((shared_x, shared_y), temp_set_x, temp_set_y)

    Raises:
        ValueError: if label and acoustic frame counts differ by 5 or more.
    """
    io_fun = BinaryIOCollection()
    in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
    # Tolerate a small (<5 frame) mismatch by truncating to the shorter stream.
    frame_number = lab_frame_number
    if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
        if lab_frame_number > out_frame_number:
            frame_number = out_frame_number
    else:
        base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
        message = ("the number of frames in label and acoustic features are different: %d vs %d (%s)"
                   % (lab_frame_number, out_frame_number, base_file_name))
        self.logger.critical(message)
        # BUG FIX: a bare `raise` outside an except block raises
        # "RuntimeError: No active exception to re-raise", hiding the real
        # problem; raise an explicit exception instead.
        raise ValueError(message)
    temp_set_y = out_features[0:frame_number, ]
    temp_set_x = in_features[0:frame_number, ]
    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0
    # Optionally add a leading batch axis of size 1 for sequence models.
    if self.reshape_io:
        temp_set_x = numpy.reshape(temp_set_x, (1, temp_set_x.shape[0], self.n_ins))
        temp_set_y = numpy.reshape(temp_set_y, (1, temp_set_y.shape[0], self.n_outs))
        temp_set_x = numpy.array(temp_set_x, 'float32')
        temp_set_y = numpy.array(temp_set_y, 'float32')
    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_xy = (shared_set_x, shared_set_y)
    return shared_set_xy, temp_set_x, temp_set_y
def load_covariance(self, var_file_dict, out_dimension_dict):
    """Load per-stream variance files into ``self.var`` as (dim, 1) columns."""
    io_funcs = BinaryIOCollection()
    for feature_name in list(var_file_dict.keys()):
        raw_values, _n_rows = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        self.var[feature_name] = numpy.reshape(raw_values, (out_dimension_dict[feature_name], 1))
def merge_data(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    '''Merge per-frame new features with normalised label features.

    Unlike merge_label, the new feature here is a full (T, feat_dim)
    trajectory rather than a single utterance-level vector.
    '''
    utt_number = len(new_feat_file_list)
    if utt_number != len(binary_label_file_list):
        print(
            "the number of new feature input files and label files should be the same!\n"
        )
        sys.exit(1)
    # Extension of the new feature stream; used only for the debug log below.
    new_feat_ext = new_feat_file_list[0].split('/')[-1].split('.')[1]
    io_funcs = BinaryIOCollection()
    for i in range(utt_number):
        lab_file_name = binary_label_file_list[i]
        new_feat_file_name = new_feat_file_list[i]
        out_feat_file_name = out_feat_file_list[i]
        lab_features, lab_frame_number = io_funcs.load_binary_file_frame(
            lab_file_name, self.lab_dim)
        new_features, feat_frame_number = io_funcs.load_binary_file_frame(
            new_feat_file_name, self.feat_dim)
        # NOTE(review): only labels-longer-than-features mismatches (>5) are
        # rejected; the opposite direction passes silently — confirm intended.
        # Also, a bare `raise` outside an except block raises RuntimeError.
        if (lab_frame_number - feat_frame_number) > 5:
            base_file_name = new_feat_file_list[i].split('/')[-1].split(
                '.')[0]
            self.logger.critical(
                "the number of frames in label and new features are different: %d vs %d (%s)"
                % (lab_frame_number, feat_frame_number, base_file_name))
            raise
        merged_features = numpy.zeros(
            (lab_frame_number, self.lab_dim + self.feat_dim))
        merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
        # NOTE(review): the target slice uses feat_frame_number while the
        # source uses lab_frame_number; numpy clips both to the available
        # rows, but verify behaviour for utterances with frame mismatches.
        merged_features[0:feat_frame_number, self.lab_dim:self.lab_dim +
                        self.feat_dim] = new_features[0:lab_frame_number, ]
        io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
        self.logger.debug(
            'merged new feature %s of %d frames with %d label features' %
            (new_feat_ext, feat_frame_number, lab_frame_number))
def load_next_utterance_S2S(self):
    """Load one utterance (inputs, outputs, durations) for sequence-to-sequence
    training, advancing (and wrapping) the internal file cursor.

    Returns:
        ((shared_x, shared_y, shared_d), temp_set_x, temp_set_y, temp_set_d)
    """
    io_fun = BinaryIOCollection()
    in_features, lab_frame_number = io_fun.load_binary_file_frame(
        self.x_files_list[self.file_index], self.n_ins)
    out_features, out_frame_number = io_fun.load_binary_file_frame(
        self.y_files_list[self.file_index], self.n_outs)
    temp_set_x = in_features[0:lab_frame_number, ]
    temp_set_y = out_features[0:out_frame_number, ]
    if self.dur_files_list:
        dur_features, dur_frame_number = io_fun.load_binary_file_frame(
            self.dur_files_list[self.file_index], 1)
        assert sum(dur_features) == out_frame_number
    else:
        # No duration stream: treat the whole utterance as one segment.
        dur_features = numpy.array([out_frame_number])
    temp_set_d = numpy.reshape(dur_features, (-1, )).astype(int)
    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0
    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)
    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)
    return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d
def modify_dur_from_phone_alignment_labels(self, label_file_name, gen_dur_file_name, gen_lab_file_name):
    """Rewrite phone-alignment label timings using predicted phone durations.

    Silence labels keep their original aligned duration; all other phones
    take a predicted duration (frames -> HTK 100 ns units, 5 ms frames).

    Args:
        label_file_name: original alignment label file (start end label per line).
        gen_dur_file_name: binary file of predicted durations, one per phone.
        gen_lab_file_name: output label file with modified timings.
    """
    logger = logging.getLogger("dur")
    dur_dim = 1  # one predicted duration value per phone
    io_funcs = BinaryIOCollection()
    dur_features, frame_number = io_funcs.load_binary_file_frame(
        gen_dur_file_name, dur_dim)
    # FIX: use context managers so handles are closed even on error.
    with open(label_file_name) as fid:
        utt_labels = fid.readlines()
    label_number = len(utt_labels)
    logger.info('loaded %s, %3d labels' % (label_file_name, label_number))
    current_index = 0   # phone index into dur_features
    prev_end_time = 0   # running end time in HTK 100 ns units
    with open(gen_lab_file_name, 'w') as out_fid:
        for line in utt_labels:
            line = line.strip()
            if len(line) < 1:
                continue
            # FIX: raw string for the regex (avoids invalid-escape warnings).
            temp_list = re.split(r'\s+', line)
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]
            label_binary_flag = self.check_silence_pattern(full_label)
            if label_binary_flag == 1:
                # Silence keeps its original (aligned) duration.
                current_phone_dur = end_time - start_time
                out_fid.write(
                    str(prev_end_time) + ' ' +
                    str(prev_end_time + current_phone_dur) + ' ' +
                    full_label + '\n')
                prev_end_time = prev_end_time + current_phone_dur
                continue
            else:
                # Predicted frames -> HTK time: 5 ms frames, 100 ns units.
                phone_dur = dur_features[current_index]
                phone_dur = int(phone_dur) * 5 * 10000
                out_fid.write(
                    str(prev_end_time) + ' ' +
                    str(prev_end_time + phone_dur) + ' ' + full_label + '\n')
                prev_end_time = prev_end_time + phone_dur
                current_index += 1
    logger.debug(
        'modifed label with predicted duration of %d frames x %d features'
        % dur_features.shape)
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Read paired input/output binary feature files.

    With ``sequential_training`` the data is returned as two dicts keyed by
    utterance name; otherwise as two packed matrices. Also returns a dict of
    frame-count bookkeeping (framenum2utt / utt2framenum).
    """
    io_funcs = BinaryIOCollection()
    num_of_utt = len(inp_file_list)
    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}
    if sequential_training:
        temp_set_x, temp_set_y = {}, {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))
    current_index = 0
    for i, (inp_file_name, out_file_name) in enumerate(zip(inp_file_list, out_file_list)):
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)
        base_file_name = os.path.basename(inp_file_name).split(".")[0]
        # Reject utterances whose streams disagree by more than 5 frames.
        if abs(inp_frame_number - out_frame_number) > 5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        frame_number = min(inp_frame_number, out_frame_number)
        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            temp_set_x[current_index:current_index + frame_number, ] = inp_features[0:frame_number]
            temp_set_y[current_index:current_index + frame_number, ] = out_features[0:frame_number]
            current_index += frame_number
        file_length_dict['framenum2utt'].setdefault(frame_number, []).append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number
        drawProgressBar(i + 1, num_of_utt)
    sys.stdout.write("\n")
    if not sequential_training:
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]
    return temp_set_x, temp_set_y, file_length_dict
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Read paired input/output binary feature files into dicts (sequential
    training) or packed matrices, plus frame-count bookkeeping."""
    io_funcs = BinaryIOCollection()
    n_utts = len(inp_file_list)
    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}
    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))
    write_pos = 0
    for utt_idx in range(n_utts):
        inp_path, out_path = inp_file_list[utt_idx], out_file_list[utt_idx]
        x_feats, n_in = io_funcs.load_binary_file_frame(inp_path, inp_dim)
        y_feats, n_out = io_funcs.load_binary_file_frame(out_path, out_dim)
        base_file_name = os.path.basename(inp_path).split(".")[0]
        # Streams must agree to within 5 frames; otherwise abort.
        if abs(n_in - n_out) > 5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(n_in, n_out, base_file_name))
            sys.exit(0)
        frame_number = min(n_in, n_out)
        if sequential_training:
            temp_set_x[base_file_name] = x_feats[0:frame_number]
            temp_set_y[base_file_name] = y_feats[0:frame_number]
        else:
            temp_set_x[write_pos:write_pos + frame_number, ] = x_feats[0:frame_number]
            temp_set_y[write_pos:write_pos + frame_number, ] = y_feats[0:frame_number]
            write_pos += frame_number
        if frame_number in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number].append(base_file_name)
        else:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        file_length_dict['utt2framenum'][base_file_name] = frame_number
        drawProgressBar(utt_idx + 1, n_utts)
    sys.stdout.write("\n")
    if not sequential_training:
        temp_set_x = temp_set_x[0:write_pos, ]
        temp_set_y = temp_set_y[0:write_pos, ]
    return temp_set_x, temp_set_y, file_length_dict
def load_min_max_values(self, label_norm_file):
    """Load a single-column binary file holding stacked [min; max] vectors
    and split it into ``self.min_vector`` / ``self.max_vector``."""
    logger = logging.getLogger("acoustic_norm")
    io_funcs = BinaryIOCollection()
    stacked, frame_number = io_funcs.load_binary_file_frame(label_norm_file, 1)
    stacked = numpy.reshape(stacked, (-1, ))
    half = frame_number // 2
    self.min_vector = stacked[0:half]
    self.max_vector = stacked[half:]
    logger.info('Loaded min max values from the trained data for feature dimension of %d' % self.feature_dimension)
def load_mean_std_values(self, acoustic_norm_file):
    """Load a single-column binary file holding stacked [mean; std] vectors,
    cache them on the object, and return (mean_vector, std_vector)."""
    logger = logging.getLogger('feature_normalisation')
    io_funcs = BinaryIOCollection()
    stacked, frame_number = io_funcs.load_binary_file_frame(acoustic_norm_file, 1)
    stacked = numpy.reshape(stacked, (-1, ))
    midpoint = frame_number // 2
    self.mean_vector = stacked[0:midpoint]
    self.std_vector = stacked[midpoint:]
    logger.info('Loaded mean std values from the trained data for feature dimension of %d' % self.feature_dimension)
    return self.mean_vector, self.std_vector
def load_mean_std_values(self, acoustic_norm_file):
    """Restore cached normalisation statistics from a binary file.

    The file is one column: the first half is the mean vector, the second
    half the std vector. Returns (mean_vector, std_vector).
    """
    logger = logging.getLogger('feature_normalisation')
    io_funcs = BinaryIOCollection()
    packed, n_values = io_funcs.load_binary_file_frame(acoustic_norm_file, 1)
    packed = numpy.reshape(packed, (-1, ))
    split_at = n_values // 2
    self.mean_vector = packed[0:split_at]
    self.std_vector = packed[split_at:]
    logger.info('Loaded mean std values from the trained data for feature dimension of %d' % self.feature_dimension)
    return self.mean_vector, self.std_vector
def compose_predict_label(self, orig_label_file, gen_label_file, predict_duration_file):
    """Replace the last five (state-duration) columns of a label matrix with
    predicted durations and write the result as integers."""
    io_funcs = BinaryIOCollection()
    state_number = 5
    label_matrix = io_funcs.file2matrix(orig_label_file)
    duration, in_frame_number = io_funcs.load_binary_file_frame(
        predict_duration_file, state_number)
    # One predicted duration row per label row.
    assert label_matrix.shape[0] == in_frame_number
    label_matrix[:, -5:] = duration
    label_matrix = label_matrix.astype(int)
    io_funcs.matrix2file(label_matrix, gen_label_file)
def load_next_utterance_S2S(self):
    """Fetch one utterance (x, y, durations) for S2S training and advance the
    internal file cursor, wrapping and flagging end-of-list."""
    reader = BinaryIOCollection()
    x_data, n_lab = reader.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
    y_data, n_out = reader.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
    temp_set_x = x_data[0:n_lab, ]
    temp_set_y = y_data[0:n_out, ]
    if not self.dur_files_list:
        # Without a duration stream, the whole utterance is one segment.
        dur_features = numpy.array([n_out])
    else:
        dur_features, _n_dur = reader.load_binary_file_frame(self.dur_files_list[self.file_index], 1)
        assert sum(dur_features) == n_out
    temp_set_d = numpy.reshape(dur_features, (-1, )).astype(int)
    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0
    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)
    return (shared_set_x, shared_set_y, shared_set_d), temp_set_x, temp_set_y, temp_set_d
def modify_dur_from_state_alignment_labels(self, label_file_name, gen_dur_file_name, gen_lab_file_name):
    """Rewrite state-alignment label timings using predicted state durations.

    Silence states keep their original aligned duration; other states take
    the predicted duration for their (phone, state) pair, converted from
    frames to HTK 100 ns units (5 ms frames).
    """
    logger = logging.getLogger("dur")
    state_number = self.state_number
    dur_dim = state_number  # one predicted duration per HMM state
    io_funcs = BinaryIOCollection()
    dur_features, frame_number = io_funcs.load_binary_file_frame(gen_dur_file_name, dur_dim)
    fid = open(label_file_name)
    utt_labels = fid.readlines()
    fid.close()
    label_number = len(utt_labels)
    logger.info('loaded %s, %3d labels' % (label_file_name, label_number) )
    out_fid = open(gen_lab_file_name, 'w')
    current_index = 0   # phone index into dur_features
    prev_end_time = 0   # running end time in HTK 100 ns units
    for line in utt_labels:
        line = line.strip()
        if len(line) < 1:
            continue
        temp_list = re.split('\s+', line)
        start_time = int(temp_list[0])
        end_time = int(temp_list[1])
        full_label = temp_list[2]
        # Labels end with a state marker "[k]"; extract the digit k.
        full_label_length = len(full_label) - 3  # remove state information [k]
        state_index = full_label[full_label_length + 1]
        state_index = int(state_index) - 1
        label_binary_flag = self.check_silence_pattern(full_label)
        if label_binary_flag == 1:
            # Silence keeps its original aligned duration.
            current_state_dur = end_time - start_time
            out_fid.write(str(prev_end_time)+' '+str(prev_end_time+current_state_dur)+' '+full_label+'\n')
            prev_end_time = prev_end_time+current_state_dur
            continue;
        else:
            # Frames -> HTK time: 5 ms frames, 100 ns units.
            state_dur = dur_features[current_index, state_index-1]
            state_dur = int(state_dur)*5*10000
            out_fid.write(str(prev_end_time)+' '+str(prev_end_time+state_dur)+' '+full_label+'\n')
            prev_end_time = prev_end_time+state_dur
            # Advance to the next phone after its last state.
            if state_index == state_number:
                current_index += 1
    logger.debug('modifed label with predicted duration of %d frames x %d features' % dur_features.shape )
def duration_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict, meta=None):
    """Round predicted duration features to integer frame counts and write them.

    Only a single duration output stream is supported. When ``dimension``
    exceeds the hard-coded state count, a single column is selected.
    Optional ``meta`` triggers hardcode_duration post-processing.
    """
    logger = logging.getLogger('param_generation')
    logger.debug('duration_decomposition for %d files' % len(in_file_list))
    state_number = 5  ## hard coding, try removing in future?
    if len(list(out_dimension_dict.keys())) > 1:
        logger.critical(
            "we don't support any additional features along with duration as of now."
        )
        sys.exit(1)
    else:
        feature_name = list(out_dimension_dict.keys())[0]
    io_funcs = BinaryIOCollection()
    findex = 0
    flen = len(in_file_list)
    for file_name in in_file_list:
        findex = findex + 1
        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]
        features, frame_number = io_funcs.load_binary_file_frame(
            file_name, dimension)
        # Durations must be whole, strictly positive frame counts.
        gen_features = numpy.int32(numpy.round(features))
        gen_features[gen_features < 1] = 1
        if dimension > state_number:
            # NOTE(review): selects the single column at index `state_number`
            # (presumably the full-phone duration) — confirm against writer.
            gen_features = gen_features[:, state_number]
        logger.info('processing %4d of %4d: %s' %
                    (findex, flen, file_name))
        if meta is not None:
            gen_features = self.hardcode_duration(meta, gen_features)
        new_file_name = os.path.join(
            dir_name, file_id + file_extension_dict[feature_name])
        io_funcs.array_to_binary_file(gen_features, new_file_name)
        logger.debug('wrote to file %s' % new_file_name)
def read_data_from_file_list_shared(speaker_id_list, inp_file_list, out_file_list, inp_dim, out_dim):
    """Read paired features for multi-speaker training.

    Inputs are shared across speakers (one dict keyed by utterance name);
    outputs are partitioned into one dict per speaker, selected by matching
    the speaker id as a substring of the utterance name.
    """
    io_funcs = BinaryIOCollection()
    num_of_utt = len(inp_file_list)
    num_of_spk = len(speaker_id_list)
    file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}}
    temp_set_x = {}
    temp_set_y = {}
    ### read file by file ###
    for i in range(num_of_utt):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)
        base_file_name = os.path.basename(inp_file_name).split(".")[0]
        # Streams must agree to within 5 frames; otherwise abort the run.
        if abs(inp_frame_number-out_frame_number)>5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        else:
            frame_number = min(inp_frame_number, out_frame_number)
        # Write to dictionaries
        temp_set_x[base_file_name] = inp_features[0:frame_number]
        temp_set_y[base_file_name] = out_features[0:frame_number]
        if frame_number not in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            file_length_dict['framenum2utt'][frame_number].append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number
        drawProgressBar(i+1, num_of_utt)
    sys.stdout.write("\n")
    set_x = temp_set_x
    set_y = {speaker: {} for speaker in speaker_id_list}
    for base_file_name in temp_set_y.keys():
        # NOTE(review): assumes exactly one speaker id matches each utterance
        # name; zero or multiple matches would fail on the int() conversion
        # below — confirm the corpus naming scheme guarantees this.
        speaker_ind = np.where([speaker_id in base_file_name for speaker_id in speaker_id_list])
        speaker = speaker_id_list[int(speaker_ind[0])]
        set_y[speaker][base_file_name] = temp_set_y[base_file_name]
    return set_x, set_y, file_length_dict
def load_min_max_values(self, label_norm_file):
    """Restore cached min/max normalisation vectors from a binary file.

    The file is one column: the first half is the min vector, the second
    half the max vector.
    """
    logger = logging.getLogger("acoustic_norm")
    io_funcs = BinaryIOCollection()
    packed, n_values = io_funcs.load_binary_file_frame(label_norm_file, 1)
    packed = numpy.reshape(packed, (-1, ))
    split_at = n_values // 2
    self.min_vector = packed[0:split_at]
    self.max_vector = packed[split_at:]
    logger.info(
        'Loaded min max values from the trained data for feature dimension of %d'
        % self.feature_dimension)
def load_prev_fea(self,):
    """Load acoustic normalisation statistics and test linguistic features.

    Returns:
        (cmp_mean_vector, cmp_std_vector, test_lin_x, test_lab_x)
    """
    # load acoustic var and mean and linguistic feature
    # norm_info_file stores two stacked float32 rows; row 0 is taken as the
    # mean and row 1 as the std (assumed from the (2, -1) reshape — TODO
    # confirm against the file writer).
    fid = open(self.norm_info_file, 'rb')
    cmp_min_max = np.fromfile(fid, dtype=np.float32)
    fid.close()
    cmp_min_max = cmp_min_max.reshape((2, -1))
    cmp_mean_vector = cmp_min_max[0, ]
    cmp_std_vector = cmp_min_max[1, ]
    io_funcs = BinaryIOCollection()
    inp_features, frame_number = io_funcs.load_binary_file_frame(
        self.test_norm_path, self.n_in)
    # Split off the last column (presumably the utterance-embedding id).
    test_lin_x, test_lab_x = np.hsplit(inp_features, np.array([-1]))
    # set 100 as vary utterance embedding
    test_lab_x = np.tile(np.array(100), (test_lab_x.shape[0], 1))
    return cmp_mean_vector, cmp_std_vector, test_lin_x, test_lab_x
def read_data_from_file_list(in_file_list, dim, frame_buffer_size=500000):
    """Concatenate all binary feature files into one (N, dim) matrix.

    Args:
        in_file_list: binary feature files to read.
        dim: number of feature columns per frame.
        frame_buffer_size: capacity (in frames) of the pre-allocated buffer;
            generalized from the previously hard-coded 500000 so larger
            corpora can be loaded (default preserves old behaviour).

    Returns:
        numpy array of shape (total_frames, dim).
    """
    io_funcs = BinaryIOCollection()
    temp_set = np.empty((frame_buffer_size, dim))
    ### read file by file ###
    current_index = 0
    for i in tqdm.tqdm(range(len(in_file_list))):
        in_file_name = in_file_list[i]
        in_features, frame_number = io_funcs.load_binary_file_frame(in_file_name, dim)
        temp_set[current_index:current_index+frame_number, ] = in_features
        current_index += frame_number
    # Trim the unused tail of the buffer.
    temp_set = temp_set[0:current_index, ]
    return temp_set
def load_norm_stats(stats_file, dim, method="MVN"):
    """Load normalisation statistics and build an sklearn-style scaler.

    The stats file holds exactly two rows of ``dim`` values:
    for MVN, [mean; scale]; for MINMAX, [min_; scale].

    Args:
        stats_file: binary file with the two statistic rows.
        dim: feature dimensionality.
        method: "MVN" or "MINMAX".

    Returns:
        A configured StandardScaler or MinMaxScaler.

    Raises:
        ValueError: for an unknown ``method`` (previously this fell through
            and crashed with NameError on the return line).
    """
    #### load norm stats ####
    io_funcs = BinaryIOCollection()
    norm_matrix, frame_number = io_funcs.load_binary_file_frame(stats_file, dim)
    assert frame_number == 2
    if method == "MVN":
        scaler = preprocessing.StandardScaler()
        scaler.mean_ = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]
    elif method == "MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99))
        scaler.min_ = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]
    else:
        # FIX: fail fast with a clear message instead of NameError.
        raise ValueError("unsupported normalisation method: %s" % method)
    return scaler
def read_test_data_from_file_list(inp_file_list, inp_dim, sequential_training=True):
    """Load test features as a per-utterance dict (sequential training)
    or as one stacked matrix, plus frame-length bookkeeping.

    Returns ``(temp_set_x, file_length_dict)`` where ``file_length_dict``
    maps frame counts to utterance names and utterance names to frame
    counts.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)
    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x = {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))

    ### read file by file ###
    current_index = 0
    for utt_idx, inp_file_name in enumerate(inp_file_list):
        inp_features, frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        if sequential_training:
            temp_set_x[base_file_name] = inp_features
        else:
            temp_set_x[current_index:current_index+frame_number, ] = inp_features[0:frame_number]
            current_index += frame_number

        file_length_dict['framenum2utt'].setdefault(frame_number, []).append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number

        drawProgressBar(utt_idx + 1, num_of_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        temp_set_x = temp_set_x[0:current_index, ]

    return temp_set_x, file_length_dict
def duration_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict):
    """Round predicted durations to positive integer frame counts and write
    one duration file per input file."""
    logger = logging.getLogger('param_generation')
    logger.debug('duration_decomposition for %d files' % len(in_file_list) )

    state_number = 5  ## hard coding, try removing in future?

    feature_names = list(out_dimension_dict.keys())
    if len(feature_names) > 1:
        logger.critical("we don't support any additional features along with duration as of now.")
        sys.exit(1)
    feature_name = feature_names[0]

    io_funcs = BinaryIOCollection()

    total_files = len(in_file_list)
    for findex, file_name in enumerate(in_file_list, start=1):
        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]

        features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)

        # Durations must be positive integers: round first, then clamp to >= 1.
        gen_features = numpy.int32(numpy.round(features))
        gen_features[gen_features < 1] = 1

        # NOTE(review): selects the single column at index `state_number`
        # when per-state durations are present — confirm intended column.
        if dimension > state_number:
            gen_features = gen_features[:, state_number]

        logger.info('processing %4d of %4d: %s' % (findex, total_files, file_name) )

        new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])
        io_funcs.array_to_binary_file(gen_features, new_file_name)
        logger.debug('wrote to file %s' % new_file_name)
def normal_standardization(self, in_file_list, out_file_list, feature_dimension):
    """Mean/variance-normalise every file in *in_file_list*.

    Computes a global mean and standard-deviation vector over the whole
    file list, writes the standardised features to the parallel
    *out_file_list*, and returns ``(mean_vector, std_vector)``.
    """
    # self.dimension_dict = dimension_dict
    self.feature_dimension = feature_dimension
    mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
    std_vector = self.compute_std(in_file_list, mean_vector, 0, feature_dimension)

    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    # range, not the Python-2-only xrange, for Python 3 compatibility and
    # consistency with the rest of the module.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
        norm_features = (features - mean_matrix) / std_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])

    return mean_vector, std_vector
def feature_denormalisation(self, in_file_list, out_file_list, mean_vector, std_vector):
    """Invert mean/variance normalisation (``x * std + mean``) file by file.

    *in_file_list* and *out_file_list* must be parallel lists; the mean and
    std vectors must match ``self.feature_dimension``.
    """
    io_funcs = BinaryIOCollection()
    file_number = len(in_file_list)
    try:
        assert len(in_file_list) == len(out_file_list)
    except AssertionError:
        # use self.logger (as compute_mean does) instead of a bare `logger`
        # name that is not defined in this scope
        self.logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
        raise
    try:
        assert mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
    except AssertionError:
        self.logger.critical('the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature')
        raise

    # range, not the Python-2-only xrange.
    for i in range(file_number):
        features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)
        mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
        std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
        norm_features = features * std_matrix + mean_matrix
        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def compute_mean(self, file_list, start_index, end_index):
    """Mean of feature columns ``[start_index, end_index)`` over all frames
    of every file in *file_list*.

    Returns a vector of shape ``(1, end_index - start_index)``.
    """
    local_feature_dimension = end_index - start_index

    mean_vector = numpy.zeros((1, local_feature_dimension))
    all_frame_number = 0

    io_funcs = BinaryIOCollection()
    for file_name in file_list:
        features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)
        column_sums = numpy.sum(features[:, start_index:end_index], axis=0)
        mean_vector += numpy.reshape(column_sums, (1, local_feature_dimension))
        all_frame_number += current_frame_number

    mean_vector /= float(all_frame_number)

    self.logger.info('computed mean vector of length %d :' % mean_vector.shape[1] )
    self.logger.info(' mean: %s' % mean_vector)

    return mean_vector
def load_next_batch(self):
    """Load one training batch of padded sequences.

    Depending on ``self.training_algo``, the sequence length is taken from
    the current batch (1), the current bucket of similar-length utterances
    (2), or the user configuration (3).  Frames are packed into a buffer,
    padded to a multiple of ``self.seq_length``, reshaped to
    (num_samples, seq_length, dim), and returned as
    ``(shared_set_xy, temp_set_x, temp_set_y)``.
    """
    io_funcs = BinaryIOCollection()

    ## set sequence length for batch training
    if(self.training_algo == 1):
        # set seq length to maximum seq length from current batch
        self.set_seq_length_from_current_batch()
    elif(self.training_algo == 2):
        # set seq length to maximum seq length from current bucket
        # (advance buckets until a non-empty one is found)
        while not self.current_bucket_size:
            self.get_next_bucket()
    elif(self.training_algo == 3):
        # seq length is set based on default/user configuration
        pass;

    temp_set_x = numpy.zeros((self.buffer_size, self.n_ins))
    temp_set_y = numpy.zeros((self.buffer_size, self.n_outs))

    ### read file by file ###
    current_index = 0
    while True:
        if current_index >= self.buffer_size:
            print('buffer size reached by file index %d' %(self.file_index))
            break

        if self.training_algo == 2:
            # choose utterance from current bucket list
            base_file_name = self.current_bucket_list[self.bucket_file_index]
            self.utt_index = self.file_length_dict['utt2index'][base_file_name]
        else:
            # choose utterance randomly from current file list
            #self.utt_index = numpy.random.randint(self.list_size)
            ## choose utterance in serial order
            self.utt_index = self.file_index
            base_file_name = os.path.basename(self.x_files_list[self.utt_index]).split('.')[0]

        in_features, lab_frame_number = io_funcs.load_binary_file_frame(self.x_files_list[self.utt_index], self.n_ins)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[self.utt_index], self.n_outs)

        # frame count comes from the precomputed length dict, not the file read
        frame_number = self.file_length_dict['utt2framenum'][base_file_name]
        temp_set_x[current_index:current_index+frame_number, ] = in_features
        temp_set_y[current_index:current_index+frame_number, ] = out_features
        current_index += frame_number

        # after every merge_size utterances, pad up to a seq_length boundary
        if((self.file_index+1)%self.merge_size == 0):
            num_of_samples = int(numpy.ceil(float(current_index)/float(self.seq_length)))
            current_index = self.seq_length * num_of_samples

        self.file_index += 1

        # break for any of the below conditions
        if self.training_algo == 2:
            self.bucket_file_index += 1
            if(self.bucket_file_index >= self.current_bucket_size):
                # bucket exhausted; reset so the next call advances buckets
                self.current_bucket_size = 0
                break;
            if(self.bucket_file_index%self.batch_size==0):
                break;
        else:
            if(self.file_index%self.batch_size==0) or (self.file_index >= self.list_size):
                break

    # epoch boundary: signal end of data and rewind the file index
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    num_of_samples = int(numpy.ceil(float(current_index)/float(self.seq_length)))

    temp_set_x = temp_set_x[0: num_of_samples*self.seq_length, ]
    temp_set_y = temp_set_y[0: num_of_samples*self.seq_length, ]

    temp_set_x = temp_set_x.reshape(num_of_samples, self.seq_length, self.n_ins)
    temp_set_y = temp_set_y.reshape(num_of_samples, self.seq_length, self.n_outs)

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')

    shared_set_xy = (shared_set_x, shared_set_y)

    return shared_set_xy, temp_set_x, temp_set_y
def load_next_batch_S2S(self):
    """Load the data for one utterance. This function will be called when
    utterance-by-utterance loading is required (e.g., sequential training).

    Each list entry may be a comma-separated group of files that are
    concatenated into one sequence.  Returns
    ``(shared_set_xyd, temp_set_x, temp_set_y, temp_set_d)``.
    """
    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
    temp_set_d = numpy.empty((self.buffer_size, 1))

    io_fun = BinaryIOCollection()

    lab_start_frame_number = 0
    lab_end_frame_number = 0
    out_start_frame_number = 0
    out_end_frame_number = 0

    new_x_files_list = self.x_files_list[self.file_index].split(',')
    new_y_files_list = self.y_files_list[self.file_index].split(',')
    # Guard the duration list: the original indexed dur_files_list
    # unconditionally, raising IndexError when no duration files are
    # provided, before the `if not self.dur_files_list` check could run.
    if self.dur_files_list:
        new_dur_files_list = self.dur_files_list[self.file_index].split(',')
    else:
        new_dur_files_list = []

    # range, not the Python-2-only xrange.
    for new_file_index in range(len(new_x_files_list)):
        in_features, lab_frame_number = io_fun.load_binary_file_frame(new_x_files_list[new_file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(new_y_files_list[new_file_index], self.n_outs)

        lab_end_frame_number += lab_frame_number
        out_end_frame_number += out_frame_number

        temp_set_x[lab_start_frame_number: lab_end_frame_number, ] = in_features[0:lab_frame_number, ]
        temp_set_y[out_start_frame_number: out_end_frame_number, ] = out_features[0:out_frame_number, ]

        if not self.dur_files_list:
            # no duration files: store the total output frame count instead
            dur_frame_number = out_end_frame_number
            temp_set_d = numpy.array([dur_frame_number])
        else:
            dur_features, dur_frame_number = io_fun.load_binary_file_frame(new_dur_files_list[new_file_index], 1)
            # per-label durations must sum to the acoustic frame count
            assert sum(dur_features) == out_frame_number
            temp_set_d[lab_start_frame_number: lab_end_frame_number, ] = dur_features[0:lab_frame_number, ]

        lab_start_frame_number = lab_end_frame_number
        out_start_frame_number = out_end_frame_number

    temp_set_x = temp_set_x[0:lab_end_frame_number, ]
    temp_set_y = temp_set_y[0:out_end_frame_number, ]
    temp_set_d = temp_set_d[0:lab_end_frame_number, ]

    temp_set_d = numpy.reshape(temp_set_d, (-1, ))
    temp_set_d = temp_set_d.astype(int)

    self.file_index += 1
    if self.file_index >= self.list_size:
        self.end_reading = True
        self.file_index = 0

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')
    shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

    shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

    return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d
def acoustic_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict, var_file_dict, do_MLPG=True, cfg=None):
    """Split generated acoustic feature files into per-stream parameter
    files, optionally running MLPG smoothing, V/UV masking of F0, and
    silence enforcement from label alignments.

    Each feature in ``self.gen_wav_features`` is sliced out of the stacked
    matrix by its stream offset and written to ``<file_id><extension>``.
    """
    logger = logging.getLogger('param_generation')

    logger.debug('acoustic_decomposition for %d files' % len(in_file_list) )

    self.load_covariance(var_file_dict, out_dimension_dict)

    # build the start offset of each stream within the stacked feature matrix
    stream_start_index = {}
    dimension_index = 0
    recorded_vuv = False
    vuv_dimension = None

    for feature_name in list(out_dimension_dict.keys()):
        # if feature_name != 'vuv':
        stream_start_index[feature_name] = dimension_index
        # else:
        #     vuv_dimension = dimension_index
        #     recorded_vuv = True

        dimension_index += out_dimension_dict[feature_name]

    io_funcs = BinaryIOCollection()

    mlpg_algo = MLParameterGeneration()

    findex=0
    flen=len(in_file_list)
    for file_name in in_file_list:
        findex=findex+1
        dir_name = os.path.dirname(file_name)
        file_id = os.path.splitext(os.path.basename(file_name))[0]

        features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)

        logger.info('processing %4d of %4d: %s' % (findex,flen,file_name) )

        for feature_name in self.gen_wav_features:
            logger.debug(' feature: %s' % feature_name)

            current_features = features[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]

            if FAST_MLPG:
                ### fast version wants variance per frame, not single global one:
                var = self.var[feature_name]
                var = numpy.transpose(numpy.tile(var,frame_number))
            else:
                var = self.var[feature_name]

            # print  var.shape[1]
            if do_MLPG == False:
                gen_features = current_features
            else:
                # stream holds static+delta+delta-delta, hence the //3
                gen_features = mlpg_algo.generation(current_features, var, out_dimension_dict[feature_name]//3)
            # else:
            #     self.logger.critical("the dimensions do not match for MLPG: %d vs %d" %(var.shape[1], out_dimension_dict[feature_name]))
            #     raise

            logger.debug(' feature dimensions: %d by %d' %(gen_features.shape[0], gen_features.shape[1]))

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    # mark unvoiced frames (and implausibly low F0) as self.inf_float
                    vuv_feature = features[:, stream_start_index['vuv']:stream_start_index['vuv']+1]

                    for i in range(frame_number):
                        if vuv_feature[i, 0] < 0.5 or gen_features[i, 0] < numpy.log(20):
                            gen_features[i, 0] = self.inf_float

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

            if self.enforce_silence:
                silence_pattern = cfg.silence_pattern
                label_align_dir = cfg.in_label_align_dir

                in_f = open(label_align_dir+'/'+file_id+'.lab','r')
                for line in in_f.readlines():
                    line = line.strip()
                    if len(line) < 1:
                        continue

                    temp_list = re.split('\s+', line)
                    # NOTE(review): converts label times (presumably HTK 100 ns
                    # units) to frame indices assuming a 5 ms shift — confirm
                    start_time = int(int(temp_list[0])*(10**-4)/5)
                    end_time = int(int(temp_list[1])*(10**-4)/5)

                    full_label = temp_list[2]

                    label_binary_flag = self.check_silence_pattern(full_label, silence_pattern)

                    if label_binary_flag:
                        if feature_name in ['lf0', 'F0', 'mag']:
                            gen_features[start_time:end_time, :] = self.inf_float
                        else:
                            gen_features[start_time:end_time, :] = 0.0

            io_funcs.array_to_binary_file(gen_features, new_file_name)
            logger.debug(' wrote to file %s' % new_file_name)
def load_next_partition(self):
    """Load one block data. The number of frames will be the buffer size set during intialisation.

    Fills a fixed-size frame buffer from consecutive utterances, carrying
    any overflow of the last utterance over to the next call via
    ``self.remain_*``.  Frames are shuffled (x and y with the same seed so
    rows stay aligned) and returned as
    ``(shared_set_xy, temp_set_x, temp_set_y)``.
    """
    self.logger.debug('loading next partition')

    temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
    temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
    current_index = 0

    ### first check whether there are remaining data from previous utterance
    if self.remain_frame_number > 0:
        temp_set_x[current_index:self.remain_frame_number, ] = self.remain_data_x
        temp_set_y[current_index:self.remain_frame_number, ] = self.remain_data_y
        current_index += self.remain_frame_number

        self.remain_frame_number = 0

    io_fun = BinaryIOCollection()
    while True:
        if current_index >= self.buffer_size:
            break
        if self.file_index >= self.list_size:
            # epoch boundary: flag end of data and rewind
            self.end_reading = True
            self.file_index = 0
            break

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

        frame_number = lab_frame_number
        if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            if lab_frame_number > out_frame_number:
                frame_number = out_frame_number
        else:
            base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
            self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
            # NOTE(review): bare `raise` with no active exception raises
            # RuntimeError("No active exception to re-raise"); it still
            # aborts, but an explicit exception would be clearer
            raise

        out_features = out_features[0:frame_number, ]
        in_features = in_features[0:frame_number, ]

        if current_index + frame_number <= self.buffer_size:
            temp_set_x[current_index:current_index+frame_number, ] = in_features
            temp_set_y[current_index:current_index+frame_number, ] = out_features

            current_index = current_index + frame_number
        else:   ## if current utterance cannot be stored in the block, then leave the remaining part for the next block
            used_frame_number = self.buffer_size - current_index
            temp_set_x[current_index:self.buffer_size, ] = in_features[0:used_frame_number, ]
            temp_set_y[current_index:self.buffer_size, ] = out_features[0:used_frame_number, ]
            current_index = self.buffer_size

            self.remain_data_x = in_features[used_frame_number:frame_number, ]
            self.remain_data_y = out_features[used_frame_number:frame_number, ]
            self.remain_frame_number = frame_number - used_frame_number

        self.file_index += 1

    temp_set_x = temp_set_x[0:current_index, ]
    temp_set_y = temp_set_y[0:current_index, ]

    # identical seed before each shuffle keeps x/y rows paired
    numpy.random.seed(271639)
    numpy.random.shuffle(temp_set_x)
    numpy.random.seed(271639)
    numpy.random.shuffle(temp_set_y)

    shared_set_x = self.make_shared(temp_set_x, 'x')
    shared_set_y = self.make_shared(temp_set_y, 'y')

    shared_set_xy = (shared_set_x, shared_set_y)
    # temp_set_x = self.make_shared(temp_set_x, 'x')
    # temp_set_y = self.make_shared(temp_set_y, 'y')

    return shared_set_xy, temp_set_x, temp_set_y