Пример #1
0
    def load_next_utterance_S2SML(self):
        """Load the data for one utterance for S2SML training.

        Called when utterance-by-utterance loading is required (e.g.,
        sequential training).  Splits the multi-level-unit (MLU) input
        features into phone/syllable/word streams using the column ranges
        in ``self.MLU_div``, and splits the duration vector into word,
        syllable and phone durations.

        Returns:
            tuple: (shared_set_xyd, temp_set_x, temp_set_y, temp_set_d, temp_set_af)
                where shared_set_xyd is a (x, y, durations) triple of shared
                variables, temp_set_x is the word-level input matrix,
                temp_set_y the output feature matrix, temp_set_d the integer
                duration vector and temp_set_af the additional
                syllable+phone feature matrix.
        """

        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
        dur_features, dur_frame_number = io_fun.load_binary_file_frame(self.dur_files_list[self.file_index], 1)

        ### MLU features sub-division ###
        temp_set_MLU = in_features[0:lab_frame_number, ]
        temp_set_y   = out_features[0:out_frame_number, ]

        # Each stream is the concatenation of two column ranges taken from
        # the MLU feature matrix, as described by self.MLU_div.
        temp_set_phone = numpy.concatenate([temp_set_MLU[:, self.MLU_div['phone'][0]: self.MLU_div['phone'][1]], temp_set_MLU[:, self.MLU_div['phone'][2]: self.MLU_div['phone'][3]]], axis = 1)
        temp_set_syl   = numpy.concatenate([temp_set_MLU[:, self.MLU_div['syl'][0]: self.MLU_div['syl'][1]], temp_set_MLU[:, self.MLU_div['syl'][2]: self.MLU_div['syl'][3]]], axis = 1)
        temp_set_word  = numpy.concatenate([temp_set_MLU[:, self.MLU_div['word'][0]: self.MLU_div['word'][1]], temp_set_MLU[:, self.MLU_div['word'][2]: self.MLU_div['word'][3] ]], axis = 1)

        ### duration array sub-division ###
        # Duration layout: [word durs..., syl durs..., phone durs...] where the
        # last lab_frame_number entries are the per-phone durations.
        dur_features = numpy.reshape(dur_features, (-1, ))
        temp_set_d   = dur_features.astype(int)
        dur_word_syl = temp_set_d[0: -lab_frame_number]

        num_ph    = lab_frame_number
        # Number of syllables: the shortest suffix of dur_word_syl whose
        # durations sum exactly to the phone count.
        num_syl   = (numpy.where(numpy.cumsum(dur_word_syl[::-1])==lab_frame_number)[0][0] + 1)
        num_words = len(dur_word_syl) - num_syl

        temp_set_dur_phone = temp_set_d[-num_ph:]
        temp_set_dur_word  = dur_word_syl[0: num_words]
        temp_set_dur_syl   = dur_word_syl[num_words: ]

        ### additional feature matrix (syllable+phone+frame=432) ###
        num_frames = sum(temp_set_dur_phone)
        temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))

        # Syllable features are sampled at each syllable's last phone row;
        # phone features are copied one row per phone.
        temp_set_af[0: num_syl, self.MLU_div['length'][0]: self.MLU_div['length'][1] ] = temp_set_syl[numpy.cumsum(temp_set_dur_syl)-1]
        temp_set_af[0: num_ph, self.MLU_div['length'][1]: self.MLU_div['length'][2]] = temp_set_phone

        ### input word feature matrix ###
        temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
        syl_bound = numpy.cumsum(temp_set_dur_word)
        # range() replaces the Python-2-only xrange() used previously.
        for indx in range(num_words):
            temp_set_dur_word_segments[indx] = int(sum(temp_set_dur_syl[0: syl_bound[indx]]))
        temp_set_x = temp_set_word[temp_set_dur_word_segments-1]

        ### rest of the code similar to S2S ###
        self.file_index += 1

        if  self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x  = self.make_shared(temp_set_x, 'x')
        shared_set_y  = self.make_shared(temp_set_y, 'y')
        shared_set_d  = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

        shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

        return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d, temp_set_af
Пример #2
0
    def load_next_utterance_CTC(self):
        """Load one utterance for CTC training.

        Reads the input (label) and output (acoustic) feature files for the
        current ``self.file_index`` and builds the CTC target sequence:
        the blank symbol (index ``self.n_outs``) interleaved with the
        per-frame argmax class of the output features, i.e.
        ``[blank, c1, blank, c2, blank, ...]``.

        Returns:
            tuple: ((shared_x, shared_y), temp_set_x, temp_set_y)
        """
        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

        frame_number = lab_frame_number
        # Previously temp_set_x/temp_set_y were preallocated with
        # numpy.empty and immediately overwritten; the dead allocations
        # have been removed.
        temp_set_x = in_features[0:frame_number, ]

        temp_set_y = numpy.array([self.n_outs])
        for il in numpy.argmax(out_features, axis=1):
            temp_set_y = numpy.concatenate((temp_set_y, [il, self.n_outs]), axis=0)

        self.file_index += 1

        if  self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = theano.shared(numpy.asarray(temp_set_y, dtype='int32'), name='y', borrow=True)

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y
Пример #3
0
    def load_next_utterance(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

        Returns:
            tuple: (temp_set_x, temp_set_y) input and output feature
            matrices truncated to a common frame count.

        Raises:
            ValueError: if label and acoustic frame counts differ by 5 or more.
        """

        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

        frame_number = lab_frame_number
        if abs(lab_frame_number - out_frame_number) < 5:  ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            if lab_frame_number > out_frame_number:
                frame_number = out_frame_number
        else:
            base_file_name = self.x_files_list[self.file_index].split('/')[-1].split('.')[0]
            logging.info("the number of frames in label and acoustic features are different: %d vs %d (%s)" % (
            lab_frame_number, out_frame_number, base_file_name))
            # A bare `raise` here had no active exception and would itself
            # fail with RuntimeError; raise a meaningful error instead.
            raise ValueError("the number of frames in label and acoustic features are different: %d vs %d (%s)" % (
                lab_frame_number, out_frame_number, base_file_name))

        temp_set_y = out_features[0:frame_number, ]
        temp_set_x = in_features[0:frame_number, ]

        self.file_index += 1

        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0


        return temp_set_x, temp_set_y
Пример #4
0
    def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
        """Rewrite each input file in place so its row count matches the
        corresponding reference file.

        Longer inputs are truncated; shorter inputs are zero-padded.  Files
        that already match are left untouched.  Feature dimensions are
        looked up in ``in_dimension_dict`` by file extension.
        """
        logger = logging.getLogger("acoustic_comp")

        logger.info('making equal number of lines...')

        io_funcs = BinaryIOCollection()

        utt_number = len(in_file_list)

        in_data_stream_name = None
        ref_data_stream_name = None
        for i in range(utt_number):
            in_file_name = in_file_list[i]
            in_data_stream_name = in_file_name.split('.')[-1]
            in_feature_dim = in_dimension_dict[in_data_stream_name]
            in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

            ref_file_name = ref_file_list[i]
            ref_data_stream_name = ref_file_name.split('.')[-1]
            ref_feature_dim = in_dimension_dict[ref_data_stream_name]
            ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

            target_features = numpy.zeros((ref_frame_number, in_feature_dim))
            if in_frame_number == ref_frame_number:
                # Already aligned -- nothing to rewrite.
                continue
            elif in_frame_number > ref_frame_number:
                target_features[0:ref_frame_number, ] = in_features[0:ref_frame_number, ]
            elif in_frame_number < ref_frame_number:
                target_features[0:in_frame_number, ] = in_features[0:in_frame_number, ]
            io_funcs.array_to_binary_file(target_features, in_file_name)

        # Guard: with an empty file list the stream names were never bound,
        # which previously raised NameError here.
        if utt_number > 0:
            logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' %(in_data_stream_name, ref_data_stream_name))
def read_data_from_file_list(inp_file_list,
                             out_file_list,
                             inp_dim,
                             out_dim,
                             sequential_training=True):
    """Read paired input/output binary feature files.

    For sequential training, returns dicts keyed by utterance base name;
    otherwise returns two flat matrices truncated to the frames actually
    read.  Also returns a dict mapping frame counts to utterance names.

    Exits the process with a non-zero status when an input/output pair
    differs by more than 5 frames.
    """
    io_funcs = BinaryIOCollection()

    utt_len = len(inp_file_list)

    file_length_dict = {}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        temp_set_x = np.empty((BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(utt_len):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(
            inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(
            out_file_name, out_dim)
        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        if abs(inp_frame_number - out_frame_number) > 5:
            # Parenthesized print works under both Python 2 and 3.
            print('the number of frames in input and output features are different: %d vs %d (%s)' % (
                inp_frame_number, out_frame_number, base_file_name))
            # Exit with a failure status (was sys.exit(0), i.e. "success").
            sys.exit(1)
        else:
            frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features
            temp_set_y[base_file_name] = out_features
        else:
            temp_set_x[current_index:current_index +
                       frame_number, ] = inp_features
            temp_set_y[current_index:current_index +
                       frame_number, ] = out_features
            current_index += frame_number

        if frame_number not in file_length_dict:
            file_length_dict[frame_number] = [base_file_name]
        else:
            file_length_dict[frame_number].append(base_file_name)

        print_status(i, utt_len)

    sys.stdout.write("\n")

    if not sequential_training:
        # Trim the preallocated buffers down to the frames actually filled.
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
Пример #6
0
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict,
                                    out_dimension_dict, file_id_list):
    """Generate variance-scaled copies of each utterance for a range of
    global-variance interpolation weights (0.0 .. 1.0 in steps of 0.1).

    For streams in ``streams_to_scale`` the per-utterance standard
    deviation is interpolated with the global one and the speech is
    rescaled around its mean; other streams are copied verbatim.

    Returns the list of generated (extended) utterance names.
    """
    ## Try range of interpolation weights for combining global & local variance
    all_streams = ['cmp', 'HNR', 'F0', 'LSF', 'Gain', 'LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # Integer division: the static dimension is later used as a slice
        # index, which must be an int (size / 3 is a float on Python 3).
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(
            var_file_dict[feature_name], 1)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [
                0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0
        ]:
            local_weight = 1.0 - gv_weight
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                extended_uttname = uttname + '_gv' + str(gv_weight)
                # Parenthesized print works under both Python 2 and 3.
                print(extended_uttname)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(
                        infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std = numpy.std(speech, axis=0)

                    global_std = numpy.transpose((static_variances[stream]))

                    weighted_global_std = (gv_weight * global_std) + (
                        local_weight * utt_std)

                    std_ratio = weighted_global_std / utt_std

                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))

                    # Scale deviations from the utterance mean, keep the mean.
                    scaled_speech = ((speech - utt_mean_matrix) *
                                     std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)

                else:
                    os.system('cp %s %s' % (infile, outfile))
            file_id_list_out.append(extended_uttname)
    return file_id_list_out
Пример #7
0
    def merge_data(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
        '''
        Merge new features with normalised label features.

        For each utterance, concatenates the label matrix (self.lab_dim
        columns) with the new feature matrix (self.feat_dim columns) and
        writes the result to the corresponding output file.

        Raises:
            ValueError: if the label file has more than 5 frames beyond
                the new feature file.
        '''
        utt_number = len(new_feat_file_list)
        if utt_number != len(binary_label_file_list):
            print("the number of new feature input files and label files should be the same!\n");
            sys.exit(1)

        new_feat_ext   = new_feat_file_list[0].split('/')[-1].split('.')[1]

        io_funcs = BinaryIOCollection()
        for i in range(utt_number):
            lab_file_name = binary_label_file_list[i]
            new_feat_file_name = new_feat_file_list[i]
            out_feat_file_name = out_feat_file_list[i]

            lab_features, lab_frame_number  = io_funcs.load_binary_file_frame(lab_file_name, self.lab_dim)
            new_features, feat_frame_number = io_funcs.load_binary_file_frame(new_feat_file_name, self.feat_dim)


            if (lab_frame_number - feat_frame_number)>5:
                base_file_name = new_feat_file_list[i].split('/')[-1].split('.')[0]
                self.logger.critical("the number of frames in label and new features are different: %d vs %d (%s)" %(lab_frame_number, feat_frame_number, base_file_name))
                # A bare `raise` here had no active exception and would itself
                # fail with RuntimeError; raise a meaningful error instead.
                raise ValueError("the number of frames in label and new features are different: %d vs %d (%s)" %(lab_frame_number, feat_frame_number, base_file_name))

            merged_features = numpy.zeros((lab_frame_number, self.lab_dim+self.feat_dim))

            # Both slices clip to min(lab, feat) frames, so the assignment is
            # shape-safe whichever file is (slightly) longer.
            merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
            merged_features[0:feat_frame_number, self.lab_dim:self.lab_dim+self.feat_dim] = new_features[0:lab_frame_number, ]

            io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
            self.logger.debug('merged new feature %s of %d frames with %d label features' % (new_feat_ext, feat_frame_number,lab_frame_number) )
Пример #8
0
    def load_next_utterance_CTC(self):
        """Load one utterance for CTC training.

        Builds the CTC target sequence as the blank symbol (index
        ``self.n_outs``) interleaved with the per-frame argmax class of the
        output features: ``[blank, c1, blank, c2, blank, ...]``.

        Returns:
            tuple: ((shared_x, shared_y), temp_set_x, temp_set_y)
        """
        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(
            self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(
            self.y_files_list[self.file_index], self.n_outs)

        frame_number = lab_frame_number
        # Previously temp_set_x/temp_set_y were preallocated with
        # numpy.empty and immediately overwritten; the dead allocations
        # have been removed.
        temp_set_x = in_features[0:frame_number, ]

        temp_set_y = numpy.array([self.n_outs])
        for il in numpy.argmax(out_features, axis=1):
            temp_set_y = numpy.concatenate((temp_set_y, [il, self.n_outs]),
                                           axis=0)

        self.file_index += 1

        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = theano.shared(numpy.asarray(temp_set_y, dtype='int32'),
                                     name='y',
                                     borrow=True)

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y
Пример #9
0
    def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
        """Rewrite each input file in place so its row count matches the
        corresponding reference file.

        Longer inputs are truncated; shorter inputs are zero-padded.  Files
        that already match are left untouched.  Feature dimensions are
        looked up in ``in_dimension_dict`` by file extension.
        """
        logger = logging.getLogger("acoustic_comp")

        logger.info('making equal number of lines...')

        io_funcs = BinaryIOCollection()

        utt_number = len(in_file_list)

        in_data_stream_name = None
        ref_data_stream_name = None
        # range() replaces the Python-2-only xrange() used previously.
        for i in range(utt_number):
            in_file_name = in_file_list[i]
            in_data_stream_name = in_file_name.split('.')[-1]
            in_feature_dim = in_dimension_dict[in_data_stream_name]
            in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

            ref_file_name = ref_file_list[i]
            ref_data_stream_name = ref_file_name.split('.')[-1]
            ref_feature_dim = in_dimension_dict[ref_data_stream_name]
            ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

            target_features = numpy.zeros((ref_frame_number, in_feature_dim))
            if in_frame_number == ref_frame_number:
                # Already aligned -- nothing to rewrite.
                continue
            elif in_frame_number > ref_frame_number:
                target_features[0:ref_frame_number, ] = in_features[0:ref_frame_number, ]
            elif in_frame_number < ref_frame_number:
                target_features[0:in_frame_number, ] = in_features[0:in_frame_number, ]
            io_funcs.array_to_binary_file(target_features, in_file_name)

        # Guard: with an empty file list the stream names were never bound,
        # which previously raised NameError here.
        if utt_number > 0:
            logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' %(in_data_stream_name, ref_data_stream_name))
Пример #10
0
    def load_next_utterance_S2SML(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

        Splits the multi-level-unit (MLU) input features into phone,
        syllable and word streams using the column ranges in
        ``self.MLU_div``, and splits the duration vector into word,
        syllable and phone durations.

        Returns a tuple of:
            shared_set_xyd -- (shared x, shared y, shared durations)
            temp_set_x     -- word-level input feature matrix
            temp_set_y     -- output acoustic feature matrix
            temp_set_d     -- full integer duration vector
            temp_set_af    -- additional syllable+phone feature matrix
        """
        
        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
        dur_features, dur_frame_number = io_fun.load_binary_file_frame(self.dur_files_list[self.file_index], 1)
      
        ### MLU features sub-division ###
        temp_set_MLU = in_features[0:lab_frame_number, ]
        temp_set_y   = out_features[0:out_frame_number, ]
      
        # Each stream concatenates two column ranges from the MLU matrix,
        # as described by self.MLU_div.
        temp_set_phone = numpy.concatenate([temp_set_MLU[:, self.MLU_div['phone'][0]: self.MLU_div['phone'][1]], temp_set_MLU[:, self.MLU_div['phone'][2]: self.MLU_div['phone'][3]]], axis = 1)
        temp_set_syl   = numpy.concatenate([temp_set_MLU[:, self.MLU_div['syl'][0]: self.MLU_div['syl'][1]], temp_set_MLU[:, self.MLU_div['syl'][2]: self.MLU_div['syl'][3]]], axis = 1)
        temp_set_word  = numpy.concatenate([temp_set_MLU[:, self.MLU_div['word'][0]: self.MLU_div['word'][1]], temp_set_MLU[:, self.MLU_div['word'][2]: self.MLU_div['word'][3] ]], axis = 1)
        
        ### duration array sub-division ###
        # Duration layout: [word durs..., syl durs..., phone durs...], where
        # the last lab_frame_number entries are the per-phone durations.
        dur_features = numpy.reshape(dur_features, (-1, ))
        temp_set_d   = dur_features.astype(int)
        dur_word_syl = temp_set_d[0: -lab_frame_number]
        
        num_ph    = lab_frame_number
        # Number of syllables: the shortest suffix of dur_word_syl whose
        # durations sum exactly to the phone count.
        num_syl   = (numpy.where(numpy.cumsum(dur_word_syl[::-1])==lab_frame_number)[0][0] + 1)
        num_words = len(dur_word_syl) - num_syl 
        
        temp_set_dur_phone = temp_set_d[-num_ph:] 
        temp_set_dur_word  = dur_word_syl[0: num_words]
        temp_set_dur_syl   = dur_word_syl[num_words: ]
        
        ### additional feature matrix (syllable+phone+frame=432) ###
        num_frames = sum(temp_set_dur_phone)
        temp_set_af = numpy.empty((num_frames, self.MLU_div['length'][-1]))
        
        # Syllable features sampled at each syllable's final phone row;
        # phone features copied one row per phone.
        temp_set_af[0: num_syl, self.MLU_div['length'][0]: self.MLU_div['length'][1] ] = temp_set_syl[numpy.cumsum(temp_set_dur_syl)-1]
        temp_set_af[0: num_ph, self.MLU_div['length'][1]: self.MLU_div['length'][2]] = temp_set_phone
        
        ### input word feature matrix ###
        temp_set_dur_word_segments = numpy.zeros(num_words, dtype='int32')
        syl_bound = numpy.cumsum(temp_set_dur_word)
        for indx in range(num_words):
            temp_set_dur_word_segments[indx] = int(sum(temp_set_dur_syl[0: syl_bound[indx]]))
        # Pick each word's features at the row of its last frame-aligned unit.
        temp_set_x = temp_set_word[temp_set_dur_word_segments-1]
        
        ### rest of the code similar to S2S ###
        self.file_index += 1

        if  self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x  = self.make_shared(temp_set_x, 'x')
        shared_set_y  = self.make_shared(temp_set_y, 'y')
        shared_set_d  = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

        shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)
        
        return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d, temp_set_af
Пример #11
0
    def get_file_lengths(self):
        """Scan all x/y file pairs and build ``self.file_length_dict``.

        Populates three mappings: 'framenum2utt' (frame count -> list of
        utterance names), 'utt2framenum' (utterance -> frame count) and
        'utt2index' (utterance -> position in the file lists).  Calls
        ``self.reset()`` when done.

        Raises:
            ValueError: if a label/acoustic pair differs by 5 or more frames.
        """
        io_funcs = BinaryIOCollection()

        self.file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}, 'utt2index':{}}

        ### read file by file ###
        while True:
            if  self.file_index >= self.list_size:
                self.end_reading = True
                self.file_index = 0
                break

            in_features, lab_frame_number = io_funcs.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
            out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

            base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
            if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
                frame_number = min(lab_frame_number, out_frame_number)
            else:
                self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
                # A bare `raise` here had no active exception and would itself
                # fail with RuntimeError; raise a meaningful error instead.
                raise ValueError("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))

            if frame_number not in self.file_length_dict['framenum2utt']:
                self.file_length_dict['framenum2utt'][frame_number] = [base_file_name]
            else:
                self.file_length_dict['framenum2utt'][frame_number].append(base_file_name)

            self.file_length_dict['utt2framenum'][base_file_name] = frame_number
            self.file_length_dict['utt2index'][base_file_name] = self.file_index
            self.file_index += 1

        self.reset()
Пример #12
0
    def get_file_lengths(self):
        """Scan all x/y file pairs and build ``self.file_length_dict``.

        Populates three mappings: 'framenum2utt' (frame count -> list of
        utterance names), 'utt2framenum' (utterance -> frame count) and
        'utt2index' (utterance -> position in the file lists).  Calls
        ``self.reset()`` when done.

        Raises:
            ValueError: if a label/acoustic pair differs by 5 or more frames.
        """
        io_funcs = BinaryIOCollection()

        self.file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}, 'utt2index':{}}

        ### read file by file ###
        while True:
            if  self.file_index >= self.list_size:
                self.end_reading = True
                self.file_index = 0
                break

            in_features, lab_frame_number = io_funcs.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
            out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

            base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
            if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
                frame_number = min(lab_frame_number, out_frame_number)
            else:
                self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
                # A bare `raise` here had no active exception and would itself
                # fail with RuntimeError; raise a meaningful error instead.
                raise ValueError("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))

            if frame_number not in self.file_length_dict['framenum2utt']:
                self.file_length_dict['framenum2utt'][frame_number] = [base_file_name]
            else:
                self.file_length_dict['framenum2utt'][frame_number].append(base_file_name)

            self.file_length_dict['utt2framenum'][base_file_name] = frame_number
            self.file_length_dict['utt2index'][base_file_name] = self.file_index
            self.file_index += 1

        self.reset()
Пример #13
0
def simple_scale_variance(indir,
                          outdir,
                          var_file_dict,
                          out_dimension_dict,
                          file_id_list,
                          gv_weight=1.0):
    """Apply simple variance scaling (Silen et al. 2012, paragraph 3.1).

    For streams listed in ``streams_to_scale`` the per-utterance standard
    deviation is interpolated with the global one (weight ``gv_weight``)
    and the speech is rescaled around its utterance mean; other streams
    are copied verbatim into ``outdir``.

    Raises:
        ValueError: if gv_weight is outside [0.0, 1.0].
    """
    ## TODO: Lots of things like stream names hardcoded here; 3 for delta + delta-delta; ...
    all_streams = ['cmp', 'mgc', 'lf0', 'bap']
    streams_to_scale = ['mgc']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        # Integer division: the static dimension is later used as a slice
        # index, which must be an int (size / 3 is a float on Python 3).
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(
            var_file_dict[feature_name], 1)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Explicit validation instead of `assert`, which is stripped under -O.
    if not (0.0 <= gv_weight <= 1.0):
        raise ValueError('gv_weight must be within [0.0, 1.0]: %s' % gv_weight)
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(
                    infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std = numpy.std(speech, axis=0)

                global_std = numpy.transpose((static_variances[stream]))
                weighted_global_std = (gv_weight *
                                       global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std

                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))

                # Scale deviations from the utterance mean, keep the mean.
                scaled_speech = ((speech - utt_mean_matrix) *
                                 std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)

            else:
                os.system('cp %s %s' % (infile, outfile))
Пример #14
0
    def load_next_utterance(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

        Returns:
            tuple: ((shared_x, shared_y), temp_set_x, temp_set_y)

        Raises:
            ValueError: if label and acoustic frame counts differ by 5 or more.
        """

        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(
            self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(
            self.y_files_list[self.file_index], self.n_outs)

        # base_file_name must be computed before the debug print below;
        # previously it was only bound in the error branch, so the print
        # raised NameError on every call.
        base_file_name = os.path.basename(
            self.x_files_list[self.file_index]).split('.')[0]

        frame_number = lab_frame_number
        print(' %%%%%  {} : {} /  {}   '.format(base_file_name, self.n_ins,
                                                self.n_outs))

        if abs(
                lab_frame_number - out_frame_number
        ) < 5:  ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            if lab_frame_number > out_frame_number:
                frame_number = out_frame_number
        else:
            self.logger.critical(
                "the number of frames in label and acoustic features are different: %d vs %d (%s)"
                % (lab_frame_number, out_frame_number, base_file_name))
            # A bare `raise` here had no active exception and would itself
            # fail with RuntimeError; raise a meaningful error instead.
            raise ValueError(
                "the number of frames in label and acoustic features are different: %d vs %d (%s)"
                % (lab_frame_number, out_frame_number, base_file_name))

        temp_set_y = out_features[0:frame_number, ]
        temp_set_x = in_features[0:frame_number, ]

        self.file_index += 1

        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        # reshape input-output to (1, frames, dim) when requested
        if self.reshape_io:
            temp_set_x = numpy.reshape(temp_set_x,
                                       (1, temp_set_x.shape[0], self.n_ins))
            temp_set_y = numpy.reshape(temp_set_y,
                                       (1, temp_set_y.shape[0], self.n_outs))

            temp_set_x = numpy.array(temp_set_x, 'float32')
            temp_set_y = numpy.array(temp_set_y, 'float32')

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y
Пример #15
0
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict, out_dimension_dict, file_id_list):
    """Generate variance-scaled copies of each utterance for a range of
    global-variance interpolation weights (0.0 .. 1.0 in steps of 0.1).

    For streams in ``streams_to_scale`` the per-utterance standard
    deviation is interpolated with the global one and the speech is
    rescaled around its mean; other streams are copied verbatim.

    Returns the list of generated (extended) utterance names.
    """
    all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name,size) in list(out_dimension_dict.items()):
        # Integer division: the static dimension is used as a slice index
        # below, which must be an int (size/3 is a float on Python 3).
        static_dimension_dict[feature_name] = size//3

    io_funcs = BinaryIOCollection()
    for feature_name in list(var_file_dict.keys()):
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            local_weight = 1.0 - gv_weight
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                extended_uttname = uttname + '_gv' + str(gv_weight)
                print(extended_uttname)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std =  numpy.std(speech, axis=0)

                    global_std = numpy.transpose((static_variances[stream]))

                    weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)

                    std_ratio = weighted_global_std / utt_std

                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes,1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes,1))

                    # Scale deviations from the utterance mean, keep the mean.
                    scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)


                else:
                    os.system('cp %s %s'%(infile, outfile))
            file_id_list_out.append(extended_uttname)
    return file_id_list_out
Пример #16
0
    def load_next_utterance(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

        This variant reads up to 32 utterances per call and returns lists
        of (x, y) pairs and of the x / y matrices.

        Returns:
            tuple: (shared, temp_x, temp_y) -- parallel lists of (x, y)
            tuples, float32 input matrices and float32 output matrices.

        Raises:
            ValueError: if a label/acoustic pair differs by 5 or more frames.
        """
        shared = []
        temp_x = []
        temp_y = []

        # NOTE(review): the batch size of 32 is hard-coded here and in the
        # end-of-list check below.
        for _ in range(32):

            if self.file_index >= self.list_size:
                # Fewer than 32 utterances remained: stop early.  The
                # original code indexed past the file lists here and
                # raised IndexError before the post-loop check ran.
                break

            io_fun = BinaryIOCollection()

            in_features, lab_frame_number = io_fun.load_binary_file_frame(
                self.x_files_list[self.file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(
                self.y_files_list[self.file_index], self.n_outs)

            frame_number = lab_frame_number
            if abs(
                    lab_frame_number - out_frame_number
            ) < 5:  ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
                if lab_frame_number > out_frame_number:
                    frame_number = out_frame_number
            else:
                base_file_name = self.x_files_list[self.file_index].split(
                    '/')[-1].split('.')[0]
                self.logger.critical(
                    "the number of frames in label and acoustic features are different: %d vs %d (%s)"
                    % (lab_frame_number, out_frame_number, base_file_name))
                # A bare `raise` here had no active exception and would
                # itself fail with RuntimeError; raise a meaningful error.
                raise ValueError(
                    "the number of frames in label and acoustic features are different: %d vs %d (%s)"
                    % (lab_frame_number, out_frame_number, base_file_name))

            temp_set_y = out_features[0:frame_number, ]
            temp_set_x = in_features[0:frame_number, ]

            shared_set_x = temp_set_x
            shared_set_y = temp_set_y

            shared_set_xy = (shared_set_x, shared_set_y)
            shared.append(shared_set_xy)
            temp_x.append(numpy.asarray(shared_set_x, dtype=numpy.float32))
            temp_y.append(numpy.asarray(shared_set_y, dtype=numpy.float32))

            self.file_index += 1

        if self.file_index + 31 >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        return shared, temp_x, temp_y
Пример #17
0
def simple_scale_variance(indir, outdir, var_file_dict, out_dimension_dict, file_id_list, gv_weight=1.0):
    """Apply simple variance scaling (Silen et al. 2012, paragraph 3.1) to generated parameters.

    For each utterance and each stream in ``streams_to_scale``, the per-utterance
    standard deviation is blended with the global standard deviation using
    ``gv_weight`` and the parameters are rescaled around the utterance mean.
    All other streams are copied to ``outdir`` unchanged.

    :param indir: directory holding the input parameter files
    :param outdir: directory to write scaled/copied files to (created if missing)
    :param var_file_dict: maps stream name -> path of its global variance file
    :param out_dimension_dict: maps stream name -> full (static+delta+delta-delta) dimension
    :param file_id_list: utterance ids to process
    :param gv_weight: weight of the global std in the blend, in [0.0, 1.0]
    """
    ## TODO: Lots of things like stream names hardcoded here; 3 for delta + delta-delta; ...
    all_streams = ['cmp','mgc','lf0','bap']
    streams_to_scale = ['mgc']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name,size) in out_dimension_dict.items():
        # Integer division: the static part is the first third of the
        # static+delta+delta-delta vector, and the result is used as a slice
        # bound below ("/" would produce a float and raise TypeError on Py3).
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        # keep only the variance of the static coefficients
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std =  numpy.std(speech, axis=0)

                # blend utterance-local and global std, then rescale around the mean
                global_std = numpy.transpose((static_variances[stream]))
                weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std

                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes,1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes,1))

                scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)
            else:
                # stream not selected for scaling: pass through unchanged
                os.system('cp %s %s'%(infile, outfile))
Пример #18
0
    def compute_mean(self, file_list, start_index, end_index):
        """Return the mean (shape ``(1, end_index - start_index)``) of the
        feature columns ``[start_index, end_index)`` pooled over every file
        in ``file_list``."""
        dim = end_index - start_index

        accumulator = numpy.zeros((1, dim))
        frame_total = 0

        io_funcs = BinaryIOCollection()
        for path in file_list:
            feats, n_frames = io_funcs.load_binary_file_frame(
                path, self.feature_dimension)

            # accumulate the column sums of the requested sub-range
            accumulator += numpy.reshape(
                numpy.sum(feats[:, start_index:end_index], axis=0), (1, dim))
            frame_total += n_frames

        mean_vector = accumulator / float(frame_total)

        self.logger.info('computed mean vector of length %d :' %
                         mean_vector.shape[1])
        self.logger.info(' mean: %s' % mean_vector)

        return mean_vector
Пример #19
0
    def merge_label(self, binary_label_file_list, new_feat_file_list,
                    out_feat_file_list):
        """Append one utterance-level feature vector to every frame of the
        matching binary label file and write the merged matrix out."""
        utt_number = len(new_feat_file_list)
        if utt_number != len(binary_label_file_list):
            print(
                "the number of new feature input files and label files should be the same!\n"
            )
            sys.exit(1)

        io_funcs = BinaryIOCollection()
        for i in range(utt_number):
            lab_features, n_frames = io_funcs.load_binary_file_frame(
                binary_label_file_list[i], self.lab_dim)
            # utterance-level feature, shape (1, feat_dim)
            utt_feat = io_funcs.load_binary_file(new_feat_file_list[i],
                                                 self.feat_dim)
            # broadcast the single vector to every frame: shape (n_frames, feat_dim)
            utt_feat = numpy.tile(utt_feat, (n_frames, 1))

            merged = numpy.zeros((n_frames, self.lab_dim + self.feat_dim))
            merged[0:n_frames, 0:self.lab_dim] = lab_features
            merged[0:n_frames,
                   self.lab_dim:self.lab_dim + self.feat_dim] = utt_feat[0:n_frames, ]

            io_funcs.array_to_binary_file(merged, out_feat_file_list[i])
Пример #20
0
    def compute_std(self, file_list, mean_vector, start_index, end_index):
        """Compute the per-dimension standard deviation of the feature columns
        ``[start_index, end_index)`` pooled over every file in ``file_list``.

        :param file_list: binary feature files to scan
        :param mean_vector: mean of the same column range, shape ``(1, end_index - start_index)``
        :param start_index: first column (inclusive)
        :param end_index: last column (exclusive)
        :return: std vector of shape ``(1, end_index - start_index)``
        """
        local_feature_dimension = end_index - start_index

        # The accumulator must match the width of the requested sub-range.
        # It was previously allocated with self.feature_dimension, which
        # raised a broadcast error whenever a partial column range was used.
        std_vector = numpy.zeros((1, local_feature_dimension))
        all_frame_number = 0

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

            # accumulate squared deviations of the sub-range
            std_vector += numpy.reshape(numpy.sum((features[:, start_index:end_index] - mean_matrix) ** 2, axis=0), (1, local_feature_dimension))
            all_frame_number += current_frame_number

        std_vector /= float(all_frame_number)
        std_vector = std_vector ** 0.5

        self.logger.info('computed  std vector of length %d' % std_vector.shape[1] )
        self.logger.info('  std: %s' % std_vector)

        return  std_vector
Пример #21
0
def read_and_transform_data_from_file_list(in_file_list,
                                           dim,
                                           seq_length=200,
                                           merge_size=1):
    """Read every binary file into one buffer and cut it into fixed-length
    sequences of ``seq_length`` frames; after every ``merge_size`` utterances
    the write position is padded up to the next sequence boundary.

    Returns an array of shape ``(num_of_samples, seq_length)``.
    """
    io_funcs = BinaryIOCollection()

    total_utts = len(in_file_list)

    buffer_matrix = np.zeros((FRAME_BUFFER_SIZE, dim))

    ### read file by file ###
    write_pos = 0
    for utt_idx, file_name in enumerate(in_file_list):
        features, n_frames = io_funcs.load_binary_file_frame(file_name, dim)
        utt_id = os.path.basename(file_name).split(".")[0]

        buffer_matrix[write_pos:write_pos + n_frames, ] = features
        write_pos += n_frames

        if (utt_idx + 1) % merge_size == 0:
            # jump to the next multiple of seq_length (zero-padding the gap)
            write_pos = seq_length * int(
                np.ceil(float(write_pos) / float(seq_length)))

        drawProgressBar(utt_idx + 1, total_utts)

    sys.stdout.write("\n")

    num_of_samples = int(np.ceil(float(write_pos) / float(seq_length)))

    trimmed = buffer_matrix[0:num_of_samples * seq_length, ]
    return trimmed.reshape(num_of_samples, seq_length)
    def feature_denormalisation(self, in_file_list, out_file_list, mean_vector,
                                std_vector):
        """Invert mean/variance normalisation file by file: x * std + mean.

        :param in_file_list: normalised input files
        :param out_file_list: where to write the denormalised features (same length)
        :param mean_vector: mean with self.feature_dimension elements
        :param std_vector: std with self.feature_dimension elements
        :raises AssertionError: on mismatched list lengths or vector sizes
        """
        # 'logger' was referenced below without ever being defined, so both
        # error paths raised NameError instead of logging; define it the same
        # way the sibling normalisation methods do.
        logger = logging.getLogger('feature_normalisation')

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        try:
            assert len(in_file_list) == len(out_file_list)
        except AssertionError:
            logger.critical(
                'The input and output file numbers are not the same! %d vs %d'
                % (len(in_file_list), len(out_file_list)))
            raise

        try:
            assert mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
        except AssertionError:
            logger.critical(
                'the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature'
            )
            raise

        for i in range(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(
                in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = features * std_matrix + mean_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Пример #23
0
    def feature_normalisation(self, in_file_list, out_file_list):
        """Z-score normalise every input file and write it to the matching
        output path; mean/std are computed lazily on first use and cached on
        self.

        :return: (mean_vector, std_vector) used for normalisation
        :raises AssertionError: when the two file lists differ in length
        """
        logger = logging.getLogger('feature_normalisation')

        try:
            assert len(in_file_list) == len(out_file_list)
        except  AssertionError:
            logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
            raise

        # 'is None' instead of '== None': once the cached vector is a numpy
        # array, '== None' compares elementwise and its truth value is
        # ambiguous (ValueError).
        if self.mean_vector is None:
            self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
        if self.std_vector is None:
            self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return  self.mean_vector, self.std_vector
Пример #24
0
    def compute_std(self, file_list, mean_vector, start_index, end_index):
        """Compute the per-dimension standard deviation of the feature columns
        ``[start_index, end_index)`` pooled over every file in ``file_list``;
        the result is also cached on ``self.std_vector``.

        :param mean_vector: mean of the same column range, shape ``(1, end_index - start_index)``
        :return: std vector of shape ``(1, end_index - start_index)``
        """
        logger = logging.getLogger('feature_normalisation')

        local_feature_dimension = end_index - start_index

        # The accumulator must match the width of the requested sub-range.
        # It was previously allocated with self.feature_dimension, which
        # raised a broadcast error whenever a partial column range was used.
        std_vector = numpy.zeros((1, local_feature_dimension))
        all_frame_number = 0

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

            # accumulate squared deviations of the sub-range
            std_vector += numpy.reshape(numpy.sum((features[:, start_index:end_index] - mean_matrix) ** 2, axis=0), (1, local_feature_dimension))
            all_frame_number += current_frame_number

        std_vector /= float(all_frame_number)
        std_vector = std_vector ** 0.5

        logger.info('computed  std vector of length %d' % std_vector.shape[1] )
        logger.info('  std: %s' % std_vector)

        self.std_vector = std_vector

        return  std_vector
    def feature_normalisation(self, in_file_list, out_file_list):
        """Z-score normalise every input file and write it to the matching
        output path; mean/std are computed lazily on first use and cached on
        self.

        :return: (mean_vector, std_vector) used for normalisation
        :raises AssertionError: when the two file lists differ in length
        """
        logger = logging.getLogger('feature_normalisation')

        try:
            assert len(in_file_list) == len(out_file_list)
        except AssertionError:
            logger.critical(
                'The input and output file numbers are not the same! %d vs %d'
                % (len(in_file_list), len(out_file_list)))
            raise

        # 'is None' instead of '== None': once the cached vector is a numpy
        # array, '== None' compares elementwise and its truth value is
        # ambiguous (ValueError).
        if self.mean_vector is None:
            self.mean_vector = self.compute_mean(in_file_list, 0,
                                                 self.feature_dimension)
        if self.std_vector is None:
            self.std_vector = self.compute_std(in_file_list, self.mean_vector,
                                               0, self.feature_dimension)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(
                in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(self.mean_vector,
                                     (current_frame_number, 1))
            std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return self.mean_vector, self.std_vector
    def compute_mean(self, file_list, start_index, end_index):
        """Return the mean (shape ``(1, end_index - start_index)``) of the
        feature columns ``[start_index, end_index)`` pooled over every file
        in ``file_list``; the result is also cached on ``self.mean_vector``."""
        logger = logging.getLogger('feature_normalisation')

        dim = end_index - start_index

        accumulator = numpy.zeros((1, dim))
        frame_total = 0

        io_funcs = BinaryIOCollection()
        for path in file_list:
            feats, n_frames = io_funcs.load_binary_file_frame(
                path, self.feature_dimension)

            # accumulate the column sums of the requested sub-range
            accumulator += numpy.reshape(
                numpy.sum(feats[:, start_index:end_index], axis=0), (1, dim))
            frame_total += n_frames

        mean_vector = accumulator / float(frame_total)

        logger.info('computed mean vector of length %d :' %
                    mean_vector.shape[1])
        logger.info(' mean: %s' % mean_vector)

        self.mean_vector = mean_vector

        return mean_vector
Пример #27
0
def read_and_transform_data_from_file_list(in_file_list, dim, seq_length=200, merge_size=1):
    """Read every binary file into one buffer and cut it into fixed-length
    sequences of ``seq_length`` frames; after every ``merge_size`` utterances
    the write position is padded up to the next sequence boundary.

    Returns an array of shape ``(num_of_samples, seq_length)``.
    """
    io_funcs = BinaryIOCollection()

    total_utts = len(in_file_list)

    buffer_matrix = np.zeros((FRAME_BUFFER_SIZE, dim))

    ### read file by file ###
    write_pos = 0
    for utt_idx, file_name in enumerate(in_file_list):
        features, n_frames = io_funcs.load_binary_file_frame(file_name, dim)
        utt_id = os.path.basename(file_name).split(".")[0]

        buffer_matrix[write_pos:write_pos + n_frames, ] = features
        write_pos += n_frames

        if (utt_idx + 1) % merge_size == 0:
            # jump to the next multiple of seq_length (zero-padding the gap)
            write_pos = seq_length * int(np.ceil(float(write_pos) / float(seq_length)))

        drawProgressBar(utt_idx + 1, total_utts)

    sys.stdout.write("\n")

    num_of_samples = int(np.ceil(float(write_pos) / float(seq_length)))

    trimmed = buffer_matrix[0:num_of_samples * seq_length, ]
    return trimmed.reshape(num_of_samples, seq_length)
Пример #28
0
    def normal_standardization(self, in_file_list, out_file_list,
                               feature_dimension):
        """Z-score normalise every input file with mean/std computed over the
        whole list, writing each result to the matching output path.

        Returns the ``(mean_vector, std_vector)`` that were applied."""
        self.feature_dimension = feature_dimension

        mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
        std_vector = self.compute_std(in_file_list, mean_vector, 0,
                                      feature_dimension)

        io_funcs = BinaryIOCollection()

        for i, in_name in enumerate(in_file_list):
            features, n_frames = io_funcs.load_binary_file_frame(
                in_name, self.feature_dimension)

            # broadcast the statistics to every frame, then standardise
            mean_matrix = numpy.tile(mean_vector, (n_frames, 1))
            std_matrix = numpy.tile(std_vector, (n_frames, 1))
            normalised = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(normalised, out_file_list[i])

        return mean_vector, std_vector
Пример #29
0
    def compute_std(self, file_list, mean_vector, start_index, end_index):
        """Compute the per-dimension standard deviation of the feature columns
        ``[start_index, end_index)`` pooled over every file in ``file_list``.

        :param mean_vector: mean of the same column range, shape ``(1, end_index - start_index)``
        :return: std vector of shape ``(1, end_index - start_index)``
        """
        local_feature_dimension = end_index - start_index

        # The accumulator must match the width of the requested sub-range.
        # It was previously allocated with self.feature_dimension, which
        # raised a broadcast error whenever a partial column range was used.
        std_vector = numpy.zeros((1, local_feature_dimension))
        all_frame_number = 0

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features, current_frame_number = io_funcs.load_binary_file_frame(
                file_name, self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

            # accumulate squared deviations of the sub-range
            std_vector += numpy.reshape(
                numpy.sum(
                    (features[:, start_index:end_index] - mean_matrix)**2,
                    axis=0), (1, local_feature_dimension))
            all_frame_number += current_frame_number

        std_vector /= float(all_frame_number)

        std_vector = std_vector**0.5

        self.logger.info('computed  std vector of length %d' %
                         std_vector.shape[1])
        self.logger.info('  std: %s' % std_vector)

        return std_vector
Пример #30
0
    def compute_distortion(self, file_id_list, reference_dir, generation_dir, file_ext, feature_dim):
        """Compute an objective distortion between reference and generated features.

        The metric depends on ``file_ext``:
          - '.lf0': returns (RMSE over voiced frames, F0 correlation, V/UV error rate)
          - '.dur': returns (duration RMSE, duration correlation)
          - '.mgc': MSE skipping column 0 (presumably the energy/0th coefficient
            — TODO confirm), averaged over all frames
          - anything else: plain MSE averaged over all frames

        :param file_id_list: utterance ids; each is looked up in both directories
        :param reference_dir: directory with ground-truth feature files
        :param generation_dir: directory with generated feature files
        :param file_ext: file extension including the dot, selects the metric
        :param feature_dim: feature dimensionality of both file sets
        :raises: when any pair of files disagrees in frame count
        """
        total_voiced_frame_number = 0
        
        distortion = 0.0
        vuv_error = 0
        total_frame_number = 0

        io_funcs = BinaryIOCollection()

        # accumulated per-frame data across all files, used for corr/RMSE
        ref_all_files_data = numpy.reshape(numpy.array([]), (-1,1))
        gen_all_files_data = numpy.reshape(numpy.array([]), (-1,1))
        for file_id in file_id_list:
            ref_file_name  = reference_dir + '/' + file_id + file_ext
            gen_file_name  = generation_dir + '/' + file_id + file_ext

            ref_data, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, feature_dim)
            gen_data, gen_frame_number = io_funcs.load_binary_file_frame(gen_file_name, feature_dim)

            if ref_frame_number != gen_frame_number:
                self.logger.critical("The number of frames is not the same: %d vs %d. Error in compute_distortion.py\n." %(ref_frame_number, gen_frame_number))
                raise

            if file_ext == '.lf0':
                ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
                gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
                temp_distortion, temp_vuv_error, voiced_frame_number = self.compute_f0_mse(ref_data, gen_data)
                vuv_error += temp_vuv_error
                total_voiced_frame_number += voiced_frame_number
            elif file_ext == '.dur':
                # durations are summed per row (per phone) before comparison;
                # per-file distortion is skipped — only corpus-level RMSE/corr
                # are computed after the loop
                ref_data = numpy.reshape(numpy.sum(ref_data, axis=1), (-1, 1))
                gen_data = numpy.reshape(numpy.sum(gen_data, axis=1), (-1, 1))
                ref_all_files_data = numpy.concatenate((ref_all_files_data, ref_data), axis=0)
                gen_all_files_data = numpy.concatenate((gen_all_files_data, gen_data), axis=0)
                continue; 
            elif file_ext == '.mgc':
                # column 0 is excluded from the spectral distortion
                temp_distortion = self.compute_mse(ref_data[:, 1:feature_dim], gen_data[:, 1:feature_dim])
            else:
                temp_distortion = self.compute_mse(ref_data, gen_data)
            
            distortion += temp_distortion

            total_frame_number += ref_frame_number

        if file_ext == '.dur':
            dur_rmse = self.compute_rmse(ref_all_files_data, gen_all_files_data)
            dur_corr = self.compute_corr(ref_all_files_data, gen_all_files_data)

            return dur_rmse, dur_corr
        elif file_ext == '.lf0':
            # RMSE over voiced frames only; V/UV error rate over all frames
            distortion /= float(total_voiced_frame_number)
            vuv_error  /= float(total_frame_number)

            distortion = numpy.sqrt(distortion)
            f0_corr = self.compute_f0_corr(ref_all_files_data, gen_all_files_data)

            return  distortion, f0_corr, vuv_error
        else:
            distortion /= float(total_frame_number)

            return  distortion
Пример #31
0
    def feature_normalisation(self, in_file_list, out_file_list):
        """Z-score normalise each input file and write it to the matching
        output path, except for the last two feature columns which are
        passed through unnormalised (see note below).

        Mean/std are computed lazily on first use and cached on self.

        :param in_file_list: input feature files
        :param out_file_list: output paths, same length as in_file_list
        :return: (mean_vector, std_vector) used for normalisation
        :raises AssertionError: when the two file lists differ in length
        """
        logger = logging.getLogger('feature_normalisation')

#        self.feature_dimension = feature_dimension
        try:
            assert len(in_file_list) == len(out_file_list)
        except  AssertionError:
            logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
            raise

        if self.mean_vector is None:
            self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
        if self.std_vector  is None:
            self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix
            print(current_frame_number,in_file_list[i])
            # Restore the last two columns from the raw input: these trailing
            # features (presumably speaker/one-hot embeddings — see note
            # below; TODO confirm against the caller) must not be normalised.
            norm_features=numpy.concatenate([norm_features[:,:self.feature_dimension-2],features[:,self.feature_dimension-2:]],axis=-1)


            # in fact the problem is that I normalized the out put xvector pvector and onehotvector.... so we have to in formalized this
            print(' normalized vector :{}'.format(norm_features[1,:]))

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return  self.mean_vector, self.std_vector
def load_covariance(var_file_dict, out_dimension_dict):
    """Load each stream's variance file and return a dict mapping stream
    name to a column vector of shape ``(out_dimension_dict[name], 1)``."""
    covariance = {}
    io_funcs = BinaryIOCollection()
    for stream_name in var_file_dict.keys():
        values, dimension = io_funcs.load_binary_file_frame(var_file_dict[stream_name], 1)
        covariance[stream_name] = numpy.reshape(values, (out_dimension_dict[stream_name], 1))
    return covariance
Пример #33
0
    def load_next_utterance(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

        :return: ((shared_x, shared_y), temp_set_x, temp_set_y) — theano shared
            variables plus the raw numpy arrays for the current utterance.
        """

        # pre-allocated buffers; immediately overwritten by the slices below
        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))

        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

        # use the smaller frame count when labels/acoustics differ by < 5 frames
        frame_number = lab_frame_number
        if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            if lab_frame_number > out_frame_number:
                frame_number = out_frame_number
        else:
            base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
            self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d (%s)" %(lab_frame_number, out_frame_number, base_file_name))
            # NOTE(review): bare 'raise' with no active exception raises
            # RuntimeError/TypeError here — probably meant to raise a
            # specific exception; confirm before changing.
            raise

        temp_set_y = out_features[0:frame_number, ]
        temp_set_x = in_features[0:frame_number, ]

        self.file_index += 1

        # wrap around to the start of the list once all files have been read
        if  self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0
       
        # reshape input-output to (1, T, dim) for sequence models, if requested
        if self.reshape_io:
            temp_set_x = numpy.reshape(temp_set_x, (1, temp_set_x.shape[0], self.n_ins))
            temp_set_y = numpy.reshape(temp_set_y, (1, temp_set_y.shape[0], self.n_outs))
        
            temp_set_x = numpy.array(temp_set_x, 'float32')
            temp_set_y = numpy.array(temp_set_y, 'float32')

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y
Пример #34
0
    def load_covariance(self, var_file_dict, out_dimension_dict):
        """Load each stream's variance file into ``self.var`` as a column
        vector of shape ``(out_dimension_dict[name], 1)``."""
        io_funcs = BinaryIOCollection()
        for stream_name in list(var_file_dict.keys()):
            values, dimension = io_funcs.load_binary_file_frame(
                var_file_dict[stream_name], 1)
            self.var[stream_name] = numpy.reshape(
                values, (out_dimension_dict[stream_name], 1))
Пример #35
0
    def merge_data(self, binary_label_file_list, new_feat_file_list,
                   out_feat_file_list):
        '''
        merging new features with normalised label features

        For each utterance, load the binary label matrix (lab_dim columns)
        and the new per-frame feature matrix (feat_dim columns), place them
        side by side in a (lab_frame_number, lab_dim + feat_dim) matrix and
        write the result to the corresponding output path.
        '''
        utt_number = len(new_feat_file_list)
        if utt_number != len(binary_label_file_list):
            print(
                "the number of new feature input files and label files should be the same!\n"
            )
            sys.exit(1)

        # extension of the new features, only used for the debug message
        new_feat_ext = new_feat_file_list[0].split('/')[-1].split('.')[1]

        io_funcs = BinaryIOCollection()
        for i in range(utt_number):
            lab_file_name = binary_label_file_list[i]
            new_feat_file_name = new_feat_file_list[i]
            out_feat_file_name = out_feat_file_list[i]

            lab_features, lab_frame_number = io_funcs.load_binary_file_frame(
                lab_file_name, self.lab_dim)
            new_features, feat_frame_number = io_funcs.load_binary_file_frame(
                new_feat_file_name, self.feat_dim)

            # NOTE(review): this guard is one-sided (no abs()), so the case
            # feat_frame_number >> lab_frame_number passes through; also the
            # assignment below indexes rows by feat_frame_number on the left
            # but lab_frame_number on the right — only safe when the two
            # counts match. Confirm intended behavior before changing.
            if (lab_frame_number - feat_frame_number) > 5:
                base_file_name = new_feat_file_list[i].split('/')[-1].split(
                    '.')[0]
                self.logger.critical(
                    "the number of frames in label and new features are different: %d vs %d (%s)"
                    % (lab_frame_number, feat_frame_number, base_file_name))
                raise

            merged_features = numpy.zeros(
                (lab_frame_number, self.lab_dim + self.feat_dim))

            merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
            merged_features[0:feat_frame_number, self.lab_dim:self.lab_dim +
                            self.feat_dim] = new_features[0:lab_frame_number, ]

            io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
            self.logger.debug(
                'merged new feature %s of %d frames with %d label features' %
                (new_feat_ext, feat_frame_number, lab_frame_number))
Пример #36
0
    def load_next_utterance_S2S(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

        Sequence-to-sequence variant: in addition to input/output features it
        loads (or synthesises) a per-label duration vector.

        :return: ((shared_x, shared_y, shared_d), temp_set_x, temp_set_y, temp_set_d)
            — theano shared variables plus the raw numpy arrays.
        """

        # pre-allocated buffers; immediately overwritten by the slices below
        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))

        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(
            self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(
            self.y_files_list[self.file_index], self.n_outs)

        temp_set_x = in_features[0:lab_frame_number, ]
        temp_set_y = out_features[0:out_frame_number, ]

        # without duration files, treat the whole utterance as one duration
        if not self.dur_files_list:
            dur_frame_number = out_frame_number
            dur_features = numpy.array([dur_frame_number])
        else:
            dur_features, dur_frame_number = io_fun.load_binary_file_frame(
                self.dur_files_list[self.file_index], 1)
            # durations must account for every acoustic frame
            assert sum(dur_features) == out_frame_number

        dur_features = numpy.reshape(dur_features, (-1, ))
        temp_set_d = dur_features.astype(int)

        self.file_index += 1

        # wrap around to the start of the list once all files have been read
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')
        shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'),
                                     name='d',
                                     borrow=True)

        shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

        return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d
Пример #37
0
    def modify_dur_from_phone_alignment_labels(self, label_file_name,
                                               gen_dur_file_name,
                                               gen_lab_file_name):
        """Rewrite a phone-alignment label file using predicted durations.

        Silence labels keep their original durations; every other label gets
        its duration from the next value in ``gen_dur_file_name``. Start/end
        times are re-accumulated so the output label file stays contiguous.

        :param label_file_name: input alignment label file ("start end label" lines)
        :param gen_dur_file_name: binary file of predicted durations, one value per non-silence phone
        :param gen_lab_file_name: output label file to write
        """
        logger = logging.getLogger("dur")

        dur_dim = 1

        io_funcs = BinaryIOCollection()
        dur_features, frame_number = io_funcs.load_binary_file_frame(
            gen_dur_file_name, dur_dim)

        fid = open(label_file_name)
        utt_labels = fid.readlines()
        fid.close()

        label_number = len(utt_labels)
        logger.info('loaded %s, %3d labels' % (label_file_name, label_number))

        out_fid = open(gen_lab_file_name, 'w')

        # current_index walks through dur_features (non-silence phones only);
        # prev_end_time accumulates the running timeline of the output file
        current_index = 0
        prev_end_time = 0
        for line in utt_labels:
            line = line.strip()

            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])

            full_label = temp_list[2]

            # 1 when the label matches a silence pattern
            label_binary_flag = self.check_silence_pattern(full_label)

            if label_binary_flag == 1:
                # silence: keep the original duration, do not consume a
                # predicted duration value
                current_phone_dur = end_time - start_time
                out_fid.write(
                    str(prev_end_time) + ' ' +
                    str(prev_end_time + current_phone_dur) + ' ' + full_label +
                    '\n')
                prev_end_time = prev_end_time + current_phone_dur
                continue
            else:
                # non-silence: take the next predicted duration, expressed in
                # frames; * 5 * 10000 converts 5 ms frames to HTK 100 ns units
                # — presumably; TODO confirm frame shift against the config
                phone_dur = dur_features[current_index]
                phone_dur = int(phone_dur) * 5 * 10000
                out_fid.write(
                    str(prev_end_time) + ' ' + str(prev_end_time + phone_dur) +
                    ' ' + full_label + '\n')
                prev_end_time = prev_end_time + phone_dur

            current_index += 1

        logger.debug(
            'modifed label with predicted duration of %d frames x %d features'
            % dur_features.shape)
Пример #38
0
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Load paired input/output binary feature files into memory.

    inp_file_list / out_file_list : parallel lists of input (linguistic) and
        output (acoustic) feature file paths, one pair per utterance.
    inp_dim / out_dim : number of feature columns per frame.
    sequential_training : when True, return dicts keyed by utterance base
        name; otherwise stack all frames into two matrices (pre-allocated at
        FRAME_BUFFER_SIZE rows and trimmed to the frames actually read).

    Returns (temp_set_x, temp_set_y, file_length_dict) where
    file_length_dict maps frame counts to utterance names ('framenum2utt')
    and utterance names to frame counts ('utt2framenum').
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)

    file_length_dict = {'framenum2utt':{}, 'utt2framenum':{}}

    if sequential_training:
        temp_set_x = {}
        temp_set_y = {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))

    ### read file by file ###
    current_index = 0
    for i in range(num_of_utt):
        inp_file_name = inp_file_list[i]
        out_file_name = out_file_list[i]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        # tolerate up to 5 frames of input/output length mismatch; anything
        # larger indicates mis-paired files
        if abs(inp_frame_number-out_frame_number)>5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        else:
            frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            temp_set_x[current_index:current_index+frame_number, ] = inp_features[0:frame_number]
            temp_set_y[current_index:current_index+frame_number, ] = out_features[0:frame_number]
            current_index += frame_number

        if frame_number not in file_length_dict['framenum2utt']:
            file_length_dict['framenum2utt'][frame_number] = [base_file_name]
        else:
            file_length_dict['framenum2utt'][frame_number].append(base_file_name)

        file_length_dict['utt2framenum'][base_file_name] = frame_number

        drawProgressBar(i+1, num_of_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        # trim the pre-allocated buffers to the frames actually filled
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
Пример #39
0
def read_data_from_file_list(inp_file_list, out_file_list, inp_dim, out_dim, sequential_training=True):
    """Load paired input/output binary feature files.

    Returns (temp_set_x, temp_set_y, file_length_dict): dicts keyed by
    utterance base name when sequential_training is True, otherwise two
    stacked matrices trimmed to the number of frames actually read.
    file_length_dict maps frame counts to utterance names and back.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x, temp_set_y = {}, {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))
        temp_set_y = np.empty((FRAME_BUFFER_SIZE, out_dim))

    current_index = 0
    for utt_idx in range(num_of_utt):
        inp_file_name = inp_file_list[utt_idx]
        out_file_name = out_file_list[utt_idx]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        # a small input/output length mismatch is tolerated; anything larger
        # indicates mis-paired files
        if abs(inp_frame_number - out_frame_number) > 5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        frame_number = min(inp_frame_number, out_frame_number)

        if sequential_training:
            temp_set_x[base_file_name] = inp_features[0:frame_number]
            temp_set_y[base_file_name] = out_features[0:frame_number]
        else:
            temp_set_x[current_index:current_index + frame_number, ] = inp_features[0:frame_number]
            temp_set_y[current_index:current_index + frame_number, ] = out_features[0:frame_number]
            current_index += frame_number

        file_length_dict['framenum2utt'].setdefault(frame_number, []).append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number

        drawProgressBar(utt_idx + 1, num_of_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        # drop the unused tail of the pre-allocated buffers
        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

    return temp_set_x, temp_set_y, file_length_dict
Пример #40
0
    def load_min_max_values(self, label_norm_file):
        """Restore the saved min/max normalisation vectors.

        The norm file stores the min vector followed by the max vector as a
        single flat float sequence; each half has frame_number // 2 entries.
        Results are cached on self.min_vector / self.max_vector.
        """
        logger = logging.getLogger("acoustic_norm")

        io_funcs = BinaryIOCollection()
        min_max_vector, frame_number = io_funcs.load_binary_file_frame(label_norm_file, 1)
        flat = numpy.reshape(min_max_vector, (-1, ))
        half = frame_number // 2
        self.min_vector = flat[:half]
        self.max_vector = flat[half:]

        logger.info('Loaded min max values from the trained data for feature dimension of %d' % self.feature_dimension)
Пример #41
0
    def load_mean_std_values(self, acoustic_norm_file):
        """Restore mean/std normalisation vectors from one flat binary file.

        The file holds the mean vector followed by the std vector; each half
        contains frame_number // 2 values.  Both are cached on self and
        returned as (mean_vector, std_vector).
        """
        logger = logging.getLogger('feature_normalisation')

        io_funcs = BinaryIOCollection()
        mean_std_vector, frame_number = io_funcs.load_binary_file_frame(acoustic_norm_file, 1)
        flat = numpy.reshape(mean_std_vector, (-1, ))
        half = frame_number // 2
        self.mean_vector = flat[:half]
        self.std_vector = flat[half:]

        logger.info('Loaded mean std values from the trained data for feature dimension of %d' % self.feature_dimension)
        return self.mean_vector, self.std_vector
Пример #42
0
    def load_mean_std_values(self, acoustic_norm_file):
        """Load mean/std normalisation vectors saved as one flat binary file.

        The file stores the mean vector followed by the std vector; each half
        has frame_number // 2 entries.  Caches the result on self and returns
        (mean_vector, std_vector).
        """
        logger = logging.getLogger('feature_normalisation')

        io_funcs = BinaryIOCollection()
        mean_std_vector, frame_number = io_funcs.load_binary_file_frame(acoustic_norm_file, 1)
        mean_std_vector = numpy.reshape(mean_std_vector, (-1, ))
        # first half is the mean vector, second half the std vector
        self.mean_vector = mean_std_vector[0:frame_number//2]
        self.std_vector = mean_std_vector[frame_number//2:]

        logger.info('Loaded mean std values from the trained data for feature dimension of %d' % self.feature_dimension)
        return self.mean_vector, self.std_vector
Пример #43
0
    def compose_predict_label(self, orig_label_file, gen_label_file,
                              predict_duration_file):
        """Insert predicted state durations into a label matrix.

        Loads the original label matrix, overwrites its last 5 columns with
        the predicted per-state durations, and writes the integer result to
        gen_label_file.
        """
        io_funcs = BinaryIOCollection()
        label_matrix = io_funcs.file2matrix(orig_label_file)

        state_number = 5
        duration, in_frame_number = io_funcs.load_binary_file_frame(
            predict_duration_file, state_number)
        # every label row must line up with one row of predicted durations
        assert label_matrix.shape[0] == in_frame_number
        label_matrix[:, -5:] = duration
        io_funcs.matrix2file(label_matrix.astype(int), gen_label_file)
Пример #44
0
    def load_next_utterance_S2S(self):
        """Load the data for one utterance (sequence-to-sequence training).

        Returns
        -------
        shared_set_xyd : tuple
            Theano shared variables (x, y, d).
        temp_set_x, temp_set_y : numpy arrays
            Input and output features of the current utterance.
        temp_set_d : numpy int array
            Per-label durations, or a single-element array holding the total
            output frame count when no duration files are configured.
        """
        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

        # fix: the buffer-sized numpy.empty pre-allocations formerly made here
        # were dead stores (immediately overwritten below) and were removed
        temp_set_x = in_features[0:lab_frame_number, ]
        temp_set_y = out_features[0:out_frame_number, ]

        if not self.dur_files_list:
            # no explicit durations: treat the whole utterance as one segment
            dur_frame_number = out_frame_number
            dur_features = numpy.array([dur_frame_number])
        else:
            dur_features, dur_frame_number = io_fun.load_binary_file_frame(self.dur_files_list[self.file_index], 1)
            assert sum(dur_features) == out_frame_number

        dur_features = numpy.reshape(dur_features, (-1, ))
        temp_set_d = dur_features.astype(int)

        self.file_index += 1

        # wrap around at the end of the list and flag the end of an epoch
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')
        shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

        shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

        return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d
Пример #45
0
    def modify_dur_from_state_alignment_labels(self, label_file_name, gen_dur_file_name, gen_lab_file_name): 
        """Rewrite a state-level alignment label file using predicted durations.

        label_file_name : input HTK-style state alignment, one line per state
            with the state index appended as ``[k]``.
        gen_dur_file_name : binary matrix of predicted durations with
            self.state_number columns (frames per state), one row per phone.
        gen_lab_file_name : output label file with timings rebuilt from the
            predicted durations.

        States matched by self.check_silence_pattern keep their original
        aligned duration and do not consume a predicted row.
        """
        logger = logging.getLogger("dur")

        state_number = self.state_number
        dur_dim = state_number

        io_funcs = BinaryIOCollection()
        dur_features, frame_number = io_funcs.load_binary_file_frame(gen_dur_file_name, dur_dim)

        fid = open(label_file_name)
        utt_labels = fid.readlines()
        fid.close()

        label_number = len(utt_labels)
        logger.info('loaded %s, %3d labels' % (label_file_name, label_number) )

        # NOTE(review): out_fid is never closed here — relies on GC/interpreter
        # exit to flush; consider a context manager
        out_fid = open(gen_lab_file_name, 'w')

        current_index = 0
        prev_end_time = 0
        for line in utt_labels:
            line = line.strip()

            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])

            full_label = temp_list[2]
            full_label_length = len(full_label) - 3  # remove state information [k]
            # presumably HTS-style labels where the bracketed digit runs
            # 2..state_number+1, so after "- 1" state_index runs
            # 1..state_number — TODO confirm against the label format
            state_index = full_label[full_label_length + 1]
            state_index = int(state_index) - 1

            label_binary_flag = self.check_silence_pattern(full_label)

            if label_binary_flag == 1:
                # silence: keep the original aligned duration unchanged
                current_state_dur = end_time - start_time
                out_fid.write(str(prev_end_time)+' '+str(prev_end_time+current_state_dur)+' '+full_label+'\n')
                prev_end_time = prev_end_time+current_state_dur
                continue;
            else:
                # predicted duration is in frames; 1 frame = 5 ms = 50000
                # units of the HTK 100 ns time base; column is state_index-1
                state_dur = dur_features[current_index, state_index-1]
                state_dur = int(state_dur)*5*10000
                out_fid.write(str(prev_end_time)+' '+str(prev_end_time+state_dur)+' '+full_label+'\n')
                prev_end_time = prev_end_time+state_dur

            # advance to the next phone's duration row after its last state
            if state_index == state_number:
                current_index += 1

        logger.debug('modifed label with predicted duration of %d frames x %d features' % dur_features.shape )
    def duration_decomposition(self,
                               in_file_list,
                               dimension,
                               out_dimension_dict,
                               file_extension_dict,
                               meta=None):
        """Round predicted durations to whole frames and write one file per input.

        Exactly one output stream (the duration stream) is supported;
        durations are floored at one frame.  When *meta* is given,
        self.hardcode_duration is applied before writing.
        """
        logger = logging.getLogger('param_generation')

        logger.debug('duration_decomposition for %d files' % len(in_file_list))

        state_number = 5  ## hard coding, try removing in future?

        stream_names = list(out_dimension_dict.keys())
        if len(stream_names) > 1:
            logger.critical(
                "we don't support any additional features along with duration as of now."
            )
            sys.exit(1)
        feature_name = stream_names[0]

        io_funcs = BinaryIOCollection()

        total_files = len(in_file_list)
        for findex, file_name in enumerate(in_file_list, start=1):
            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(
                file_name, dimension)
            # round to integer frame counts, never below one frame
            gen_features = numpy.int32(numpy.round(features))
            gen_features[gen_features < 1] = 1

            if dimension > state_number:
                gen_features = gen_features[:, state_number]

            logger.info('processing %4d of %4d: %s' %
                        (findex, total_files, file_name))

            if meta is not None:
                gen_features = self.hardcode_duration(meta, gen_features)

            new_file_name = os.path.join(
                dir_name, file_id + file_extension_dict[feature_name])
            io_funcs.array_to_binary_file(gen_features, new_file_name)

            logger.debug('wrote to file %s' % new_file_name)
Пример #47
0
def read_data_from_file_list_shared(speaker_id_list, inp_file_list, out_file_list, inp_dim, out_dim):
    """Load paired input/output features and group the outputs per speaker.

    Inputs are returned as one dict keyed by utterance base name; outputs as
    a dict of dicts keyed first by speaker id (matched as a substring of the
    utterance name), then by utterance base name.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)
    num_of_spk = len(speaker_id_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    temp_set_x = {}
    temp_set_y = {}

    for utt_idx in range(num_of_utt):
        inp_file_name = inp_file_list[utt_idx]
        out_file_name = out_file_list[utt_idx]
        inp_features, inp_frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)
        out_features, out_frame_number = io_funcs.load_binary_file_frame(out_file_name, out_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        # a few frames of input/output mismatch is tolerated; more means the
        # files are mis-paired
        if abs(inp_frame_number - out_frame_number) > 5:
            print('the number of frames in input and output features are different: %d vs %d (%s)' %(inp_frame_number, out_frame_number, base_file_name))
            sys.exit(0)
        frame_number = min(inp_frame_number, out_frame_number)

        temp_set_x[base_file_name] = inp_features[0:frame_number]
        temp_set_y[base_file_name] = out_features[0:frame_number]

        file_length_dict['framenum2utt'].setdefault(frame_number, []).append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number

        drawProgressBar(utt_idx + 1, num_of_utt)

    sys.stdout.write("\n")

    set_x = temp_set_x
    set_y = {speaker: {} for speaker in speaker_id_list}
    for base_file_name in temp_set_y.keys():
        # find the speaker whose id appears in the utterance name
        speaker_ind = np.where([speaker_id in base_file_name for speaker_id in speaker_id_list])
        speaker = speaker_id_list[int(speaker_ind[0])]
        set_y[speaker][base_file_name] = temp_set_y[base_file_name]

    return set_x, set_y, file_length_dict
Пример #48
0
    def load_min_max_values(self, label_norm_file):
        """Load min/max normalisation vectors from one flat binary file.

        The file stores the min vector followed by the max vector; each half
        has frame_number // 2 entries.  Results are cached on
        self.min_vector / self.max_vector.
        """
        logger = logging.getLogger("acoustic_norm")

        io_funcs = BinaryIOCollection()
        min_max_vector, frame_number = io_funcs.load_binary_file_frame(
            label_norm_file, 1)
        min_max_vector = numpy.reshape(min_max_vector, (-1, ))
        # first half: minima, second half: maxima
        self.min_vector = min_max_vector[0:frame_number // 2]
        self.max_vector = min_max_vector[frame_number // 2:]

        logger.info(
            'Loaded min max values from the trained data for feature dimension of %d'
            % self.feature_dimension)
 def load_prev_fea(self,):
     """Load acoustic normalisation stats and the normalised test features.

     Reads self.norm_info_file as a (2, dim) float32 matrix of acoustic
     mean/std rows, loads the normalised test input from self.test_norm_path,
     splits off its last column, and replaces that column with a constant
     embedding id of 100.

     Returns (cmp_mean_vector, cmp_std_vector, test_lin_x, test_lab_x).
     """
     # load acoustic var and mean and linguistic feature
     fid = open(self.norm_info_file, 'rb')
     cmp_min_max = np.fromfile(fid, dtype=np.float32)
     fid.close()
     cmp_min_max = cmp_min_max.reshape((2, -1))
     cmp_mean_vector = cmp_min_max[0, ]
     cmp_std_vector = cmp_min_max[1, ]
     io_funcs = BinaryIOCollection()
     inp_features, frame_number = io_funcs.load_binary_file_frame(
         self.test_norm_path, self.n_in)
     # split the last column off from the linguistic input
     test_lin_x, test_lab_x = np.hsplit(inp_features, np.array([-1]))
     # set 100 as vary utterance embedding
     test_lab_x = np.tile(np.array(100), (test_lab_x.shape[0], 1))
     return cmp_mean_vector, cmp_std_vector, test_lin_x, test_lab_x
Пример #50
0
def read_data_from_file_list(in_file_list, dim): 
    """Load and stack every binary feature file in *in_file_list*.

    Each file is read as a (frames, dim) matrix; the matrices are
    concatenated along the frame axis and returned as one float64 array
    (float64 matches the dtype of the buffer the old code filled).

    Fix: the previous implementation wrote into a fixed 500000-frame buffer
    and silently relied on the corpus fitting into it; accumulating per-file
    arrays removes that hard limit.
    """
    io_funcs = BinaryIOCollection()

    feature_chunks = []
    for in_file_name in tqdm.tqdm(in_file_list):
        in_features, frame_number = io_funcs.load_binary_file_frame(in_file_name, dim)
        feature_chunks.append(in_features)

    if not feature_chunks:
        # empty file list: same (0, dim) result the old slicing produced
        return np.empty((0, dim))

    return np.concatenate(feature_chunks, axis=0).astype(np.float64, copy=False)
Пример #51
0
def load_norm_stats(stats_file, dim, method="MVN"):
    """Build a scikit-learn scaler from saved normalisation statistics.

    The stats file holds exactly two rows of *dim* values: (mean, scale) for
    "MVN" or (min, scale) for "MINMAX".

    Raises
    ------
    ValueError
        If *method* is neither "MVN" nor "MINMAX" (previously this fell
        through and raised an opaque UnboundLocalError on return).
    """
    #### load norm stats ####
    io_funcs = BinaryIOCollection()

    norm_matrix, frame_number = io_funcs.load_binary_file_frame(stats_file, dim)
    assert frame_number==2

    if method=="MVN":
        scaler = preprocessing.StandardScaler()
        scaler.mean_  = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]
    elif method=="MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99))
        scaler.min_   = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]
    else:
        raise ValueError("unsupported normalisation method: %s" % method)

    return scaler
Пример #52
0
def read_test_data_from_file_list(inp_file_list, inp_dim, sequential_training=True):
    """Load test input feature files.

    Returns a dict keyed by utterance base name when sequential_training is
    True, otherwise one stacked matrix trimmed to the frames actually read,
    together with a dict mapping frame counts to utterance names and back.
    """
    io_funcs = BinaryIOCollection()

    num_of_utt = len(inp_file_list)

    file_length_dict = {'framenum2utt': {}, 'utt2framenum': {}}

    if sequential_training:
        temp_set_x = {}
    else:
        temp_set_x = np.empty((FRAME_BUFFER_SIZE, inp_dim))

    current_index = 0
    for utt_idx in range(num_of_utt):
        inp_file_name = inp_file_list[utt_idx]
        inp_features, frame_number = io_funcs.load_binary_file_frame(inp_file_name, inp_dim)

        base_file_name = os.path.basename(inp_file_name).split(".")[0]

        if sequential_training:
            temp_set_x[base_file_name] = inp_features
        else:
            temp_set_x[current_index:current_index + frame_number, ] = inp_features[0:frame_number]
            current_index += frame_number

        file_length_dict['framenum2utt'].setdefault(frame_number, []).append(base_file_name)
        file_length_dict['utt2framenum'][base_file_name] = frame_number

        drawProgressBar(utt_idx + 1, num_of_utt)

    sys.stdout.write("\n")

    if not sequential_training:
        # drop the unused tail of the pre-allocated buffer
        temp_set_x = temp_set_x[0:current_index, ]

    return temp_set_x, file_length_dict
Пример #53
0
    def duration_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict):
        """Round predicted durations to whole frames and write one file per input.

        in_file_list : predicted duration files (float, `dimension` columns).
        dimension : number of columns per frame in the input files.
        out_dimension_dict : must contain exactly one stream (the duration
            stream); anything more is rejected with sys.exit(1).
        file_extension_dict : maps the stream name to the output extension.
        """
        logger = logging.getLogger('param_generation')

        logger.debug('duration_decomposition for %d files' % len(in_file_list) )

        state_number = 5  ## hard coding, try removing in future?

        if len(list(out_dimension_dict.keys()))>1:
            logger.critical("we don't support any additional features along with duration as of now.")
            sys.exit(1)
        else:
            feature_name = list(out_dimension_dict.keys())[0]

        io_funcs = BinaryIOCollection()

        findex=0
        flen=len(in_file_list)
        for file_name in in_file_list:

            findex=findex+1

            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)
            # round to integer frame counts and floor at one frame
            gen_features = numpy.int32(numpy.round(features))
            gen_features[gen_features<1]=1

            if dimension > state_number:
                # keep only column `state_number` — presumably the phone-level
                # value after the per-state columns; TODO confirm
                gen_features = gen_features[:, state_number]

            logger.info('processing %4d of %4d: %s' % (findex,flen,file_name) )

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])
            io_funcs.array_to_binary_file(gen_features, new_file_name)

            logger.debug('wrote to file %s' % new_file_name)
Пример #54
0
    def normal_standardization(self, in_file_list, out_file_list, feature_dimension):
        """Mean/variance-normalise every file in *in_file_list*.

        Computes the corpus mean and std over all files, writes the
        normalised features ((x - mean) / std) to the corresponding path in
        *out_file_list*, and returns (mean_vector, std_vector).
        """
        self.feature_dimension = feature_dimension

        mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
        std_vector = self.compute_std(in_file_list, mean_vector, 0, feature_dimension)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)

        # fix: xrange is Python 2 only; the rest of this module targets Python 3
        for i in range(file_number):

            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return  mean_vector, std_vector
Пример #55
0
    def feature_denormalisation(self, in_file_list, out_file_list, mean_vector, std_vector):
        """Undo mean/std normalisation (x * std + mean), file by file.

        mean_vector / std_vector must each have self.feature_dimension
        entries; in_file_list and out_file_list must be the same length.
        """
        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        try:
            assert len(in_file_list) == len(out_file_list)
        except  AssertionError:
            # NOTE(review): `logger` is not defined in this method; presumably a
            # module-level logger exists — verify, otherwise this raises NameError
            logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
            raise

        try:
            assert  mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
        except AssertionError:
            logger.critical('the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature')
            raise

        # fix: xrange is Python 2 only; the rest of this module targets Python 3
        for i in range(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = features * std_matrix + mean_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Пример #56
0
    def compute_mean(self, file_list, start_index, end_index):
        """Compute the per-column mean over [start_index, end_index) across
        all frames of every file in *file_list*.

        Files are loaded with self.feature_dimension columns; the result is a
        (1, end_index - start_index) array.
        """
        local_feature_dimension = end_index - start_index

        frame_total = 0
        accumulator = numpy.zeros((1, local_feature_dimension))

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features, current_frame_number = io_funcs.load_binary_file_frame(file_name, self.feature_dimension)

            column_sums = numpy.sum(features[:, start_index:end_index], axis=0)
            accumulator += numpy.reshape(column_sums, (1, local_feature_dimension))
            frame_total += current_frame_number

        mean_vector = accumulator / float(frame_total)

        self.logger.info('computed mean vector of length %d :' % mean_vector.shape[1] )
        self.logger.info(' mean: %s' % mean_vector)

        return  mean_vector
Пример #57
0
    def load_next_batch(self):
        """Assemble the next training batch of (x, y) frame matrices.

        Depending on self.training_algo the sequence length comes from the
        current batch (1), the current bucket (2), or the configured default
        (3).  Utterances are concatenated into a frame buffer, padded up to a
        multiple of self.seq_length, reshaped to
        (num_samples, seq_length, dims), and wrapped as shared variables.

        Returns (shared_set_xy, temp_set_x, temp_set_y).
        """
        io_funcs = BinaryIOCollection()

        ## set sequence length for batch training 
        if(self.training_algo == 1):
            # set seq length to maximum seq length from current batch
            self.set_seq_length_from_current_batch()
        elif(self.training_algo == 2):
            # set seq length to maximum seq length from current bucket
            while not self.current_bucket_size:
                self.get_next_bucket()
        elif(self.training_algo == 3):
            # seq length is set based on default/user configuration 
            pass;
            
        temp_set_x = numpy.zeros((self.buffer_size, self.n_ins))
        temp_set_y = numpy.zeros((self.buffer_size, self.n_outs))

        ### read file by file ###
        current_index = 0
        while True:
            if current_index >= self.buffer_size:
                print('buffer size reached by file index %d' %(self.file_index))
                break

            if self.training_algo == 2:
                # choose utterance from current bucket list
                base_file_name = self.current_bucket_list[self.bucket_file_index]
                self.utt_index = self.file_length_dict['utt2index'][base_file_name] 
            else: 
                # choose utterance randomly from current file list 
                #self.utt_index = numpy.random.randint(self.list_size)
                ## choose utterance in serial order
                self.utt_index = self.file_index 
                base_file_name = os.path.basename(self.x_files_list[self.utt_index]).split('.')[0]

            in_features, lab_frame_number = io_funcs.load_binary_file_frame(self.x_files_list[self.utt_index], self.n_ins)
            out_features, out_frame_number = io_funcs.load_binary_file_frame(self.y_files_list[self.utt_index], self.n_outs)
         
            frame_number = self.file_length_dict['utt2framenum'][base_file_name]

            temp_set_x[current_index:current_index+frame_number, ] = in_features
            temp_set_y[current_index:current_index+frame_number, ] = out_features
            current_index += frame_number

            # after every merge_size utterances, round the write position up
            # to the next sequence-length boundary (zero padding in between)
            if((self.file_index+1)%self.merge_size == 0):
                num_of_samples = int(numpy.ceil(float(current_index)/float(self.seq_length)))
                current_index = self.seq_length * num_of_samples
                
            self.file_index += 1
            
            # break for any of the below conditions
            if self.training_algo == 2:
                self.bucket_file_index += 1
                if(self.bucket_file_index >= self.current_bucket_size):
                    self.current_bucket_size = 0
                    break;
                if(self.bucket_file_index%self.batch_size==0):
                    break;
            else:  
                if(self.file_index%self.batch_size==0) or (self.file_index >= self.list_size):
                    break
        
        # end of epoch: flag it and rewind the file index
        if  self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0
        
        num_of_samples = int(numpy.ceil(float(current_index)/float(self.seq_length)))

        temp_set_x = temp_set_x[0: num_of_samples*self.seq_length, ]
        temp_set_y = temp_set_y[0: num_of_samples*self.seq_length, ]
        
        temp_set_x = temp_set_x.reshape(num_of_samples, self.seq_length, self.n_ins)
        temp_set_y = temp_set_y.reshape(num_of_samples, self.seq_length, self.n_outs)

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y
Пример #58
0
    def load_next_batch_S2S(self):
        """Load one merged batch for sequence-to-sequence training.

        Each entry of self.x_files_list / self.y_files_list /
        self.dur_files_list is a comma-separated group of file paths; the
        group's features are concatenated into one (x, y, d) triple.

        Returns
        -------
        shared_set_xyd : tuple of theano shared variables (x, y, d)
        temp_set_x, temp_set_y : numpy arrays of input/output frames
        temp_set_d : int array of per-label durations (or the total frame
            count when no duration files are configured)
        """
        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
        temp_set_d = numpy.empty((self.buffer_size, 1))

        io_fun = BinaryIOCollection()

        lab_start_frame_number = 0
        lab_end_frame_number   = 0

        out_start_frame_number = 0
        out_end_frame_number   = 0

        new_x_files_list = self.x_files_list[self.file_index].split(',')
        new_y_files_list = self.y_files_list[self.file_index].split(',')
        new_dur_files_list = self.dur_files_list[self.file_index].split(',')

        # fix: xrange is Python 2 only; use range for Python 3 compatibility
        for new_file_index in range(len(new_x_files_list)):
            in_features, lab_frame_number = io_fun.load_binary_file_frame(new_x_files_list[new_file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(new_y_files_list[new_file_index], self.n_outs)

            lab_end_frame_number+=lab_frame_number
            out_end_frame_number+=out_frame_number

            temp_set_x[lab_start_frame_number: lab_end_frame_number, ] = in_features[0:lab_frame_number, ]
            temp_set_y[out_start_frame_number: out_end_frame_number, ] = out_features[0:out_frame_number, ]
            if not self.dur_files_list:
                # no explicit durations: use the accumulated frame total
                dur_frame_number = out_end_frame_number
                temp_set_d = numpy.array([dur_frame_number])
            else:
                dur_features, dur_frame_number = io_fun.load_binary_file_frame(new_dur_files_list[new_file_index], 1)
                assert sum(dur_features) == out_frame_number
                temp_set_d[lab_start_frame_number: lab_end_frame_number, ] = dur_features[0:lab_frame_number, ]

            lab_start_frame_number = lab_end_frame_number
            out_start_frame_number = out_end_frame_number

        temp_set_x = temp_set_x[0:lab_end_frame_number, ]
        temp_set_y = temp_set_y[0:out_end_frame_number, ]

        temp_set_d = temp_set_d[0:lab_end_frame_number, ]
        temp_set_d = numpy.reshape(temp_set_d, (-1, ))
        temp_set_d = temp_set_d.astype(int)

        self.file_index += 1

        # wrap around at the end of the list and flag the end of an epoch
        if  self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')
        shared_set_d = theano.shared(numpy.asarray(temp_set_d, dtype='int32'), name='d', borrow=True)

        shared_set_xyd = (shared_set_x, shared_set_y, shared_set_d)

        return shared_set_xyd, temp_set_x, temp_set_y, temp_set_d
Пример #59
0
    def acoustic_decomposition(self, in_file_list, dimension, out_dimension_dict, var_file_dict=None, do_MLPG=True, cfg=None):
        """Split composed acoustic feature files into per-stream parameter files.

        For each input file, the (frames, dimension) matrix is sliced into the
        streams of out_dimension_dict; each stream in self.gen_wav_features is
        optionally smoothed with maximum likelihood parameter generation
        (MLPG), post-processed (lf0/F0 frames forced to self.inf_float where
        vuv < 0.5 or lf0 < log(20); optional silence enforcement from label
        alignments when self.enforce_silence), and written out with the
        stream's extension from file_extension_dict.
        """
        logger = logging.getLogger('param_generation')

        logger.debug('acoustic_decomposition for %d files' % len(in_file_list) )

        self.load_covariance(var_file_dict, out_dimension_dict)

        # starting column of each stream inside the composed feature matrix
        stream_start_index = {}
        dimension_index = 0
        recorded_vuv = False
        vuv_dimension = None

        for feature_name in list(out_dimension_dict.keys()):
#            if feature_name != 'vuv':
            stream_start_index[feature_name] = dimension_index
#            else:
#                vuv_dimension = dimension_index
#                recorded_vuv = True

            dimension_index += out_dimension_dict[feature_name]

        io_funcs = BinaryIOCollection()

        mlpg_algo = MLParameterGeneration()

        findex=0
        flen=len(in_file_list)
        for file_name in in_file_list:

            findex=findex+1

            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)

            logger.info('processing %4d of %4d: %s' % (findex,flen,file_name) )

            for feature_name in self.gen_wav_features:

                logger.debug(' feature: %s' % feature_name)

                current_features = features[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]
                if FAST_MLPG:
                    ### fast version wants variance per frame, not single global one:
                    var = self.var[feature_name]
                    var = numpy.transpose(numpy.tile(var,frame_number))
                else:
                    var = self.var[feature_name]

#                print  var.shape[1]
                if do_MLPG == False:
                    gen_features = current_features
                else:
                    # MLPG expects statics+deltas+delta-deltas, hence dim // 3
                    gen_features = mlpg_algo.generation(current_features, var, out_dimension_dict[feature_name]//3)
#                else:
#                    self.logger.critical("the dimensions do not match for MLPG: %d vs %d" %(var.shape[1], out_dimension_dict[feature_name]))
#                    raise

                logger.debug(' feature dimensions: %d by %d' %(gen_features.shape[0], gen_features.shape[1]))

                if feature_name in ['lf0', 'F0']:
                    if 'vuv' in stream_start_index:
                        vuv_feature = features[:, stream_start_index['vuv']:stream_start_index['vuv']+1]

                        # force frames with vuv < 0.5 or implausibly low lf0
                        # to self.inf_float
                        for i in range(frame_number):
                            if vuv_feature[i, 0] < 0.5 or gen_features[i, 0] < numpy.log(20):
                                gen_features[i, 0] = self.inf_float

                new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

                if self.enforce_silence:
                    # overwrite silence regions (from the label alignment) with
                    # inf_float for pitch-like streams, zeros otherwise
                    silence_pattern = cfg.silence_pattern
                    label_align_dir = cfg.in_label_align_dir
                    in_f = open(label_align_dir+'/'+file_id+'.lab','r')
                    for line in in_f.readlines():
                        line = line.strip()

                        if len(line) < 1:
                            continue
                        temp_list  = re.split('\s+', line)
                        # convert HTK 100 ns units to 5 ms frame indices
                        start_time = int(int(temp_list[0])*(10**-4)/5)
                        end_time   = int(int(temp_list[1])*(10**-4)/5)

                        full_label = temp_list[2]

                        label_binary_flag = self.check_silence_pattern(full_label, silence_pattern)

                        if label_binary_flag:
                            if feature_name in ['lf0', 'F0', 'mag']:
                                gen_features[start_time:end_time, :] = self.inf_float
                            else:
                                gen_features[start_time:end_time, :] = 0.0

                io_funcs.array_to_binary_file(gen_features, new_file_name)
                logger.debug(' wrote to file %s' % new_file_name)
Пример #60
0
    def load_next_partition(self):
        """Load one block of data. The number of frames will be the buffer size set during intialisation.

        Utterances are read sequentially from ``self.x_files_list`` /
        ``self.y_files_list`` until ``self.buffer_size`` frames are collected;
        an utterance that does not fit entirely is split, with the tail kept in
        ``self.remain_data_x/y`` for the next partition.

        Returns:
            tuple: ``(shared_set_xy, temp_set_x, temp_set_y)`` where
            ``shared_set_xy`` is the pair produced by ``self.make_shared`` and
            the temp arrays are the shuffled numpy matrices backing them.

        Raises:
            ValueError: if an utterance's label and acoustic feature files
                differ by 5 or more frames.
        """

        self.logger.debug('loading next partition')

        # Pre-allocate the partition buffers; they are trimmed to the number
        # of frames actually filled before being returned.
        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
        current_index = 0

        ### first check whether there are remaining data from previous utterance
        if self.remain_frame_number > 0:
            temp_set_x[current_index:self.remain_frame_number, ] = self.remain_data_x
            temp_set_y[current_index:self.remain_frame_number, ] = self.remain_data_y
            current_index += self.remain_frame_number

            self.remain_frame_number = 0

        io_fun = BinaryIOCollection()
        while current_index < self.buffer_size:
            if self.file_index >= self.list_size:
                # All files consumed: flag end of epoch and rewind for the next one.
                self.end_reading = True
                self.file_index = 0
                break

            in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

            frame_number = lab_frame_number
            if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
                if lab_frame_number > out_frame_number:
                    frame_number = out_frame_number
            else:
                base_file_name = os.path.basename(self.x_files_list[self.file_index]).split('.')[0]
                message = "the number of frames in label and acoustic features are different: %d vs %d (%s)" % (lab_frame_number, out_frame_number, base_file_name)
                self.logger.critical(message)
                # BUG FIX: the original bare `raise` had no active exception,
                # which itself raises "RuntimeError: No active exception to
                # re-raise" and masks the diagnostic. Raise an explicit,
                # informative exception instead.
                raise ValueError(message)

            out_features = out_features[0:frame_number, ]
            in_features = in_features[0:frame_number, ]

            if current_index + frame_number <= self.buffer_size:
                temp_set_x[current_index:current_index+frame_number, ] = in_features
                temp_set_y[current_index:current_index+frame_number, ] = out_features

                current_index = current_index + frame_number
            else:   ## if current utterance cannot be stored in the block, then leave the remaining part for the next block
                used_frame_number = self.buffer_size - current_index
                temp_set_x[current_index:self.buffer_size, ] = in_features[0:used_frame_number, ]
                temp_set_y[current_index:self.buffer_size, ] = out_features[0:used_frame_number, ]
                current_index = self.buffer_size

                self.remain_data_x = in_features[used_frame_number:frame_number, ]
                self.remain_data_y = out_features[used_frame_number:frame_number, ]
                self.remain_frame_number = frame_number - used_frame_number

            self.file_index += 1

        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

        # Shuffle inputs and outputs in unison: re-seeding the global RNG with
        # the same constant before each shuffle guarantees both arrays receive
        # the identical permutation, so (x, y) row pairs stay aligned.
        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_x)
        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_y)

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y