Example #1
    def normal_standardization(self, in_file_list, out_file_list,
                               feature_dimension):

        #        self.dimension_dict = dimension_dict
        self.feature_dimension = feature_dimension

        mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
        std_vector = self.compute_std(in_file_list, mean_vector, 0,
                                      feature_dimension)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)

        for i in range(file_number):

            features, current_frame_number = io_funcs.load_binary_file_frame(
                in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return mean_vector, std_vector
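A minimal usage sketch for the method above. The object name, file paths and the 187-dimensional feature size are hypothetical; the method is assumed to live on a mean/variance normalisation class such as Merlin's MeanVarianceNorm.

# Hypothetical usage of normal_standardization(); names and paths are made up.
in_files = ['data/nn_cmp/utt0001.cmp', 'data/nn_cmp/utt0002.cmp']
out_files = ['data/nn_norm/utt0001.cmp', 'data/nn_norm/utt0002.cmp']

mean_vec, std_vec = normaliser.normal_standardization(in_files, out_files,
                                                      feature_dimension=187)
# mean_vec and std_vec can later undo the normalisation, cf. the
# feature_denormalisation() method shown in a later example.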
Example #2
    def process_utterance(self, utt):
        # if utt.has_attribute("waveform"):
        # print "Utt has a natural waveform -- don't synthesise"
        # return

        if not self.trained:
            print('WARNING: Cannot apply processor %s till model is trained' %
                  (self.processor_name))
            return

        label = utt.get_filename(self.input_label_filetype)
        owave = utt.get_filename(self.output_filetype)

        streams = self.model.generate(label, variance_expansion=self.variance_expansion, \
                                      fill_unvoiced_gaps=self.fill_unvoiced_gaps)

        # TODO: save streams to binary files
        # Streams are a dictionary: {bap, lf0, mgc, vuv}
        # I can specify path via self.voice_resources['voice'] ('/home/alexander/Documents/text_to_speech/projects/Ossian_py3/voices//ice/ivona/lvl_lex_01_nn')
        directory = os.path.join(self.voice_resources.path['voice'], 'output',
                                 'cmp')
        if not os.path.exists(directory):
            os.makedirs(directory)

        io = BinaryIOCollection()
        for name, data in streams.items():
            file = os.path.join(directory,
                                utt.data.attrib['utterance_name'] + '.' + name)
            io.array_to_binary_file(data, file)
            # writelist(utt_data=data, label_file=file, uni=False)

        self.world_resynth(streams, owave)
Example #3
    def merge_data(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
        '''
        merging new features with normalised label features
        '''
        utt_number = len(new_feat_file_list)
        if utt_number != len(binary_label_file_list):
            print("the number of new feature input files and label files should be the same!\n");
            sys.exit(1)

        new_feat_ext   = new_feat_file_list[0].split('/')[-1].split('.')[1]

        io_funcs = BinaryIOCollection()
        for i in range(utt_number):
            lab_file_name = binary_label_file_list[i]
            new_feat_file_name = new_feat_file_list[i]
            out_feat_file_name = out_feat_file_list[i]

            lab_features, lab_frame_number  = io_funcs.load_binary_file_frame(lab_file_name, self.lab_dim)
            new_features, feat_frame_number = io_funcs.load_binary_file_frame(new_feat_file_name, self.feat_dim)


            if (lab_frame_number - feat_frame_number)>5:
                base_file_name = new_feat_file_list[i].split('/')[-1].split('.')[0]
                self.logger.critical("the number of frames in label and new features are different: %d vs %d (%s)" %(lab_frame_number, feat_frame_number, base_file_name))
                raise

            merged_features = numpy.zeros((lab_frame_number, self.lab_dim+self.feat_dim))

            merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
            merged_features[0:feat_frame_number, self.lab_dim:self.lab_dim+self.feat_dim] = new_features[0:lab_frame_number, ]

            io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
            self.logger.debug('merged new feature %s of %d frames with %d label features' % (new_feat_ext, feat_frame_number,lab_frame_number) )
Example #4
    def feature_normalisation(self, in_file_list, out_file_list):
        logger = logging.getLogger('feature_normalisation')

        #        self.feature_dimension = feature_dimension
        try:
            assert len(in_file_list) == len(out_file_list)
        except AssertionError:
            logger.critical(
                'The input and output file numbers are not the same! %d vs %d'
                % (len(in_file_list), len(out_file_list)))
            raise

        if self.mean_vector is None:
            self.mean_vector = self.compute_mean(in_file_list, 0,
                                                 self.feature_dimension)
        if self.std_vector is None:
            self.std_vector = self.compute_std(in_file_list, self.mean_vector,
                                               0, self.feature_dimension)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in xrange(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(
                in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(self.mean_vector,
                                     (current_frame_number, 1))
            std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return self.mean_vector, self.std_vector
Example #5
    def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
        logger = logging.getLogger("acoustic_comp")

        logger.info('making equal number of lines...')

        io_funcs = BinaryIOCollection()

        utt_number = len(in_file_list)

        for i in range(utt_number):
            in_file_name = in_file_list[i]
            in_data_stream_name = in_file_name.split('.')[-1]
            in_feature_dim = in_dimension_dict[in_data_stream_name]
            in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

            ref_file_name = ref_file_list[i]
            ref_data_stream_name = ref_file_name.split('.')[-1]
            ref_feature_dim = in_dimension_dict[ref_data_stream_name]
            ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

            target_features = numpy.zeros((ref_frame_number, in_feature_dim))
            if in_frame_number == ref_frame_number:
                continue
            elif in_frame_number > ref_frame_number:
                target_features[0:ref_frame_number, ] = in_features[0:ref_frame_number, ]
            elif in_frame_number < ref_frame_number:
                target_features[0:in_frame_number, ] = in_features[0:in_frame_number, ]
            io_funcs.array_to_binary_file(target_features, in_file_name)

        logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' %(in_data_stream_name, ref_data_stream_name))
Example #6
    def normalise_data(self, in_file_list, out_file_list):
        file_number = len(in_file_list)

        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

            frame_number = features.size // self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            target_min_matrix = numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = numpy.tile(target_max_min_diff, (frame_number, 1)) / fea_diff_matrix

            norm_features = diff_norm_matrix * (features - fea_min_matrix) + target_min_matrix

            ## If we are to keep some columns unnormalised, use advanced indexing to
            ## reinstate original values:
            m,n = numpy.shape(features)
            for col in self.exclude_columns:
                norm_features[list(range(m)),[col]*m] = features[list(range(m)),[col]*m]

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
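The mapping above sends each feature column from [min_vector, max_vector] to [target_min_value, target_max_value]; the next example applies the inverse mapping. A small self-contained check of that arithmetic in plain numpy (made-up values, ignoring the exclude_columns handling):

import numpy

x = numpy.array([[1.0, -2.0, 0.5]])          # one frame, three features
fea_min = numpy.array([0.0, -3.0, 0.0])
fea_max = numpy.array([2.0, 1.0, 1.0])
t_min, t_max = 0.01, 0.99

norm = (t_max - t_min) * (x - fea_min) / (fea_max - fea_min) + t_min
denorm = (fea_max - fea_min) * (norm - t_min) / (t_max - t_min) + fea_min
assert numpy.allclose(denorm, x)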
Example #7
    def denormalise_data(self, in_file_list, out_file_list):

        logger = logging.getLogger("acoustic_norm")

        file_number = len(in_file_list)
        logger.info('MinMaxNormalisation.denormalise_data for %d files' % file_number)

        # print   self.max_vector, self.min_vector
        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        # logger.debug('reshaping fea_max_min_diff from shape %s to (1,%d)' % (fea_max_min_diff.shape, self.feature_dimension) )

        fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

            frame_number = features.size // self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            target_min_matrix = numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = fea_diff_matrix / numpy.tile(target_max_min_diff, (frame_number, 1))
            norm_features = diff_norm_matrix * (features - target_min_matrix) + fea_min_matrix
            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #8
    def feature_denormalisation(self, in_file_list, out_file_list, mean_vector,
                                std_vector):
        logger = logging.getLogger('feature_denormalisation')
        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        try:
            assert len(in_file_list) == len(out_file_list)
        except AssertionError:
            logger.critical(
                'The input and output file numbers are not the same! %d vs %d'
                % (len(in_file_list), len(out_file_list)))
            raise

        try:
            assert mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
        except AssertionError:
            logger.critical(
                'the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature'
            )
            raise

        for i in xrange(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(
                in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = features * std_matrix + mean_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #9
    def merge_label(self, binary_label_file_list, new_feat_file_list,
                    out_feat_file_list):
        """
            merging additional label for each utterance. 
        """
        utt_number = len(new_feat_file_list)
        if utt_number != len(binary_label_file_list):
            print(
                "the number of new feature input files and label files should be the same!\n"
            )
            sys.exit(1)

        io_funcs = BinaryIOCollection()
        for i in range(utt_number):

            lab_file_name = binary_label_file_list[i]
            new_feat_file_name = new_feat_file_list[i]
            out_feat_file_name = out_feat_file_list[i]

            lab_features, lab_frame_number = io_funcs.load_binary_file_frame(
                lab_file_name, self.lab_dim)
            # shape of new feature should be (1, dim)
            new_features = io_funcs.load_binary_file(new_feat_file_name,
                                                     self.feat_dim)
            # expand shape of new feature to (T, dim)
            new_features = numpy.tile(new_features, (lab_frame_number, 1))
            merged_features = numpy.zeros(
                (lab_frame_number, self.lab_dim + self.feat_dim))

            merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
            merged_features[0:lab_frame_number, self.lab_dim:self.lab_dim +
                            self.feat_dim] = new_features[0:lab_frame_number, ]

            io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
Example #10
    def produce_nn_cmp(self, in_file_list, out_file_list):


        logger = logging.getLogger("acoustic_norm")

        delta_win = [-0.5, 0.0, 0.5]
        acc_win   = [1.0, -2.0, 1.0]
        
        file_number = len(in_file_list)
        logger.info('starting creation of %d files' % file_number)

        for i in xrange(file_number):
            
            mgc_data, bap_data, lf0_data = self.load_cmp_file(in_file_list[i])
            ip_lf0, vuv_vector = self.interpolate_f0(lf0_data)
            
            delta_lf0 = self.compute_delta(ip_lf0, delta_win)
            acc_lf0 = self.compute_delta(ip_lf0, acc_win)

            frame_number = ip_lf0.size

            cmp_data = numpy.concatenate((mgc_data, ip_lf0, delta_lf0, acc_lf0, vuv_vector, bap_data), axis=1)
            
            io_funcs = BinaryIOCollection()
            io_funcs.array_to_binary_file(cmp_data, out_file_list[i])
            
        logger.info('finished creation of %d binary files' % file_number)
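compute_delta() itself is not shown in this example; a sketch of one common way to realise such dynamic-feature windows (simple edge-padded correlation along time; not necessarily the implementation used above) could look like:

import numpy

def compute_delta_sketch(x, win):
    # x: (T, 1) column such as ip_lf0; win: e.g. [-0.5, 0.0, 0.5] (delta)
    # or [1.0, -2.0, 1.0] (acceleration). Hypothetical helper, not the
    # compute_delta() referenced above.
    half = (len(win) - 1) // 2
    padded = numpy.pad(x[:, 0], (half, half), mode='edge')
    delta = numpy.correlate(padded, numpy.asarray(win, dtype=float), mode='valid')
    return delta.reshape(-1, 1)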
Example #11
    def predict(self,
                test_x,
                out_scaler,
                gen_test_file_list,
                sequential_training=False,
                stateful=False):
        #### compute predictions ####
        io_funcs = BinaryIOCollection()

        test_file_number = len(gen_test_file_list)
        print("generating features on held-out test data...")
        for utt_index in range(test_file_number):
            gen_test_file_name = gen_test_file_list[utt_index]
            test_id = os.path.splitext(os.path.basename(gen_test_file_name))[0]
            temp_test_x = test_x[test_id]
            num_of_rows = temp_test_x.shape[0]

            if stateful:
                temp_test_x = data_utils.get_stateful_input(
                    temp_test_x, self.seq_length, self.batch_size)
            elif sequential_training:
                temp_test_x = np.reshape(temp_test_x,
                                         (1, num_of_rows, self.n_in))

            predictions = self.model.predict(temp_test_x)
            if sequential_training:
                predictions = np.reshape(predictions,
                                         (num_of_rows, self.n_out))

            data_utils.denorm_data(predictions, out_scaler)

            io_funcs.array_to_binary_file(predictions, gen_test_file_name)
            data_utils.drawProgressBar(utt_index + 1, test_file_number)

        sys.stdout.write("\n")
Example #12
    def feature_normalisation(self, in_file_list, out_file_list):
        logger = logging.getLogger('feature_normalisation')

#        self.feature_dimension = feature_dimension
        try:
            assert len(in_file_list) == len(out_file_list)
        except  AssertionError:
            logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
            raise

        if self.mean_vector is None:
            self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
        if self.std_vector  is None:
            self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix
            print(current_frame_number, in_file_list[i])
            # the last two feature columns (x-vector / p-vector / one-hot entries)
            # should stay un-normalised, so their raw values are restored here
            norm_features = numpy.concatenate(
                [norm_features[:, :self.feature_dimension - 2],
                 features[:, self.feature_dimension - 2:]], axis=-1)

            print('normalised vector: {}'.format(norm_features[1, :]))

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return  self.mean_vector, self.std_vector
Example #13
    def extract_dur_features(self, orig_file, output_file):
        io_funcs = BinaryIOCollection()
        totalMat = io_funcs.file2matrix(orig_file)
        self.label_dimension = totalMat.shape[1] - 5  # column count
        durMat = totalMat[:, -5:]

        io_funcs.array_to_binary_file(durMat, output_file)
Example #14
    def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
        #### compute predictions ####
        io_funcs = BinaryIOCollection()

        test_file_number = len(gen_test_file_list)
        print("generating features on held-out test data...")
        for utt_index in range(test_file_number):
            gen_test_file_name = gen_test_file_list[utt_index]
            test_id = os.path.splitext(os.path.basename(gen_test_file_name))[0]
            temp_test_x        = test_x[test_id]
            num_of_rows        = temp_test_x.shape[0]

            if stateful:
                temp_test_x = data_utils.get_stateful_input(temp_test_x, self.seq_length, self.batch_size)
            elif sequential_training:
                temp_test_x = np.reshape(temp_test_x, (1, num_of_rows, self.n_in))

            predictions = self.model.predict(temp_test_x)
            if sequential_training:
                predictions = np.reshape(predictions, (num_of_rows, self.n_out))

            data_utils.denorm_data(predictions, out_scaler)

            io_funcs.array_to_binary_file(predictions, gen_test_file_name)
            data_utils.drawProgressBar(utt_index+1, test_file_number)

        sys.stdout.write("\n")
Example #15
    def feature_normalisation(self, in_file_list, out_file_list):
        logger = logging.getLogger('feature_normalisation')
        
#        self.feature_dimension = feature_dimension
        try:
            assert len(in_file_list) == len(out_file_list)
        except  AssertionError:
            logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
            raise

        if self.mean_vector is None:
            self.mean_vector = self.compute_mean(in_file_list, 0, self.feature_dimension)
        if self.std_vector is None:
            self.std_vector = self.compute_std(in_file_list, self.mean_vector, 0, self.feature_dimension)
        
        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in xrange(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(self.mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(self.std_vector, (current_frame_number, 1))
            
            norm_features = (features - mean_matrix) / std_matrix
            
            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return  self.mean_vector, self.std_vector
Example #16
    def make_equal_frames(self, in_file_list, ref_file_list, in_dimension_dict):
        logger = logging.getLogger("acoustic_comp")
        
        logger.info('making equal number of lines...')
        
        io_funcs = BinaryIOCollection()

        utt_number = len(in_file_list)

        for i in xrange(utt_number):
            in_file_name = in_file_list[i]
            in_data_stream_name = in_file_name.split('.')[-1]
            in_feature_dim = in_dimension_dict[in_data_stream_name]
            in_features, in_frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)
            
            ref_file_name = ref_file_list[i]
            ref_data_stream_name = ref_file_name.split('.')[-1]
            ref_feature_dim = in_dimension_dict[ref_data_stream_name]
            ref_features, ref_frame_number = io_funcs.load_binary_file_frame(ref_file_name, ref_feature_dim)

            target_features = numpy.zeros((ref_frame_number, in_feature_dim))
            if in_frame_number == ref_frame_number:
                continue
            elif in_frame_number > ref_frame_number:
                target_features[0:ref_frame_number, ] = in_features[0:ref_frame_number, ]
            elif in_frame_number < ref_frame_number:
                target_features[0:in_frame_number, ] = in_features[0:in_frame_number, ]
            io_funcs.array_to_binary_file(target_features, in_file_name)
        
        logger.info('Finished: made equal rows in data stream %s with reference to data stream %s ' %(in_data_stream_name, ref_data_stream_name))
Example #17
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict,
                                    out_dimension_dict, file_id_list):
    ## Try range of interpolation weights for combining global & local variance
    all_streams = ['cmp', 'HNR', 'F0', 'LSF', 'Gain', 'LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        static_dimension_dict[feature_name] = size / 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(
            var_file_dict[feature_name], 1)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [
                0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0
        ]:
            local_weight = 1.0 - gv_weight
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                extended_uttname = uttname + '_gv' + str(gv_weight)
                print extended_uttname
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(
                        infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std = numpy.std(speech, axis=0)

                    global_std = numpy.transpose((static_variances[stream]))

                    weighted_global_std = (gv_weight * global_std) + (
                        local_weight * utt_std)

                    std_ratio = weighted_global_std / utt_std

                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))

                    scaled_speech = ((speech - utt_mean_matrix) *
                                     std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)

                else:
                    os.system('cp %s %s' % (infile, outfile))
            file_id_list_out.append(extended_uttname)
    return file_id_list_out
Example #18
def simple_scale_variance(indir,
                          outdir,
                          var_file_dict,
                          out_dimension_dict,
                          file_id_list,
                          gv_weight=1.0):
    ## simple variance scaling (silen et al. 2012, paragraph 3.1)
    ## TODO: Lots of things like stream names hardcoded here; 3 for delta + delta-delta; ...
    #     all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
    #     streams_to_scale = ['LSF']
    all_streams = ['cmp', 'mgc', 'lf0', 'bap']
    streams_to_scale = ['mgc']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name, size) in out_dimension_dict.items():
        static_dimension_dict[feature_name] = size / 3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(
            var_file_dict[feature_name], 1)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(
                    infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0)
                utt_std = numpy.std(speech, axis=0)

                global_std = numpy.transpose((static_variances[stream]))
                weighted_global_std = (gv_weight *
                                       global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std

                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1))

                scaled_speech = ((speech - utt_mean_matrix) *
                                 std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)

            else:
                os.system('cp %s %s' % (infile, outfile))
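The scaling above is the simple per-utterance variance expansion described in Silen et al. (2012): each static stream is shifted to zero mean, multiplied by weighted_global_std / utt_std, and shifted back. A self-contained numeric check with made-up data:

import numpy

speech = numpy.random.randn(200, 3) * 0.5 + 2.0    # made-up static features
global_std = numpy.array([1.0, 2.0, 0.8])          # made-up global std
gv_weight = 1.0                                    # 1.0 = full global variance

utt_mean = numpy.mean(speech, axis=0)
utt_std = numpy.std(speech, axis=0)
ratio = (gv_weight * global_std + (1.0 - gv_weight) * utt_std) / utt_std

scaled = (speech - utt_mean) * ratio + utt_mean
assert numpy.allclose(numpy.std(scaled, axis=0), global_std)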
Example #19
    def extract_label_features(self, orig_file, output_file):
        io_funcs = BinaryIOCollection()
        totalMat = io_funcs.file2matrix(orig_file)
        self.label_dimension = totalMat.shape[1] - 5  # column count
        labelMat = totalMat[:, :-5]
        #print orig_file, totalMat.shape, labelMat.shape

        io_funcs.array_to_binary_file(labelMat, output_file)
Example #20
def simple_scale_variance_CONTINUUM(indir, outdir, var_file_dict, out_dimension_dict, file_id_list):
    ## Try range of interpolation weights for combining global & local variance
    all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
    streams_to_scale = ['LSF']

    static_variances = {}

    static_dimension_dict = {}
    for (feature_name,size) in list(out_dimension_dict.items()):
        static_dimension_dict[feature_name] = size // 3

    io_funcs = BinaryIOCollection()
    for feature_name in list(var_file_dict.keys()):
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    file_id_list_out = []
    for uttname in file_id_list:
        for gv_weight in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            local_weight = 1.0 - gv_weight
            for stream in all_streams:
                infile = os.path.join(indir, uttname + '.' + stream)
                extended_uttname = uttname + '_gv' + str(gv_weight)
                print(extended_uttname)
                outfile = os.path.join(outdir, extended_uttname + '.' + stream)
                if not os.path.isfile(infile):
                    sys.exit(infile + ' does not exist')
                if stream in streams_to_scale:
                    speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                    utt_mean = numpy.mean(speech, axis=0)
                    utt_std =  numpy.std(speech, axis=0)

                    global_std = numpy.transpose((static_variances[stream]))

                    weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)

                    std_ratio = weighted_global_std / utt_std

                    nframes, ndim = numpy.shape(speech)
                    utt_mean_matrix = numpy.tile(utt_mean, (nframes,1))
                    std_ratio_matrix = numpy.tile(std_ratio, (nframes,1))

                    scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                    io_funcs.array_to_binary_file(scaled_speech, outfile)


                else:
                    os.system('cp %s %s'%(infile, outfile))
            file_id_list_out.append(extended_uttname)
    return file_id_list_out
Example #21
    def extract_linguistic_features(self, in_file_name, out_file_name=None, label_type="state_align", dur_file_name=None):
        if label_type=="phone_align":
            A = self.load_labels_with_phone_alignment(in_file_name, dur_file_name)
        elif label_type=="state_align":
            A = self.load_labels_with_state_alignment(in_file_name)
        else:
            logger.critical("we don't support %s labels as of now!!" % (label_type))

        if out_file_name:
            io_funcs = BinaryIOCollection()
            io_funcs.array_to_binary_file(A, out_file_name)
        else:
            return A
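A hypothetical usage sketch of the method above. File names are made up, and the object is assumed to be an instance of the label-processing class this method belongs to (e.g. Merlin's HTSLabelNormalisation).

# Write binary linguistic features straight to disk (state-aligned labels):
label_normaliser.extract_linguistic_features('lab/arctic_a0001.lab',
                                             'binary_label/arctic_a0001.lab',
                                             label_type='state_align')

# Or get the feature matrix back instead of writing a file (phone-aligned
# labels with an external duration file):
A = label_normaliser.extract_linguistic_features('lab/arctic_a0001.lab',
                                                 label_type='phone_align',
                                                 dur_file_name='dur/arctic_a0001.dur')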
Example #22
    def extract_linguistic_features(self, in_file_name, out_file_name=None, label_type="state_align", dur_file_name=None):
        if label_type=="phone_align":
            A = self.load_labels_with_phone_alignment(in_file_name, dur_file_name)
        elif label_type=="state_align":
            A = self.load_labels_with_state_alignment(in_file_name)
        else:
            logger.critical("we don't support %s labels as of now!!" % (label_type))

        if out_file_name:
            io_funcs = BinaryIOCollection()
            io_funcs.array_to_binary_file(A, out_file_name)
        else:
            return A
Example #23
def simple_scale_variance(indir, outdir, var_file_dict, out_dimension_dict, file_id_list, gv_weight=1.0):
    ## simple variance scaling (silen et al. 2012, paragraph 3.1)
    ## TODO: Lots of things like stream names hardcoded here; 3 for delta + delta-delta; ...
#     all_streams = ['cmp','HNR','F0','LSF','Gain','LSFsource']
#     streams_to_scale = ['LSF']
    all_streams = ['cmp','mgc','lf0','bap']
    streams_to_scale = ['mgc']
    
    static_variances = {}
 
    static_dimension_dict = {}
    for (feature_name,size) in out_dimension_dict.items():
        static_dimension_dict[feature_name] = size/3

    io_funcs = BinaryIOCollection()
    for feature_name in var_file_dict.keys():
        var_values, dimension = io_funcs.load_binary_file_frame(var_file_dict[feature_name], 1)
        static_var_values = var_values[:static_dimension_dict[feature_name], :]
        static_variances[feature_name] = static_var_values

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    assert gv_weight <= 1.0 and gv_weight >= 0.0
    local_weight = 1.0 - gv_weight

    for uttname in file_id_list:
        for stream in all_streams:
            infile = os.path.join(indir, uttname + '.' + stream)
            outfile = os.path.join(outdir, uttname + '.' + stream)
            if not os.path.isfile(infile):
                sys.exit(infile + ' does not exist')
            if stream in streams_to_scale:
                speech, dimension = io_funcs.load_binary_file_frame(infile, static_dimension_dict[stream])
                utt_mean = numpy.mean(speech, axis=0) 
                utt_std =  numpy.std(speech, axis=0) 

                global_std = numpy.transpose((static_variances[stream]))
                weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std)
                std_ratio = weighted_global_std / utt_std 

                nframes, ndim = numpy.shape(speech)
                utt_mean_matrix = numpy.tile(utt_mean, (nframes,1))
                std_ratio_matrix = numpy.tile(std_ratio, (nframes,1))

                scaled_speech = ((speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix
                io_funcs.array_to_binary_file(scaled_speech, outfile)


            else:
                os.system('cp %s %s'%(infile, outfile))
Example #24
    def duration_decomposition(self,
                               in_file_list,
                               dimension,
                               out_dimension_dict,
                               file_extension_dict,
                               meta=None):

        logger = logging.getLogger('param_generation')

        logger.debug('duration_decomposition for %d files' % len(in_file_list))

        state_number = 5  ## hard coding, try removing in future?

        if len(list(out_dimension_dict.keys())) > 1:
            logger.critical(
                "we don't support any additional features along with duration as of now."
            )
            sys.exit(1)
        else:
            feature_name = list(out_dimension_dict.keys())[0]

        io_funcs = BinaryIOCollection()

        findex = 0
        flen = len(in_file_list)
        for file_name in in_file_list:

            findex = findex + 1

            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(
                file_name, dimension)
            gen_features = numpy.int32(numpy.round(features))
            gen_features[gen_features < 1] = 1

            if dimension > state_number:
                gen_features = gen_features[:, state_number]

            logger.info('processing %4d of %4d: %s' %
                        (findex, flen, file_name))

            if meta is not None:
                gen_features = self.hardcode_duration(meta, gen_features)

            new_file_name = os.path.join(
                dir_name, file_id + file_extension_dict[feature_name])
            io_funcs.array_to_binary_file(gen_features, new_file_name)

            logger.debug('wrote to file %s' % new_file_name)
Example #25
    def predict(self, test_x, out_scaler, gen_test_file_list, sequential_training=False, stateful=False):
        #### compute predictions ####

        io_funcs = BinaryIOCollection()

        test_id_list = test_x.keys()
        test_id_list.sort()

        test_file_number = len(test_id_list)

        print("generating features on held-out test data...")
        with tf.Session() as sess:
           new_saver=tf.train.import_meta_graph(os.path.join(self.ckpt_dir,"mymodel.ckpt.meta"))
           print "loading the model parameters..."
           output_layer=tf.get_collection("output_layer")[0]
           input_layer=tf.get_collection("input_layer")[0]
           new_saver.restore(sess,os.path.join(self.ckpt_dir,"mymodel.ckpt"))
           print "The model parameters are successfully restored"
           for utt_index in xrange(test_file_number):
               gen_test_file_name = gen_test_file_list[utt_index]
               temp_test_x        = test_x[test_id_list[utt_index]]
               num_of_rows        = temp_test_x.shape[0]
               if not sequential_training:
                   is_training_batch=tf.get_collection("is_training_batch")[0]
                   if self.dropout_rate!=0.0:
                        is_training_drop=tf.get_collection("is_training_drop")[0]
                        y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,is_training_drop:False,is_training_batch:False})
                   else:
                        y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,is_training_batch:False})
               else:
                        temp_test_x=np.reshape(temp_test_x,[1,num_of_rows,self.n_in])
                        hybrid=0
                        utt_length_placeholder=tf.get_collection("utt_length")[0]
                        if "tanh" in self.hidden_layer_type:
                            hybrid=1
                            is_training_batch=tf.get_collection("is_training_batch")[0]
                        if self.dropout_rate!=0.0:
                           is_training_drop=tf.get_collection("is_training_drop")[0]
                           if hybrid:
                              y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows],is_training_drop:False,is_training_batch:False})
                           else:
                              y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows],is_training_drop:False})
                        elif hybrid:
                              y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows],is_training_batch:False})
                        else:
                              y_predict=sess.run(output_layer,feed_dict={input_layer:temp_test_x,utt_length_placeholder:[num_of_rows]})
               data_utils.denorm_data(y_predict, out_scaler)
               io_funcs.array_to_binary_file(y_predict, gen_test_file_name)
               data_utils.drawProgressBar(utt_index+1, test_file_number)
Example #26
    def predict(self, test_x, out_scaler, gen_test_file_list):
        #### compute predictions ####

        io_funcs = BinaryIOCollection()

        test_id_list = sorted(test_x.keys())
        inference_batch_size = len(test_id_list)
        test_file_number = len(test_id_list)
        with tf.Session(graph=self.graph) as sess:
            new_saver = tf.train.import_meta_graph(
                os.path.join(self.ckpt_dir, "mymodel.ckpt.meta"))
            targets = self.graph.get_collection("targets")[0]
            inputs_data = self.graph.get_collection("inputs_data")[0]
            decoder_outputs = self.graph.get_collection("decoder_outputs")[0]
            inputs_sequence_length = self.graph.get_collection(
                "inputs_sequence_length")[0]
            target_sequence_length = self.graph.get_collection(
                "target_sequence_length")[0]
            print("loading the model parameters...")
            new_saver.restore(sess, os.path.join(self.ckpt_dir,
                                                 "mymodel.ckpt"))
            print("Model parameters are successfully restored")
            print("generating features on held-out test data...")
            for utt_index in range(test_file_number):
                gen_test_file_name = gen_test_file_list[utt_index]
                temp_test_x = test_x[test_id_list[utt_index]]
                num_of_rows = temp_test_x.shape[0]

                # utt_length / max_step are needed below for the outputs buffer
                utt_length = [len(utt) for utt in test_x.values()]
                max_step = max(utt_length)
                temp_test_x = np.reshape(temp_test_x,
                                         [1, num_of_rows, self.n_in])

                outputs = np.zeros(shape=[len(test_x), max_step, self.n_out],
                                   dtype=np.float32)
                #dec_cell=self.graph.get_collection("decoder_cell")[0]
                print("Generating speech parameters ...")
                for t in range(num_of_rows):
                    #  outputs=sess.run(inference_output,{inputs_data:temp_test_x,inputs_sequence_length:utt_length,\
                    #            target_sequence_length:utt_length})
                    _outputs=sess.run(decoder_outputs,feed_dict={inputs_data:temp_test_x,targets:outputs,inputs_sequence_length:[num_of_rows],\
                              target_sequence_length:[num_of_rows]})
                    #   #print _outputs[:,t,:]
                    outputs[:, t, :] = _outputs[:, t, :]

                data_utils.denorm_data(outputs, out_scaler)
                io_funcs.array_to_binary_file(outputs, gen_test_file_name)
                data_utils.drawProgressBar(utt_index + 1, test_file_number)
Example #27
def compute_norm_stats(data, stats_file, method="MVN"):
    #### normalize training data ####
    io_funcs = BinaryIOCollection()

    if method=="MVN":
        scaler = preprocessing.StandardScaler().fit(data)
        norm_matrix = np.vstack((scaler.mean_, scaler.scale_))
    elif method=="MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99)).fit(data)
        norm_matrix = np.vstack((scaler.min_, scaler.scale_))
    
    print norm_matrix.shape
    io_funcs.array_to_binary_file(norm_matrix, stats_file)

    return scaler
Example #28
    def extract_dur_features(self, in_file_name, out_file_name=None, label_type="state_align", feature_type=None, unit_size=None, feat_size=None):
        logger = logging.getLogger("dur")
        if label_type=="phone_align":
            A = self.extract_dur_from_phone_alignment_labels(in_file_name, feature_type, unit_size, feat_size)
        elif label_type=="state_align":
            A = self.extract_dur_from_state_alignment_labels(in_file_name, feature_type, unit_size, feat_size)
        else:
            logger.critical("we don't support %s labels as of now!!" % (label_type))
            sys.exit(1)

        if out_file_name:
            io_funcs = BinaryIOCollection()
            io_funcs.array_to_binary_file(A, out_file_name)
        else:
            return A
Example #29
def compute_norm_stats(data, stats_file, method="MVN"):
    #### normalize training data ####
    io_funcs = BinaryIOCollection()

    if method=="MVN":
        scaler = preprocessing.StandardScaler().fit(data)
        norm_matrix = np.vstack((scaler.mean_, scaler.scale_))
    elif method=="MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99)).fit(data)
        norm_matrix = np.vstack((scaler.min_, scaler.scale_))

    print(norm_matrix.shape)
    io_funcs.array_to_binary_file(norm_matrix, stats_file)

    return scaler
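A hypothetical usage sketch of compute_norm_stats() above. Random data stands in for the stacked training frames, and it assumes the function and its BinaryIOCollection dependency are importable; the stats file name is made up.

import numpy as np

data = np.random.randn(1000, 60).astype(np.float32)
scaler = compute_norm_stats(data, 'norm_info.dat', method='MVN')

norm_data = scaler.transform(data)            # standard sklearn scaler API
restored = scaler.inverse_transform(norm_data)
assert np.allclose(restored, data, atol=1e-4)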
Example #30
    def extract_dur_features(self, in_file_name, out_file_name=None, label_type="state_align", feature_type=None, unit_size=None, feat_size=None):
        logger = logging.getLogger("dur")
        if label_type=="phone_align":
            A = self.extract_dur_from_phone_alignment_labels(in_file_name, feature_type, unit_size, feat_size)
        elif label_type=="state_align":
            A = self.extract_dur_from_state_alignment_labels(in_file_name, feature_type, unit_size, feat_size)
        else:
            logger.critical("we don't support %s labels as of now!!" % (label_type))
            sys.exit(1)

        if out_file_name:
            io_funcs = BinaryIOCollection()
            io_funcs.array_to_binary_file(A, out_file_name)
        else:
            return A
Example #31
    def normalise_data(self, in_file_list, out_file_list):
        file_number = len(in_file_list)

        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        fea_max_min_diff = numpy.reshape(fea_max_min_diff,
                                         (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in xrange(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)

            frame_number = features.size / self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            fea_max_matrix = numpy.tile(self.max_vector, (frame_number, 1))

            for m in xrange(features.shape[0]):
                for n in xrange(features.shape[1]):
                    if features[m][n] < fea_min_matrix[m][n]:
                        features[m][n] = fea_min_matrix[m][n]
                    elif features[m][n] > fea_max_matrix[m][n]:
                        features[m][n] = fea_max_matrix[m][n]

            target_min_matrix = numpy.tile(
                self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = numpy.tile(target_max_min_diff,
                                          (frame_number, 1)) / fea_diff_matrix

            norm_features = diff_norm_matrix * (
                features - fea_min_matrix) + target_min_matrix

            ## If we are to keep some columns unnormalised, use advanced indexing to
            ## reinstate original values:
            m, n = numpy.shape(features)
            for col in self.exclude_columns:
                norm_features[range(m), [col] * m] = features[range(m),
                                                              [col] * m]

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
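The nested m/n loops above clip each value into [min_vector, max_vector] before scaling; an equivalent vectorized form (a sketch only, reusing the matrices already built in the loop body) would be:

            # equivalent to the element-wise clipping loops above
            features = numpy.clip(features, fea_min_matrix, fea_max_matrix)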
Example #32
    def normal_standardization(self, in_file_list, out_file_list):
        mean_vector = self.compute_mean(in_file_list)
        std_vector = self.compute_std(in_file_list, mean_vector)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #33
    def normal_standardization(self, in_file_list, out_file_list):
        mean_vector = self.compute_mean(in_file_list)
        std_vector = self.compute_std(in_file_list, mean_vector)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = old_div((features - mean_matrix), std_matrix)

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #34
    def remove_silence(self,
                       in_data_list,
                       in_align_list,
                       out_data_list,
                       dur_file_list=None):
        file_number = len(in_data_list)
        align_file_number = len(in_align_list)

        if file_number != align_file_number:
            print "The number of input and output files does not equal!\n"
            sys.exit(1)
        if file_number != len(out_data_list):
            print "The number of input and output files does not equal!\n"
            sys.exit(1)

        io_funcs = BinaryIOCollection()
        for i in xrange(file_number):

            if self.label_type == "phone_align":
                if dur_file_list:
                    dur_file_name = dur_file_list[i]
                else:
                    dur_file_name = None
                nonsilence_indices = self.load_phone_alignment(
                    in_align_list[i], dur_file_name)
            else:
                nonsilence_indices = self.load_alignment(in_align_list[i])

            ori_cmp_data = io_funcs.load_binary_file(in_data_list[i],
                                                     self.n_cmp)

            frame_number = ori_cmp_data.size / self.n_cmp

            if len(nonsilence_indices) == frame_number:
                print 'WARNING: no silence found!'
            # previously: continue -- in fact we should keep non-silent data!

            ## if labels have a few extra frames than audio, this can break the indexing, remove them:
            nonsilence_indices = [
                ix for ix in nonsilence_indices if ix < frame_number
            ]

            new_cmp_data = ori_cmp_data[nonsilence_indices, ]

            io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
Example #35
    def merge_data(self, binary_label_file_list, new_feat_file_list,
                   out_feat_file_list):
        '''
        merging new features with normalised label features
        '''
        utt_number = len(new_feat_file_list)
        if utt_number != len(binary_label_file_list):
            print(
                "the number of new feature input files and label files should be the same!\n"
            )
            sys.exit(1)

        new_feat_ext = new_feat_file_list[0].split('/')[-1].split('.')[1]

        io_funcs = BinaryIOCollection()
        for i in range(utt_number):
            lab_file_name = binary_label_file_list[i]
            new_feat_file_name = new_feat_file_list[i]
            out_feat_file_name = out_feat_file_list[i]

            lab_features, lab_frame_number = io_funcs.load_binary_file_frame(
                lab_file_name, self.lab_dim)
            new_features, feat_frame_number = io_funcs.load_binary_file_frame(
                new_feat_file_name, self.feat_dim)

            if (lab_frame_number - feat_frame_number) > 5:
                base_file_name = new_feat_file_list[i].split('/')[-1].split(
                    '.')[0]
                self.logger.critical(
                    "the number of frames in label and new features are different: %d vs %d (%s)"
                    % (lab_frame_number, feat_frame_number, base_file_name))
                raise

            merged_features = numpy.zeros(
                (lab_frame_number, self.lab_dim + self.feat_dim))

            merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
            merged_features[0:feat_frame_number, self.lab_dim:self.lab_dim +
                            self.feat_dim] = new_features[0:lab_frame_number, ]

            io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
            self.logger.debug(
                'merged new feature %s of %d frames with %d label features' %
                (new_feat_ext, feat_frame_number, lab_frame_number))
Example #36
    def predict(self, test_x, out_scaler, gen_test_file_list):
        #### compute predictions ####

        io_funcs = BinaryIOCollection()

        test_id_list = test_x.keys()
        test_id_list.sort()
        inference_batch_size = len(test_id_list)
        test_file_number = len(test_id_list)
        with tf.Session(graph=self.graph) as sess:
            new_saver = tf.train.import_meta_graph(self.ckpt_dir, "mymodel.ckpt.meta")
            """Notice change targets=tf.get_collection("targets")[0]"""
            inputs_data = self.graph.get_collection("inputs_data")[0]
            """Notice Change decoder_outputs=tf.get_collection("decoder_outputs")[0]"""
            inputs_sequence_length = self.graph.get_collection("inputs_sequence_length")[0]
            target_sequence_length = self.graph.get_collection("target_sequence_length")[0]
            print "loading the model parameters..."
            new_saver.restore(sess, os.path.join(self.ckpt_dir, "mymodel.ckpt"))
            print "Model parameters are successfully restored"
            print("generating features on held-out test data...")
            for utt_index in xrange(test_file_number):
                gen_test_file_name = gen_test_file_list[utt_index]
                temp_test_x = test_x[test_id_list[utt_index]]
                num_of_rows = temp_test_x.shape[0]

                #utt_length=[len(utt) for utt in test_x.values()]
                #max_step=max(utt_length)
                temp_test_x = tf.reshape(temp_test_x, [1, num_of_rows, self.n_in])

                outputs = np.zeros(shape=[len(test_x), max_step, self.n_out], dtype=np.float32)
                #dec_cell=self.graph.get_collection("decoder_cell")[0]
                print "Generating speech parameters ..."
                for t in range(num_of_rows):
                    #  outputs=sess.run(inference_output,{inputs_data:temp_test_x,inputs_sequence_length:utt_length,\
                    #            target_sequence_length:utt_length})
                    _outputs = sess.run(decoder_outputs,
                                        feed_dict={inputs_data: temp_test_x, targets: outputs,
                                                   inputs_sequence_length: [num_of_rows],
                                                   target_sequence_length: [num_of_rows]})
                    #   #print _outputs[:,t,:]
                    outputs[:, t, :] = _outputs[:, t, :]

                data_utils.denorm_data(outputs, out_scaler)
                io_funcs.array_to_binary_file(outputs, gen_test_file_name)
                data_utils.drawProgressBar(utt_index + 1, test_file_number)
Example #37
    def denormalise_data(self, in_file_list, out_file_list):

        logger = logging.getLogger("acoustic_norm")

        file_number = len(in_file_list)
        logger.info('MinMaxNormalisation.denormalise_data for %d files' %
                    file_number)

        # print   self.max_vector, self.min_vector
        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        # logger.debug('reshaping fea_max_min_diff from shape %s to (1,%d)' % (fea_max_min_diff.shape, self.feature_dimension) )

        fea_max_min_diff = numpy.reshape(fea_max_min_diff,
                                         (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)

            frame_number = features.size // self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            target_min_matrix = numpy.tile(
                self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = old_div(
                fea_diff_matrix,
                numpy.tile(target_max_min_diff, (frame_number, 1)))
            norm_features = diff_norm_matrix * (
                features - target_min_matrix) + fea_min_matrix
            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #38
    def duration_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict):

        logger = logging.getLogger('param_generation')

        logger.debug('duration_decomposition for %d files' % len(in_file_list) )

        state_number = 5  ## hard coding, try removing in future?

        if len(list(out_dimension_dict.keys()))>1:
            logger.critical("we don't support any additional features along with duration as of now.")
            sys.exit(1)
        else:
            feature_name = list(out_dimension_dict.keys())[0]

        io_funcs = BinaryIOCollection()

        findex=0
        flen=len(in_file_list)
        for file_name in in_file_list:

            findex=findex+1

            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)
            gen_features = numpy.int32(numpy.round(features))
            gen_features[gen_features<1]=1

            if dimension > state_number:
                gen_features = gen_features[:, state_number]

            logger.info('processing %4d of %4d: %s' % (findex,flen,file_name) )

            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])
            io_funcs.array_to_binary_file(gen_features, new_file_name)

            logger.debug('wrote to file %s' % new_file_name)
Пример #39
0
    def normal_standardization(self, in_file_list, out_file_list, feature_dimension):
    
#        self.dimension_dict = dimension_dict
        self.feature_dimension = feature_dimension

        mean_vector = self.compute_mean(in_file_list, 0, feature_dimension)
        std_vector = self.compute_std(in_file_list, mean_vector, 0, feature_dimension)
                
        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
 
        for i in range(file_number):
                    
            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))
            
            norm_features = (features - mean_matrix) / std_matrix
            
            io_funcs.array_to_binary_file(norm_features, out_file_list[i])

        return  mean_vector, std_vector
Пример #40
0
def compute_norm_stats(data, stats_file, method="MVN", no_scaling_ind=()):
    #### normalize training data ####
    io_funcs = BinaryIOCollection()

    if method=="MVN":
        scaler = preprocessing.StandardScaler(copy=False).fit(data)
        if no_scaling_ind:
            scaler.mean_[no_scaling_ind] = 0
            scaler.scale_[no_scaling_ind] = 1
        norm_matrix = np.vstack((scaler.mean_, scaler.scale_))
    elif method=="MINMAX":
        # TODO: this seems strange, if this is used, check the documentation:
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
        scaler = preprocessing.MinMaxScaler(copy=False, feature_range=(0.01, 0.99)).fit(data)
        norm_matrix = np.vstack((scaler.min_, scaler.scale_))

    print(norm_matrix.shape)
    io_funcs.array_to_binary_file(norm_matrix, stats_file)

    # TODO: consider saving this as a text file or pickle instead; for now also dump a CSV copy for inspection
    np.savetxt(stats_file + ".csv", norm_matrix, delimiter=",", fmt='%.2f', newline='\n')

    return scaler
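The stats file written above stacks the mean (or min) row on top of the scale row, stored as float32 like the other binary files in these snippets. A sketch of how a consumer might reload it and normalise new data for the MVN case; the helper name and path are hypothetical:

import numpy as np

def load_norm_stats(stats_file, dim):
    # two rows of float32: mean and scale for the MVN case
    stats = np.fromfile(stats_file, dtype=np.float32).reshape(2, dim)
    return stats[0], stats[1]

# mean, scale = load_norm_stats('norm_info.dat', dim=60)   # hypothetical file and dimension
# normalised = (new_data - mean) / scale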
Пример #41
0
    def remove_silence(self, in_data_list, in_align_list, out_data_list, dur_file_list=None):
        file_number = len(in_data_list)
        align_file_number = len(in_align_list)

        if file_number != align_file_number:
            print("The number of input and alignment files is not the same!\n")
            sys.exit(1)
        if file_number != len(out_data_list):
            print("The number of input and output files is not the same!\n")
            sys.exit(1)

        io_funcs = BinaryIOCollection()
        for i in range(file_number):

            if self.label_type=="phone_align":
                if dur_file_list:
                    dur_file_name = dur_file_list[i]
                else:
                    dur_file_name = None
                nonsilence_indices = self.load_phone_alignment(in_align_list[i], dur_file_name)
            else:
                nonsilence_indices = self.load_alignment(in_align_list[i])

            ori_cmp_data = io_funcs.load_binary_file(in_data_list[i], self.n_cmp)
            
            frame_number = ori_cmp_data.size // self.n_cmp

            if len(nonsilence_indices) == frame_number:
                print('WARNING: no silence found!')
                # previously: continue -- in fact we should keep non-silent data!

            ## if labels have a few extra frames than audio, this can break the indexing, remove them:
            nonsilence_indices = [ix for ix in nonsilence_indices if ix < frame_number]

            new_cmp_data = ori_cmp_data[nonsilence_indices,]

            io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
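At its core remove_silence is plain advanced integer indexing: frames whose indices appear in the non-silence list are kept and everything else is dropped. A toy illustration of just that step:

import numpy as np

cmp_data = np.arange(5 * 3, dtype=np.float32).reshape(5, 3)   # 5 frames, 3 dims
nonsilence_indices = [1, 2, 4]                                # frames 0 and 3 are silence
kept = cmp_data[nonsilence_indices, :]
assert kept.shape == (3, 3)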
Пример #42
0
    def feature_denormalisation(self, in_file_list, out_file_list, mean_vector, std_vector):
        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        try:
            assert len(in_file_list) == len(out_file_list)
        except  AssertionError:
            logger.critical('The input and output file numbers are not the same! %d vs %d' %(len(in_file_list), len(out_file_list)))
            raise

        try:
            assert  mean_vector.size == self.feature_dimension and std_vector.size == self.feature_dimension
        except AssertionError:
            logger.critical('the dimensionalities of the mean and standard derivation vectors are not the same as the dimensionality of the feature')
            raise

        for i in range(file_number):
            features, current_frame_number = io_funcs.load_binary_file_frame(in_file_list[i], self.feature_dimension)

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = features * std_matrix + mean_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Пример #43
0
def generate_wav(gen_dir, file_id_list, cfg):
        
    logger = logging.getLogger("wav_generation")
    
    SPTK     = cfg.SPTK
#    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD    = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha=='Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha=='ERB':
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError('cfg.fw_alpha='+cfg.fw_alpha+' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file  = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file  = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter=1
    max_counter = len(file_id_list)

    for filename in file_id_list:

        logger.info('creating waveform for %4d of %4d: %s' % (counter,max_counter,filename) )
        counter=counter+1
        base   = filename
        files = {'sp'  : os.path.join(gen_dir, base + cfg.sp_ext),
                 'mgc' : os.path.join(gen_dir, base + cfg.mgc_ext),
                 'f0'  : os.path.join(gen_dir, base + '.f0'),
                 'lf0' : os.path.join(gen_dir, base + cfg.lf0_ext),
                 'ap'  : os.path.join(gen_dir, base + '.ap'),
                 'bap' : os.path.join(gen_dir, base + cfg.bap_ext),
                 'wav' : os.path.join(gen_dir, base + '.wav')}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']
        
        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "

            run_process('{line} >{weighttxt}'
                        .format(line=line, weighttxt=os.path.join(gen_dir, 'weight.txt')))
            run_process('{x2x} +af < {weighttxt} > {weight}'
                        .format(x2x=SPTK['X2X'], weighttxt=os.path.join(gen_dir, 'weight.txt'), weight=os.path.join(gen_dir, 'weight.bin')))

            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} > {temp1}'
                        .format(freqt=SPTK['FREQT'], order=cfg.mgc_dim-1, fw=fw_coef, co=co_coef, mgc=files['mgc'], temp1=files['mgc']+'_r0temp1'))
            run_process('{c2acr} -m {co} -M 0 -l {fl} <{temp1} > {base_r0}'
                        .format(co=co_coef, c2acr=SPTK['C2ACR'], fl=fl_coef, base_r0=files['mgc']+'_r0', temp1=files['mgc']+'_r0temp1'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp2}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight.bin'),
                                temp2=files['mgc']+'_mgctemp2'))
            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {temp2} > {temp3}'
                        .format(order=cfg.mgc_dim-1, freqt=SPTK['FREQT'], fw=fw_coef, co=co_coef,
                                temp2=files['mgc'] + '_mgctemp2', temp3=files['mgc']+'_mgctemp3'))
            run_process('{c2acr} -m {co} -M 0 -l {fl} < {temp3} > {base_p_r0}'
                        .format(temp3=files['mgc']+'_mgctemp3', co=co_coef,
                                c2acr=SPTK['C2ACR'], fl=fl_coef, base_p_r0=files['mgc']+'_p_r0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp4}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight.bin'),
                                temp4=files['mgc'] + '_mgctemp4'))
            run_process('{mc2b} -m {order} -a {fw} < {temp4} > {temp5}'
                        .format(order=cfg.mgc_dim-1, mc2b=SPTK['MC2B'], fw=fw_coef,
                                temp4=files['mgc'] + '_mgctemp4', temp5=files['mgc'] + '_mgctemp5'))
            run_process('{bcp} -n {order} -s 0 -e 0 < {temp5} > {base_b0}'
                        .format(order=cfg.mgc_dim-1, bcp=SPTK['BCP'], base_b0=files['mgc']+'_b0', temp5=files['mgc'] + '_mgctemp5'))



            run_process('{vopr} -d < {base_r0} {base_p_r0} > {temp6}'
                        .format(vopr=SPTK['VOPR'], base_r0=files['mgc']+'_r0', base_p_r0=files['mgc']+'_p_r0', temp6=files['mgc']+'_mgctemp6'))
            run_process('{sopr} -LN -d 2 < {temp6} > {temp7}'
                        .format(sopr=SPTK['SOPR'], temp6=files['mgc'] + '_mgctemp6', temp7=files['mgc'] + '_mgctemp7'))
            run_process('{vopr} -a {base_b0} < {temp7} > {base_p_b0}'
                        .format(vopr=SPTK['VOPR'], temp7=files['mgc'] + '_mgctemp7',
                                base_b0=files['mgc']+'_b0', base_p_b0=files['mgc']+'_p_b0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} > {temp8}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight.bin'),
                                temp8=files['mgc'] + '_mgctemp8'))
            run_process('{mc2b} -m {order} -a {fw} < {temp8} > {temp9}'
                        .format(order=cfg.mgc_dim-1, mc2b=SPTK['MC2B'],  fw=fw_coef,
                                temp8=files['mgc'] + '_mgctemp8', temp9=files['mgc'] + '_mgctemp9'))
            run_process('{bcp} -n {order} -s 1 -e {order} < {temp9} > {temp10}'
                        .format(order=cfg.mgc_dim-1, bcp=SPTK['BCP'],
                                temp9=files['mgc'] + '_mgctemp9', temp10=files['mgc'] + '_mgctemp10'))
            run_process('{merge} -n {order2} -s 0 -N 0 {base_p_b0} < {temp10} > {temp11}'
                        .format(merge=SPTK['MERGE'], order2=cfg.mgc_dim-2, base_p_b0=files['mgc']+'_p_b0',
                                temp10=files['mgc'] + '_mgctemp10', temp11=files['mgc'] + '_mgctemp11'))
            run_process('{b2mc} -m {order} -a {fw} < {temp11} > {base_p_mgc}'
                        .format(order=cfg.mgc_dim-1, fw=fw_coef, b2mc=SPTK['B2MC'], base_p_mgc=files['mgc']+'_p_mgc', temp11=files['mgc'] + '_mgctemp11'))

            mgc_file_name = files['mgc']+'_p_mgc'
            
        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)

            gen_mu  = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))
   
            local_gv = (ref_gv_std/gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean

            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(gen_std, frame_number, 1).T * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T) + np.repeat(gen_mu, frame_number, 1).T
            
            new_mgc_file_name = files['mgc']+'_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name) 
            
            mgc_file_name = files['mgc']+'_p_mgc'
        
        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Post-filtering and GV enhancement cannot be applied together!\n')
            sys.exit(1)

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))

            if cfg.use_cep_ap:
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim-1, fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}' 
                             .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))

            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 0.0 -cornf 400 -float -apfile {ap} {f0a} {sp} {wav}'
                        .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift, ap=files['ap'], f0a=files['f0']+'.a', sp=files['sp'], wav=files['wav']))

            run_process('rm -f {sp} {f0} {f0a} {ap}'
                        .format(sp=files['sp'],f0=files['f0'],f0a=files['f0']+'.a',ap=files['ap']))
        elif cfg.vocoder_type == 'WORLD':        

            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {temp12}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], temp12=files['f0'] + '_temp12'))
            run_process('{x2x} +fd < {temp12} > {f0}'.format(x2x=SPTK['X2X'], f0=files['f0'], temp12=files['f0'] + '_temp12'))

            run_process('{sopr} -c 0 {bap} > {temp13}'.format(sopr=SPTK['SOPR'],bap=files['bap'],temp13=files['ap'] + '_temp13'))
            run_process('{x2x} +fd < {temp13} > {ap}'.format(x2x=SPTK['X2X'],ap=files['ap'], temp13=files['ap'] + '_temp13'))

            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {temp14}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, temp14=files['sp'] + '_temp14'))
            run_process('{sopr} -d 32768.0 -P < {temp14} > {temp15}'
                        .format(sopr=SPTK['SOPR'], temp14=files['sp'] + '_temp14', temp15=files['sp'] + '_temp15'))
            run_process('{x2x} +fd < {temp15} > {sp}'
                        .format(x2x=SPTK['X2X'], sp=files['sp'], temp15=files['sp'] + '_temp15'))

            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'
                         .format(synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr, f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))
            
            #run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],sp=files['sp'],f0=files['f0']))

        else:
        
            logger.critical('The vocoder %s is not supported yet!\n' % cfg.vocoder_type )
            sys.exit(1)
        
        os.chdir(cur_dir)
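The SOPR -magic -1.0E+10 -EXP -MAGIC 0.0 step above turns interpolated log-F0 back into linear F0 while mapping the unvoiced magic value to 0 Hz. A NumPy equivalent of just that conversion, assuming -1.0e+10 marks unvoiced frames as in dnn_generation further below:

import numpy as np

lf0 = np.array([5.3, -1.0e+10, 5.1, -1.0e+10], dtype=np.float32)
f0 = np.where(lf0 == -1.0e+10, 0.0, np.exp(lf0))
# voiced frames become exp(lf0) Hz, unvoiced frames 0 Hz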
Пример #44
0
    def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict, out_dimension_dict):

        logger = logging.getLogger("acoustic_comp")

        stream_start_index = {}
        stream_dim_index = 0
        for stream_name in out_dimension_dict.keys():
            if stream_name not in stream_start_index:
                stream_start_index[stream_name] = stream_dim_index

            stream_dim_index += out_dimension_dict[stream_name]
                
        io_funcs = BinaryIOCollection()

        for i in range(self.file_number):
            out_file_name = out_file_list[i]

            #if os.path.isfile(out_file_name):
            #    logger.info('processing file %4d of %4d : %s exists' % (i+1, self.file_number, out_file_name))
            #    continue

            logger.info('processing file %4d of %4d : %s' % (i+1,self.file_number,out_file_name))

            out_data_matrix = None
            out_frame_number = 0


            for k in range(self.data_stream_number):
                data_stream_name = self.data_stream_list[k]

                in_file_name = in_file_list_dict[data_stream_name][i]

                in_feature_dim = in_dimension_dict[data_stream_name]
                features, frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)

                if k == 0:
                    out_frame_number = frame_number
                    out_data_matrix = numpy.zeros((out_frame_number, self.out_dimension))

                if frame_number > out_frame_number:
                    features = features[0:out_frame_number, ]
                    frame_number = out_frame_number
                
                try:
                    assert  out_frame_number == frame_number
                except AssertionError:
                    logger.critical('the frame number of data stream %s is not consistent with others: current %d others %d' 
                                         %(data_stream_name, out_frame_number, frame_number))
                    raise

                dim_index = stream_start_index[data_stream_name]

                if data_stream_name in ['lf0', 'F0']:   ## F0 added for GlottHMM
                    features, vuv_vector = self.interpolate_f0(features)

                    ### if vuv information to be recorded, store it in corresponding column
                    if self.record_vuv:
                        out_data_matrix[0:out_frame_number, stream_start_index['vuv']:stream_start_index['vuv']+1] = vuv_vector

                out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = features
                dim_index = dim_index+in_feature_dim

                if self.compute_dynamic[data_stream_name]: 

                    delta_features = self.compute_dynamic_matrix(features, self.delta_win, frame_number, in_feature_dim)
                    acc_features   = self.compute_dynamic_matrix(features, self.acc_win, frame_number, in_feature_dim)


                    out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = delta_features
                    dim_index = dim_index+in_feature_dim

                    out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = acc_features
            
            ### write data to file
            io_funcs.array_to_binary_file(out_data_matrix, out_file_name)
            logger.debug(' wrote %d frames of features',out_frame_number )
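compute_dynamic_matrix itself is not included in these snippets, but delta and acceleration features of this kind are normally a weighted sum over a small window of neighbouring frames. A minimal sketch of that idea; the window values are typical choices, not necessarily the ones stored in self.delta_win / self.acc_win:

import numpy as np

def dynamic_matrix_sketch(features, win):
    # weighted sum over neighbouring frames, borders padded by repeating the edge frame
    frame_number, dim = features.shape
    half = len(win) // 2
    padded = np.vstack([features[:1]] * half + [features] + [features[-1:]] * half)
    out = np.zeros_like(features)
    for offset, w in zip(range(-half, half + 1), win):
        out += w * padded[half + offset: half + offset + frame_number]
    return out

feats = np.random.randn(10, 4).astype(np.float32)
delta = dynamic_matrix_sketch(feats, [-0.5, 0.0, 0.5])   # typical delta window
acc   = dynamic_matrix_sketch(feats, [1.0, -2.0, 1.0])   # typical acceleration window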
Пример #45
0
def wavgen_straight_type_vocoder(gen_dir, file_id_list, cfg, logger):
    '''
    Waveform generation with STRAIGHT or WORLD vocoders.
    (whose acoustic parameters are: mgc, bap, and lf0)
    '''

    SPTK = cfg.SPTK
    #    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha == 'Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha == 'ERB':
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError(
                'cfg.fw_alpha=' + cfg.fw_alpha +
                ' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.'
            )
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(
            ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(
            gen_gv_std_file, 1)

    counter = 1
    max_counter = len(file_id_list)

    for filename in file_id_list:

        logger.info('creating waveform for %4d of %4d: %s' %
                    (counter, max_counter, filename))
        counter = counter + 1
        base = filename
        files = {
            'sp': base + cfg.sp_ext,
            'mgc': base + cfg.mgc_ext,
            'f0': base + '.f0',
            'lf0': base + cfg.lf0_ext,
            'ap': base + '.ap',
            'bap': base + cfg.bap_ext,
            'wav': base + '.wav'
        }

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:

            mgc_file_name = files['mgc'] + '_p_mgc'
            post_filter(files['mgc'], mgc_file_name, cfg.mgc_dim, pf_coef,
                        fw_coef, co_coef, fl_coef, gen_dir, cfg)

        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(
                mgc_file_name, cfg.mgc_dim)

            gen_mu = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

            local_gv = (ref_gv_std / gen_gv_std) * (gen_std -
                                                    gen_gv_mean) + ref_gv_mean

            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(
                gen_std, frame_number, 1).T * (gen_mgc - np.repeat(
                    gen_mu, frame_number, 1).T) + np.repeat(
                        gen_mu, frame_number, 1).T

            new_mgc_file_name = files['mgc'] + '_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)

            mgc_file_name = files['mgc'] + '_p_mgc'

        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical(
                'Post-filtering and GV enhancement cannot be applied together!\n')
            sys.exit(1)

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process(
                '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                .format(mgc2sp=SPTK['MGC2SP'],
                        alpha=cfg.fw_alpha,
                        order=cfg.mgc_dim - 1,
                        fl=cfg.fl,
                        mgc=mgc_file_name,
                        sp=files['sp']))
            run_process(
                '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(
                    sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'],
                                                        f0=files['f0'],
                                                        f0a=files['f0'] +
                                                        '.a'))

            if cfg.use_cep_ap:
                run_process(
                    '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                    .format(mgc2sp=SPTK['MGC2SP'],
                            alpha=cfg.fw_alpha,
                            order=cfg.bap_dim - 1,
                            fl=cfg.fl,
                            bap=files['bap'],
                            ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'.format(
                    bndap2ap=STRAIGHT['BNDAP2AP'],
                    bap=files['bap'],
                    ap=files['ap']))

            run_process(
                '{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 1.2 -cornf 4000 -float -apfile {ap} {f0a} {sp} {wav}'
                .format(synfft=STRAIGHT['SYNTHESIS_FFT'],
                        sr=cfg.sr,
                        fl=cfg.fl,
                        shift=cfg.shift,
                        ap=files['ap'],
                        f0a=files['f0'] + '.a',
                        sp=files['sp'],
                        wav=files['wav']))

            run_process('rm -f {sp} {f0} {f0a} {ap}'.format(sp=files['sp'],
                                                            f0=files['f0'],
                                                            f0a=files['f0'] +
                                                            '.a',
                                                            ap=files['ap']))
        elif cfg.vocoder_type == 'WORLD':

            run_process(
                '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'
                .format(sopr=SPTK['SOPR'],
                        lf0=files['lf0'],
                        x2x=SPTK['X2X'],
                        f0=files['f0']))

            run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
                sopr=SPTK['SOPR'],
                bap=files['bap'],
                x2x=SPTK['X2X'],
                ap=files['ap']))

            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

            run_process(
                '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                .format(mgc2sp=SPTK['MGC2SP'],
                        alpha=cfg.fw_alpha,
                        order=cfg.mgc_dim - 1,
                        fl=cfg.fl,
                        mgc=mgc_file_name,
                        sopr=SPTK['SOPR'],
                        x2x=SPTK['X2X'],
                        sp=files['sp']))

            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
                synworld=WORLD['SYNTHESIS'],
                fl=cfg.fl,
                sr=cfg.sr,
                f0=files['f0'],
                sp=files['sp'],
                ap=files['ap'],
                wav=files['wav']))

            run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],
                                                      sp=files['sp'],
                                                      f0=files['f0']))

        os.chdir(cur_dir)
Пример #46
0
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension, \
                 silence_feature_index, percent_to_keep=0):
    '''
    Function to trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files
        in_dimension: dimension of data to trim
        label_list: list of binary labels which contain trimming criterion
        label_dimension: dimension of the label features
        silence_feature_index: index of feature in labels which is silence: 1 means silence (trim), 0 means leave.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):

        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(audio_label_difference) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:  ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:  ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        #         print silence_flag
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## if it's all 0s or 1s, that's ok:
            assert (numpy.unique(silence_flag) == numpy.array([0])).all() or \
                   (numpy.unique(silence_flag) == numpy.array([1])).all(), \
                'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... ' % (
            100 * numpy.sum(silence_flag / float(len(silence_flag))), int(numpy.sum(silence_flag))))
        non_silence_indices = numpy.nonzero(
            silence_flag == 0)  ## get the indices where silence_flag == 0 is True (i.e. != 0)
        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            # print silence_flag
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of input array
            silence_indices = silence_indices[0]
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as step value in slice
            ## -1 due to weird error with STRAIGHT features at line 144:
            ## IndexError: index 445 is out of bounds for axis 0 with size 445
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1])  ## avoid errors in case there is no silence
            print('   Restore %s%% (every %sth frame: %s frames) of silent frames' % (
                percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = (numpy.hstack([non_silence_indices[0], silence_indices_to_keep]))
            ##  ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
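The percent_to_keep branch retains every n-th silent frame and appends those frames after the speech frames, relying on later frame shuffling to mix them back in. A toy trace of that subsampling, assuming a binary silence column as in the labels above:

import numpy as np

silence_flag = np.array([1, 1, 1, 1, 0, 0, 0, 1, 1, 1])
percent_to_keep = 50

non_silence = np.nonzero(silence_flag == 0)[0]   # speech frame indices
silence = np.nonzero(silence_flag == 1)[0]       # silent frame indices
every_nth = 100 // percent_to_keep
kept_silence = silence[::every_nth]              # here: every 2nd silent frame

keep_indices = np.hstack([non_silence, kept_silence])
# temporal order is broken at the end of the utterance, which is acceptable for frame-level training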
Пример #47
0
def wavgen_straight_type_vocoder(gen_dir, file_id_list, cfg, logger):
    '''
    Waveform generation with STRAIGHT or WORLD vocoders.
    (whose acoustic parameters are: mgc, bap, and lf0)
    '''

    SPTK     = cfg.SPTK
#    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD    = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha=='Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha=='ERB':
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError('cfg.fw_alpha='+cfg.fw_alpha+' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file  = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file  = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter=1
    max_counter = len(file_id_list)

    for filename in file_id_list:

        logger.info('creating waveform for %4d of %4d: %s' % (counter,max_counter,filename) )
        counter=counter+1
        base   = filename
        files = {'sp'  : base + cfg.sp_ext,
                 'mgc' : base + cfg.mgc_ext,
                 'f0'  : base + '.f0',
                 'lf0' : base + cfg.lf0_ext,
                 'ap'  : base + '.ap',
                 'bap' : base + cfg.bap_ext,
                 'wav' : base + '.wav'}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']

        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:

            mgc_file_name = files['mgc']+'_p_mgc'
            post_filter(files['mgc'], mgc_file_name, cfg.mgc_dim, pf_coef, fw_coef, co_coef, fl_coef, gen_dir, cfg)

        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)

            gen_mu  = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))

            local_gv = (ref_gv_std/gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean

            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(gen_std, frame_number, 1).T * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T) + np.repeat(gen_mu, frame_number, 1).T

            new_mgc_file_name = files['mgc']+'_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name)

            mgc_file_name = files['mgc']+'_p_mgc'

        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Post-filtering and GV enhancement cannot be applied together!\n')
            sys.exit(1)

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))

            if cfg.use_cep_ap:
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim-1, fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}'
                             .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))

            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 1.2 -cornf 4000 -float -apfile {ap} {f0a} {sp} {wav}'
                        .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift, ap=files['ap'], f0a=files['f0']+'.a', sp=files['sp'], wav=files['wav']))

            run_process('rm -f {sp} {f0} {f0a} {ap}'
                        .format(sp=files['sp'],f0=files['f0'],f0a=files['f0']+'.a',ap=files['ap']))
        elif cfg.vocoder_type == 'WORLD':

            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], x2x=SPTK['X2X'], f0=files['f0']))

            run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(sopr=SPTK['SOPR'],bap=files['bap'],x2x=SPTK['X2X'],ap=files['ap']))

            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], sp=files['sp']))

            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'
                         .format(synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr, f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))

            run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],sp=files['sp'],f0=files['f0']))

        os.chdir(cur_dir)
Пример #48
0
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension, \
                 silence_feature_index, percent_to_keep=0):
    '''
    Function to trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files
        in_dimension: dimension of data to trim
        label_list: list of binary labels which contain trimming criterion
        label_dimension: dimension of the label features
        silence_feature_index: index of feature in labels which is silence: 1 means silence (trim), 0 means leave.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):

        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(
            audio_label_difference
        ) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:  ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] *
                                   int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:  ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        #         print silence_flag
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## if it's all 0s or 1s, that's ok:
            assert (numpy.unique(silence_flag) == numpy.array([0])).all() or \
                   (numpy.unique(silence_flag) == numpy.array([1])).all(), \
                'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... ' %
              (100 * numpy.sum(silence_flag / float(len(silence_flag))),
               int(numpy.sum(silence_flag))))
        non_silence_indices = numpy.nonzero(
            silence_flag ==
            0)  ## get the indices where silence_flag == 0 is True (i.e. != 0)
        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            # print silence_flag
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of input array
            silence_indices = silence_indices[0]
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as step value in slice
            ## -1 due to weird error with STRAIGHT features at line 144:
            ## IndexError: index 445 is out of bounds for axis 0 with size 445
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array(
                    [1])  ## avoid errors in case there is no silence
            print(
                '   Restore %s%% (every %sth frame: %s frames) of silent frames'
                % (percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = (numpy.hstack(
                [non_silence_indices[0], silence_indices_to_keep]))
            ##  ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[
            non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
Пример #49
0
def dnn_generation(valid_file_list, nnets_file_name, n_ins, n_outs, out_file_list, target_mean_vector, target_std_vector, out_dimension_dict, file_extension_dict, vocoder='straight'):
    logger = logging.getLogger("dnn_generation")
    logger.debug('Starting dnn_generation')

    inf_float = -1.0e+10

    plotlogger = logging.getLogger("plotting")

    cfg.gen_wav_features

    if vocoder == 'straight':
        gen_wav_features = ['mgc', 'lf0', 'bap']
    elif vocoder == 'glotthmm':
        gen_wav_features = ['F0', 'Gain', 'HNR', 'LSF','LSFsource']  ## TODO: take this from config
    else:
        sys.exit('unsupported vocoder %s !'%(vocoder))

    stream_start_index = {}
    dimension_index = 0
    for feature_name in list(out_dimension_dict.keys()):
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    dnn_model = pickle.load(open(nnets_file_name, 'rb'))

    file_number = len(valid_file_list)
    io_funcs = BinaryIOCollection()

    mlpg = MLParameterGenerationFast()

    for i in range(file_number):
        logger.info('generating %4d of %4d: %s' % (i+1,file_number,valid_file_list[i]) )
        fid_lab = open(valid_file_list[i], 'rb')
        features = numpy.fromfile(fid_lab, dtype=numpy.float32)
        fid_lab.close()
        features = features[:(n_ins * (features.size // n_ins))]
        features = features.reshape((-1, n_ins))

        frame_number = features.shape[0]

        test_set_x = theano.shared(numpy.asarray(features, dtype=theano.config.floatX))

        mean_matrix = numpy.tile(target_mean_vector, (features.shape[0], 1))
        std_matrix = numpy.tile(target_std_vector, (features.shape[0], 1))

        predicted_mix   = dnn_model.parameter_prediction_mix(test_set_x = test_set_x)
        max_index = numpy.argmax(predicted_mix, axis=1)

        temp_predicted_mu = dnn_model.parameter_prediction(test_set_x=test_set_x)
        temp_predicted_sigma = dnn_model.parameter_prediction_sigma(test_set_x = test_set_x)
        predicted_mu = numpy.zeros((temp_predicted_mu.shape[0], n_outs))
        predicted_sigma = numpy.zeros((temp_predicted_sigma.shape[0], n_outs))
        for kk in range(temp_predicted_mu.shape[0]):
            predicted_mu[kk, :] = temp_predicted_mu[kk, max_index[kk]*n_outs:(max_index[kk]+1)*n_outs]
            predicted_sigma[kk, :] = temp_predicted_sigma[kk, max_index[kk]*n_outs:(max_index[kk]+1)*n_outs]
#        print   predicted_mu.shape
#        predicted_mu = predicted_mu[aa*n_outs:(aa+1)*n_outs]
        predicted_mu = predicted_mu * std_matrix + mean_matrix
        predicted_sigma = ((predicted_sigma ** 0.5) * std_matrix ) ** 2

        dir_name = os.path.dirname(out_file_list[i])
        file_id = os.path.splitext(os.path.basename(out_file_list[i]))[0]

        mlpg = MLParameterGenerationFast()
        for feature_name in gen_wav_features:
            current_features = predicted_mu[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]
            current_sigma    = predicted_sigma[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]

            gen_features = mlpg.generation(current_features, current_sigma, out_dimension_dict[feature_name]//3)

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = predicted_mu[:, stream_start_index['vuv']:stream_start_index['vuv']+1]
                    for frame_index in range(frame_number):
                        if vuv_feature[frame_index, 0] < 0.5:
                            gen_features[frame_index, 0] = inf_float
#                print   gen_features
            new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

            io_funcs.array_to_binary_file(gen_features, new_file_name)
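In the mixture-density part above, each frame's flattened network output holds n_outs means and variances for every mixture component; the code picks out the block belonging to the most probable component. A small sketch of that slicing with toy shapes (component count and dimensions are illustrative):

import numpy as np

n_frames, n_outs, n_mix = 4, 3, 2
predicted_mix = np.random.rand(n_frames, n_mix)        # mixture weights per frame
temp_mu = np.random.randn(n_frames, n_mix * n_outs)    # all component means, flattened per frame

max_index = np.argmax(predicted_mix, axis=1)
mu = np.zeros((n_frames, n_outs))
for k in range(n_frames):
    start = max_index[k] * n_outs
    mu[k, :] = temp_mu[k, start:start + n_outs]        # block of the winning component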
Пример #50
0
def generate_wav(gen_dir, file_id_list, cfg):
        
    logger = logging.getLogger("wav_generation")
    
    SPTK     = cfg.SPTK
#    NND      = cfg.NND
    STRAIGHT = cfg.STRAIGHT
    WORLD    = cfg.WORLD

    ## to be moved
    pf_coef = cfg.pf_coef
    if isinstance(cfg.fw_alpha, str):
        if cfg.fw_alpha=='Bark':
            fw_coef = bark_alpha(cfg.sr)
        elif cfg.fw_alpha=='ERB':
            fw_coef = bark_alpha(cfg.sr)
        else:
            raise ValueError('cfg.fw_alpha='+cfg.fw_alpha+' not implemented, the frequency warping coefficient "fw_coef" cannot be deduced.')
    else:
        fw_coef = cfg.fw_alpha
    co_coef = cfg.co_coef
    fl_coef = cfg.fl

    if cfg.apply_GV:
        io_funcs = BinaryIOCollection()

        logger.info('loading global variance stats from %s' % (cfg.GV_dir))

        ref_gv_mean_file = os.path.join(cfg.GV_dir, 'ref_gv.mean')
        gen_gv_mean_file = os.path.join(cfg.GV_dir, 'gen_gv.mean')
        ref_gv_std_file  = os.path.join(cfg.GV_dir, 'ref_gv.std')
        gen_gv_std_file  = os.path.join(cfg.GV_dir, 'gen_gv.std')

        ref_gv_mean, frame_number = io_funcs.load_binary_file_frame(ref_gv_mean_file, 1)
        gen_gv_mean, frame_number = io_funcs.load_binary_file_frame(gen_gv_mean_file, 1)
        ref_gv_std, frame_number = io_funcs.load_binary_file_frame(ref_gv_std_file, 1)
        gen_gv_std, frame_number = io_funcs.load_binary_file_frame(gen_gv_std_file, 1)

    counter=1
    max_counter = len(file_id_list)

    for filename in file_id_list:

        logger.info('creating waveform for %4d of %4d: %s' % (counter,max_counter,filename) )
        counter=counter+1
        base   = filename
        files = {'sp'  : base + cfg.sp_ext,
                 'mgc' : base + cfg.mgc_ext,
                 'f0'  : base + '.f0',
                 'lf0' : base + cfg.lf0_ext,
                 'ap'  : base + '.ap',
                 'bap' : base + cfg.bap_ext,
                 'wav' : base + '.wav'}

        mgc_file_name = files['mgc']
        bap_file_name = files['bap']
        
        cur_dir = os.getcwd()
        os.chdir(gen_dir)

        ### post-filtering
        if cfg.do_post_filtering:
            line = "echo 1 1 "
            for i in range(2, cfg.mgc_dim):
                line = line + str(pf_coef) + " "

            run_process('{line} | {x2x} +af > {weight}'
                        .format(line=line, x2x=SPTK['X2X'], weight=os.path.join(gen_dir, 'weight')))

            run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | {c2acr} -m {co} -M 0 -l {fl} > {base_r0}'
                        .format(freqt=SPTK['FREQT'], order=cfg.mgc_dim-1, fw=fw_coef, co=co_coef, mgc=files['mgc'], c2acr=SPTK['C2ACR'], fl=fl_coef, base_r0=files['mgc']+'_r0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} | {freqt} -m {order} -a {fw} -M {co} -A 0 | {c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight'),
                                freqt=SPTK['FREQT'], fw=fw_coef, co=co_coef, 
                                c2acr=SPTK['C2ACR'], fl=fl_coef, base_p_r0=files['mgc']+'_p_r0'))

            run_process('{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 0 -e 0 > {base_b0}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight'),
                                mc2b=SPTK['MC2B'], fw=fw_coef, 
                                bcp=SPTK['BCP'], base_b0=files['mgc']+'_b0'))

            run_process('{vopr} -d < {base_r0} {base_p_r0} | {sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'
                        .format(vopr=SPTK['VOPR'], base_r0=files['mgc']+'_r0', base_p_r0=files['mgc']+'_p_r0', 
                                sopr=SPTK['SOPR'], 
                                base_b0=files['mgc']+'_b0', base_p_b0=files['mgc']+'_p_b0'))
          
            run_process('{vopr} -m -n {order} < {mgc} {weight} | {mc2b} -m {order} -a {fw} | {bcp} -n {order} -s 1 -e {order} | {merge} -n {order2} -s 0 -N 0 {base_p_b0} | {b2mc} -m {order} -a {fw} > {base_p_mgc}'
                        .format(vopr=SPTK['VOPR'], order=cfg.mgc_dim-1, mgc=files['mgc'], weight=os.path.join(gen_dir, 'weight'),
                                mc2b=SPTK['MC2B'],  fw=fw_coef, 
                                bcp=SPTK['BCP'], 
                                merge=SPTK['MERGE'], order2=cfg.mgc_dim-2, base_p_b0=files['mgc']+'_p_b0',
                                b2mc=SPTK['B2MC'], base_p_mgc=files['mgc']+'_p_mgc'))

            mgc_file_name = files['mgc']+'_p_mgc'
            
        if cfg.vocoder_type == "STRAIGHT" and cfg.apply_GV:
            gen_mgc, frame_number = io_funcs.load_binary_file_frame(mgc_file_name, cfg.mgc_dim)

            gen_mu  = np.reshape(np.mean(gen_mgc, axis=0), (-1, 1))
            gen_std = np.reshape(np.std(gen_mgc, axis=0), (-1, 1))
   
            local_gv = (ref_gv_std/gen_gv_std) * (gen_std - gen_gv_mean) + ref_gv_mean

            enhanced_mgc = np.repeat(local_gv, frame_number, 1).T / np.repeat(gen_std, frame_number, 1).T * (gen_mgc - np.repeat(gen_mu, frame_number, 1).T) + np.repeat(gen_mu, frame_number, 1).T
            
            new_mgc_file_name = files['mgc']+'_p_mgc'
            io_funcs.array_to_binary_file(enhanced_mgc, new_mgc_file_name) 
            
            mgc_file_name = files['mgc']+'_p_mgc'
        
        if cfg.do_post_filtering and cfg.apply_GV:
            logger.critical('Post-filtering and GV enhancement cannot be applied together!\n')
            sys.exit(1)

        ###mgc to sp to wav
        if cfg.vocoder_type == 'STRAIGHT':
            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sp=files['sp']))
            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], f0=files['f0']))
            run_process('{x2x} +fa {f0} > {f0a}'.format(x2x=SPTK['X2X'], f0=files['f0'], f0a=files['f0'] + '.a'))

            if cfg.use_cep_ap:
                run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} > {ap}'
                            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim-1, fl=cfg.fl, bap=files['bap'], ap=files['ap']))
            else:
                run_process('{bndap2ap} {bap} > {ap}' 
                             .format(bndap2ap=STRAIGHT['BNDAP2AP'], bap=files['bap'], ap=files['ap']))

            run_process('{synfft} -f {sr} -spec -fftl {fl} -shift {shift} -sigp 0.0 -cornf 400 -float -apfile {ap} {f0a} {sp} {wav}'
                        .format(synfft=STRAIGHT['SYNTHESIS_FFT'], sr=cfg.sr, fl=cfg.fl, shift=cfg.shift, ap=files['ap'], f0a=files['f0']+'.a', sp=files['sp'], wav=files['wav']))

            run_process('rm -f {sp} {f0} {f0a} {ap}'
                        .format(sp=files['sp'],f0=files['f0'],f0a=files['f0']+'.a',ap=files['ap']))
        elif cfg.vocoder_type == 'WORLD':        

            run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | {x2x} +fd > {f0}'.format(sopr=SPTK['SOPR'], lf0=files['lf0'], x2x=SPTK['X2X'], f0=files['f0']))        
            
            run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(sopr=SPTK['SOPR'],bap=files['bap'],x2x=SPTK['X2X'],ap=files['ap']))
            
            ### If using world v2, please comment above line and uncomment this
            #run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 0 {bap} | {sopr} -d 32768.0 -P | {x2x} +fd > {ap}'
            #            .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.bap_dim, fl=cfg.fl, bap=bap_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], ap=files['ap']))

            run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | {sopr} -d 32768.0 -P | {x2x} +fd > {sp}'
                        .format(mgc2sp=SPTK['MGC2SP'], alpha=cfg.fw_alpha, order=cfg.mgc_dim-1, fl=cfg.fl, mgc=mgc_file_name, sopr=SPTK['SOPR'], x2x=SPTK['X2X'], sp=files['sp']))

            run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'
                         .format(synworld=WORLD['SYNTHESIS'], fl=cfg.fl, sr=cfg.sr, f0=files['f0'], sp=files['sp'], ap=files['ap'], wav=files['wav']))
            
            run_process('rm -f {ap} {sp} {f0}'.format(ap=files['ap'],sp=files['sp'],f0=files['f0']))

        else:
        
            logger.critical('The vocoder %s is not supported yet!\n' % cfg.vocoder_type )
            sys.exit(1)
        
        os.chdir(cur_dir)
Пример #51
0
    def make_labels(self,input_file_descriptors,out_file_name=None,\
                                    fill_missing_values=False,iterate_over_frames=False):

        ## input_file_descriptors is e.g. {'xpath': <open XML file for reading>}

        # file_descriptors is a dictionary of open label files all for the same utterance
        # currently supports XPATH or HTS file formats only
        # keys should be 'xpath' or 'hts'
        
        # an array in which to assemble all the features
        all_labels = None
        
        try:
            assert self.configuration
        except AssertionError:
            self.logger.critical('no label configuration loaded, so cannot make labels')
            raise
            
            
        # now iterate through the features, and create the features from the appropriate open label file
        
        xpath_list = []  ## gather all here and extract all features in one pass
        mapper_list = []
        
        for (item_number, feature_specification) in enumerate(self.configuration.labels):
        
            #osw# self.logger.debug('constructing feature %.80s ...' % feature_specification )
                        
            ## osw -- we'll append frame features to the data for the *LAST* 
            ##        feature_specification in our list 
            add_frame_features = False
            if item_number+1 == len(self.configuration.labels):
                add_frame_features = True
                #osw# self.logger.debug('append frame features')
                        
            # which label file should we use?
            if 'xpath' in feature_specification:
                # xpath and hts are mutually exclusive label styles
                assert 'hts' not in feature_specification
                #osw# self.logger.debug(' feature style: xpath ; XPATH: %s' % feature_specification['xpath']  )
            
                # actually make the features from this open file and the current XPATH

                try:
                    assert self.configuration.target_nodes
                except:
                    self.logger.critical('When using XPATH features, "target_nodes" must be defined in the label config file')
                    raise

                try:
                    xpath_list.append(feature_specification['xpath'])
                    if 'mapper' in feature_specification:
                        mapper_list.append(feature_specification['mapper'])
                    else:
                        mapper_list.append(None)
                except:
                    self.logger.critical('error creating XMLLabelNormalisation object for feature %s' % feature_specification )
                    raise
                    
                    
            if 'hts' in feature_specification:
                assert 'xpath' not in feature_specification
                # not yet implemented !
                self.logger.warning('HTS features not implemented - ignoring them!')
                #these_labels=None
                # to do, with implementation: deal with fill_missing_values correctly                     
                          
                          
        ## Now extract all feats in one go -- go straight to all_labels -- don't compose from 'these_labels':
        label_normaliser = XMLLabelNormalisation(xpath=xpath_list,mapper=mapper_list,fill_missing_values=fill_missing_values,target_nodes=self.configuration.target_nodes,use_compiled_xpath=self.use_precompiled_xpaths,iterate_over_frames=iterate_over_frames)
                            
        try:
            all_labels = label_normaliser.extract_linguistic_features(input_file_descriptors['xpath'], add_frame_features=add_frame_features)
        except KeyError:
            self.logger.critical('no open xpath label file available to create feature %s' % feature_specification )
            raise
        
            
        
#             # add these_features as additional columns of all_features
#             if (these_labels != None):
#                 if all_labels != None:
#                     all_labels = numpy.hstack((all_labels,these_labels))
#                 else:
#                     all_labels= these_labels

        if all_labels is not None:
            self.logger.debug(' composed features now have dimension %d' % all_labels.shape[1])
            
        #osw# self.logger.debug( 'first line of labels: ' + str(all_labels[0,:]))
                
        
        # finally, save the labels
        if out_file_name:
            io_funcs = BinaryIOCollection()
            io_funcs.array_to_binary_file(all_labels, out_file_name)
            
            ## osw: useful for debugging:
            ##numpy.savetxt(out_file_name + '.TXT', all_labels, delimiter='\t')
            
            
            # debug
            # with printoptions(threshold=3000, linewidth=1000, edgeitems=1000, precision=1, suppress=True):
            #     # print all_labels
            #     print all_labels.sum(axis=1)
            
            
            self.logger.info('saved numerical features of shape %s to %s' % (all_labels.shape,out_file_name) )
        else:
            return all_labels
Example #52
0
    def acoustic_decomposition(self, in_file_list, dimension, out_dimension_dict, file_extension_dict, var_file_dict, do_MLPG=True, cfg=None):

        logger = logging.getLogger('param_generation')

        logger.debug('acoustic_decomposition for %d files' % len(in_file_list) )

        self.load_covariance(var_file_dict, out_dimension_dict)

        stream_start_index = {}
        dimension_index = 0
        recorded_vuv = False
        vuv_dimension = None

        for feature_name in list(out_dimension_dict.keys()):
#            if feature_name != 'vuv':
            stream_start_index[feature_name] = dimension_index
#            else:
#                vuv_dimension = dimension_index
#                recorded_vuv = True

            dimension_index += out_dimension_dict[feature_name]

        io_funcs = BinaryIOCollection()

        mlpg_algo = MLParameterGeneration()

        findex = 0
        flen = len(in_file_list)
        for file_name in in_file_list:

            findex += 1

            dir_name = os.path.dirname(file_name)
            file_id = os.path.splitext(os.path.basename(file_name))[0]

            features, frame_number = io_funcs.load_binary_file_frame(file_name, dimension)

            logger.info('processing %4d of %4d: %s' % (findex,flen,file_name) )

            for feature_name in self.gen_wav_features:

                logger.debug(' feature: %s' % feature_name)

                current_features = features[:, stream_start_index[feature_name]:stream_start_index[feature_name]+out_dimension_dict[feature_name]]
                if FAST_MLPG:
                    ### fast version wants variance per frame, not single global one:
                    var = self.var[feature_name]
                    var = numpy.transpose(numpy.tile(var,frame_number))
                else:
                    var = self.var[feature_name]

#                print  var.shape[1]
                if do_MLPG == False:
                    gen_features = current_features
                else:
                    gen_features = mlpg_algo.generation(current_features, var, out_dimension_dict[feature_name]//3)
#                else:
#                    self.logger.critical("the dimensions do not match for MLPG: %d vs %d" %(var.shape[1], out_dimension_dict[feature_name]))
#                    raise

                logger.debug(' feature dimensions: %d by %d' %(gen_features.shape[0], gen_features.shape[1]))

                if feature_name in ['lf0', 'F0']:
                    if 'vuv' in stream_start_index:
                        vuv_feature = features[:, stream_start_index['vuv']:stream_start_index['vuv']+1]

                        for i in range(frame_number):
                            if vuv_feature[i, 0] < 0.5 or gen_features[i, 0] < numpy.log(20):
                                gen_features[i, 0] = self.inf_float

                new_file_name = os.path.join(dir_name, file_id + file_extension_dict[feature_name])

                if self.enforce_silence:
                    silence_pattern = cfg.silence_pattern
                    label_align_dir = cfg.in_label_align_dir
                    in_f = open(label_align_dir+'/'+file_id+'.lab','r')
                    for line in in_f.readlines():
                        line = line.strip()

                        if len(line) < 1:
                            continue
                        temp_list  = re.split(r'\s+', line)
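                        # HTS-style label times are in 100 ns units: multiplying by 10**-4
                        # gives milliseconds, dividing by 5 gives indices of (assumed) 5 ms frames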
                        start_time = int(int(temp_list[0])*(10**-4)/5)
                        end_time   = int(int(temp_list[1])*(10**-4)/5)

                        full_label = temp_list[2]

                        label_binary_flag = self.check_silence_pattern(full_label, silence_pattern)

                        if label_binary_flag:
                            if feature_name in ['lf0', 'F0', 'mag']:
                                gen_features[start_time:end_time, :] = self.inf_float
                            else:
                                gen_features[start_time:end_time, :] = 0.0

                io_funcs.array_to_binary_file(gen_features, new_file_name)
                logger.debug(' wrote to file %s' % new_file_name)
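
A minimal sketch of how this method might be invoked. The owning class name, the attribute defaults and every path/dimension below are illustrative assumptions rather than values taken from the snippet, so read it as the shape of a call, not a definitive recipe:

    # hypothetical stream layout: 60 MGC + 1 lf0 + 1 bap, each with delta/acc, plus 1 vuv column
    out_dimension_dict  = {'mgc': 180, 'lf0': 3, 'vuv': 1, 'bap': 3}
    file_extension_dict = {'mgc': '.mgc', 'lf0': '.lf0', 'bap': '.bap'}
    var_file_dict       = {name: 'var/' + name for name in ['mgc', 'lf0', 'vuv', 'bap']}

    generator = ParameterGeneration()             # assumed owner class of acoustic_decomposition
    generator.gen_wav_features = ['mgc', 'lf0', 'bap']
    generator.enforce_silence = False
    generator.inf_float = -1.0e+10

    generator.acoustic_decomposition(
        in_file_list=['gen/utt_0001.cmp'],        # network output with all streams concatenated
        dimension=sum(out_dimension_dict.values()),
        out_dimension_dict=out_dimension_dict,
        file_extension_dict=file_extension_dict,
        var_file_dict=var_file_dict,
        do_MLPG=True)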
Example #53
0
def generate_wav(data,
                 gen_dir,
                 base,
                 sptk_dir,
                 world_dir,
                 norm_info_file,
                 do_post_filtering=True,
                 mgc_dim=60,
                 fl=1024,
                 sr=16000):

    io_funcs = BinaryIOCollection()
    file_name = os.path.join(gen_dir, base + ".cmp")

    fid = open(norm_info_file, 'rb')
    cmp_info = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_info = cmp_info.reshape((2, -1))
    cmp_mean = cmp_info[0, ]
    cmp_std = cmp_info[1, ]

    data = data * cmp_std + cmp_mean

    io_funcs.array_to_binary_file(data, file_name)

    # This code was adapted from Merlin. I should add the license.

    out_dimension_dict = {'bap': 1, 'lf0': 1, 'mgc': 60, 'vuv': 1}
    stream_start_index = {}
    file_extension_dict = {
        'mgc': '.mgc',
        'bap': '.bap',
        'lf0': '.lf0',
        'dur': '.dur',
        'cmp': '.cmp'
    }
    gen_wav_features = ['mgc', 'lf0', 'bap']

    dimension_index = 0
    for feature_name in out_dimension_dict.keys():
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]
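    # with the dict above this yields stream_start_index = {'bap': 0, 'lf0': 1, 'mgc': 2, 'vuv': 62}
    # (relying on dict insertion order, guaranteed from Python 3.7) and 63 columns in total,
    # consistent with the dimension 63 passed to load_binary_file_frame below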

    dir_name = os.path.dirname(file_name)
    file_id = os.path.splitext(os.path.basename(file_name))[0]
    features, frame_number = io_funcs.load_binary_file_frame(file_name, 63)

    for feature_name in gen_wav_features:

        current_features = features[:, stream_start_index[feature_name]:
                                    stream_start_index[feature_name] +
                                    out_dimension_dict[feature_name]]

        gen_features = current_features

        if feature_name in ['lf0', 'F0']:
            if 'vuv' in stream_start_index.keys():
                vuv_feature = features[:, stream_start_index['vuv']:
                                       stream_start_index['vuv'] + 1]

                for i in range(frame_number):
                    if vuv_feature[i, 0] < 0.5:
                        gen_features[i, 0] = -1.0e+10  # self.inf_float

        new_file_name = os.path.join(
            dir_name, file_id + file_extension_dict[feature_name])

        io_funcs.array_to_binary_file(gen_features, new_file_name)

    pf_coef = 1.4
    fw_alpha = 0.58
    co_coef = 511

    sptk_path = {
        'SOPR': sptk_dir + 'sopr',
        'FREQT': sptk_dir + 'freqt',
        'VSTAT': sptk_dir + 'vstat',
        'MGC2SP': sptk_dir + 'mgc2sp',
        'MERGE': sptk_dir + 'merge',
        'BCP': sptk_dir + 'bcp',
        'MC2B': sptk_dir + 'mc2b',
        'C2ACR': sptk_dir + 'c2acr',
        'MLPG': sptk_dir + 'mlpg',
        'VOPR': sptk_dir + 'vopr',
        'B2MC': sptk_dir + 'b2mc',
        'X2X': sptk_dir + 'x2x',
        'VSUM': sptk_dir + 'vsum'
    }

    world_path = {
        'ANALYSIS': world_dir + 'analysis',
        'SYNTHESIS': world_dir + 'synth'
    }

    fw_coef = fw_alpha
    fl_coef = fl

    files = {
        'sp': base + '.sp',
        'mgc': base + '.mgc',
        'f0': base + '.f0',
        'lf0': base + '.lf0',
        'ap': base + '.ap',
        'bap': base + '.bap',
        'wav': base + '.wav'
    }

    mgc_file_name = files['mgc']

    cur_dir = os.getcwd()
    os.chdir(gen_dir)

    #  post-filtering
    if do_post_filtering:
        line = "echo 1 1 "
        for i in range(2, mgc_dim):
            line = line + str(pf_coef) + " "

        run_process('{line} | {x2x} +af > {weight}'.format(
            line=line,
            x2x=sptk_path['X2X'],
            weight=os.path.join(gen_dir, 'weight')))

        run_process('{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_r0}'.format(
                        freqt=sptk_path['FREQT'],
                        order=mgc_dim - 1,
                        fw=fw_coef,
                        co=co_coef,
                        mgc=files['mgc'],
                        c2acr=sptk_path['C2ACR'],
                        fl=fl_coef,
                        base_r0=files['mgc'] + '_r0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{freqt} -m {order} -a {fw} -M {co} -A 0 | '
                    '{c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'.format(
                        vopr=sptk_path['VOPR'],
                        order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        freqt=sptk_path['FREQT'],
                        fw=fw_coef,
                        co=co_coef,
                        c2acr=sptk_path['C2ACR'],
                        fl=fl_coef,
                        base_p_r0=files['mgc'] + '_p_r0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 0 -e 0 > {base_b0}'.format(
                        vopr=sptk_path['VOPR'],
                        order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'],
                        fw=fw_coef,
                        bcp=sptk_path['BCP'],
                        base_b0=files['mgc'] + '_b0'))

        run_process(
            '{vopr} -d < {base_r0} {base_p_r0} | '
            '{sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'.format(
                vopr=sptk_path['VOPR'],
                base_r0=files['mgc'] + '_r0',
                base_p_r0=files['mgc'] + '_p_r0',
                sopr=sptk_path['SOPR'],
                base_b0=files['mgc'] + '_b0',
                base_p_b0=files['mgc'] + '_p_b0'))

        run_process('{vopr} -m -n {order} < {mgc} {weight} | '
                    '{mc2b} -m {order} -a {fw} | '
                    '{bcp} -n {order} -s 1 -e {order} | '
                    '{merge} -n {order2} -s 0 -N 0 {base_p_b0} | '
                    '{b2mc} -m {order} -a {fw} > {base_p_mgc}'.format(
                        vopr=sptk_path['VOPR'],
                        order=mgc_dim - 1,
                        mgc=files['mgc'],
                        weight=os.path.join(gen_dir, 'weight'),
                        mc2b=sptk_path['MC2B'],
                        fw=fw_coef,
                        bcp=sptk_path['BCP'],
                        merge=sptk_path['MERGE'],
                        order2=mgc_dim - 2,
                        base_p_b0=files['mgc'] + '_p_b0',
                        b2mc=sptk_path['B2MC'],
                        base_p_mgc=files['mgc'] + '_p_mgc'))

        mgc_file_name = files['mgc'] + '_p_mgc'

    # Vocoder WORLD

    run_process('{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | '
                '{x2x} +fd > {f0}'.format(sopr=sptk_path['SOPR'],
                                          lf0=files['lf0'],
                                          x2x=sptk_path['X2X'],
                                          f0=files['f0']))

    run_process('{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
        sopr=sptk_path['SOPR'],
        bap=files['bap'],
        x2x=sptk_path['X2X'],
        ap=files['ap']))

    run_process('{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | '
                '{sopr} -d 32768.0 -P | {x2x} +fd > {sp}'.format(
                    mgc2sp=sptk_path['MGC2SP'],
                    alpha=fw_alpha,
                    order=mgc_dim - 1,
                    fl=fl,
                    mgc=mgc_file_name,
                    sopr=sptk_path['SOPR'],
                    x2x=sptk_path['X2X'],
                    sp=files['sp']))

    run_process('{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
        synworld=world_path['SYNTHESIS'],
        fl=fl,
        sr=sr,
        f0=files['f0'],
        sp=files['sp'],
        ap=files['ap'],
        wav=files['wav']))

    run_process('rm -f {ap} {sp} {f0} {bap} {lf0} {mgc} {mgc}_b0 {mgc}_p_b0 '
                '{mgc}_p_mgc {mgc}_p_r0 {mgc}_r0 {cmp} weight'.format(
                    ap=files['ap'],
                    sp=files['sp'],
                    f0=files['f0'],
                    bap=files['bap'],
                    lf0=files['lf0'],
                    mgc=files['mgc'],
                    cmp=base + '.cmp'))
    os.chdir(cur_dir)
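
A minimal usage sketch for generate_wav. All paths and the dummy input below are assumptions; the only details taken from the code itself are the (frames, 63) layout of the cmp data, the 2 x 63 float32 mean/std layout of norm_info_file, and the fact that sptk_dir/world_dir are bare string prefixes and therefore need a trailing slash:

import numpy

frames = 200
data = numpy.zeros((frames, 63), dtype=numpy.float32)        # e.g. normalised network output

generate_wav(data,
             gen_dir='/tmp/gen',                              # assumed output directory
             base='utt_0001',                                 # basename for all intermediate files
             sptk_dir='/opt/SPTK/bin/',                       # assumed; trailing slash required
             world_dir='/opt/WORLD/build/',                   # assumed; must contain 'analysis'/'synth'
             norm_info_file='/tmp/norm_info_63.dat',          # 2 x 63 float32 (mean row, std row)
             do_post_filtering=True,
             mgc_dim=60, fl=1024, sr=16000)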
Example #54
0
         max_d=max_d,  # plot a horizontal cut-off line
     )
     plt.show()
     
     
     clusters_contours = []
     for xi in range(k):
         ind_cnt = np.where(clusters == xi+1)
         clusters_contours = np.concatenate((clusters_contours, np.mean(Y1[ind_cnt], axis=0)), axis=0)
     
     final_clusters = clusters_contours.reshape(k,coef_size-1)
     
     if k%2==0:
         plot_templates(final_clusters)
     
     io_funcs.array_to_binary_file(final_clusters, clusters_file)   
     # ## comment below line to run full list of files
     # break; ### breaks after processing one file - to check errors
 
 train_clusters = []
 dev_clusters   = []
 test_clusters = []
 
 train_utt = 3850; valid_utt = 116; test_utt = 271
 if templatefeats:
     stat_fname = feat_dir_path + '.txt'
     stats_template_file = os.path.join(work_dir, 'Data/inter-module/'+speaker+'/misc/', stat_fname)
     filelist = os.path.join(work_dir, 'Data/fileList/'+speaker+'.scp')
     list_arr = io_funcs.load_file_list(filelist)
     
     prosody_feats = []; flens = []; syl_dur_lens = []
Example #55
0
    def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict, out_dimension_dict):

        logger = logging.getLogger("acoustic_comp")

        stream_start_index = {}
        stream_dim_index = 0
        # prepare data 
        # counter = 0
        # sort_order = {'bap': 0, 'lf0': 1, 'mgc': 2,'vuv': 3, 'spk': 4}
        # for i in out_dimension_dict:
        #     odrded_keys = sorted(i,key=lambda x: sort_order[x])
        #     # tmp = OrderedDict(sorted(i.items(),key=lambda x: sort_order[x[0]]))
        # #     data[counter].update(tmp)
        # #     counter+=1
        # # print(data)
        data = defaultdict(OrderedDict)
        source = [out_dimension_dict]
        counter = 0


        if 'dur' not in out_dimension_dict:
            sort_order = {'bap': 1, 'lf0': 2, 'vuv': 3, 'mgc': 0, 'spk': 4}
        else:
            sort_order = {'dur': 0, 'spk': 1}

        for i in source:
            ordered_keys = sorted(i, key=lambda x: sort_order[x])  # note: computed but not used below
            tmp = OrderedDict(sorted(i.items(),key=lambda x: sort_order[x[0]]))
            data[counter].update(tmp)
            counter+=1

        out_dimension_dict = data[0]

        # print('keys of out dimension   {}'.format(out_dimension_dict.keys()))
        for stream_name in list(out_dimension_dict.keys()):

            if stream_name not in stream_start_index:
                stream_start_index[stream_name] = stream_dim_index

            stream_dim_index += out_dimension_dict[stream_name]
        # print('out put dimension dict {}'.format(out_dimension_dict))
        io_funcs = BinaryIOCollection()

        for i in range(self.file_number):
            out_file_name = out_file_list[i]

            #if os.path.isfile(out_file_name):
            #    logger.info('processing file %4d of %4d : %s exists' % (i+1, self.file_number, out_file_name))
                    #    continue

            logger.info('processing file %4d of %4d : %s' % (i+1,self.file_number,out_file_name))

            out_data_matrix = None
            out_frame_number = 0


            for k in range(self.data_stream_number):
                data_stream_name = self.data_stream_list[k]
                in_file_name   = in_file_list_dict[data_stream_name][i]
                in_feature_dim = in_dimension_dict[data_stream_name]
                features, frame_number = io_funcs.load_binary_file_frame(in_file_name, in_feature_dim)


                # if in_file_name.split('.')[1]=='spk':
                #     print('load features related to speaker ')
                #     print(features)
                # print('prepare data from the acoustic composition {} in_file_name {} in_feature_dim {} frame_number {} '.format(k,in_file_name,in_feature_dim,frame_number))
                if k == 0:
                    out_frame_number = frame_number
                    out_data_matrix = numpy.zeros((out_frame_number, self.out_dimension))

                if frame_number > out_frame_number:
                    features = features[0:out_frame_number, ]
                    frame_number = out_frame_number

                try:
                    assert  out_frame_number == frame_number
                except AssertionError:
                    logger.critical('the frame number of data stream %s is not consistent with others: current %d others %d'
                                         %(data_stream_name, out_frame_number, frame_number))
                    raise

                dim_index = stream_start_index[data_stream_name]

                if data_stream_name in ['lf0', 'F0']:   ## F0 added for GlottHMM
                    features, vuv_vector = self.interpolate_f0(features)

                    ### if vuv information to be recorded, store it in corresponding column
                    if self.record_vuv:
                        out_data_matrix[0:out_frame_number, stream_start_index['vuv']:stream_start_index['vuv']+1] = vuv_vector

                out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = features
                dim_index = dim_index+in_feature_dim

                if self.compute_dynamic[data_stream_name]:

                    delta_features = self.compute_dynamic_matrix(features, self.delta_win, frame_number, in_feature_dim)
                    acc_features   = self.compute_dynamic_matrix(features, self.acc_win, frame_number, in_feature_dim)


                    out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = delta_features
                    dim_index = dim_index+in_feature_dim

                    out_data_matrix[0:out_frame_number, dim_index:dim_index+in_feature_dim] = acc_features

            ### write data to file
            io_funcs.array_to_binary_file(out_data_matrix, out_file_name)
            logger.debug(' wrote %d frames of features',out_frame_number )
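
For reference, the delta and acceleration features appended above are conventionally obtained by sliding fixed regression windows over each column. The sketch below illustrates that idea only: the window values are the usual Merlin/HTS defaults and the zero-padded edges are a simplification, not a claim about what compute_dynamic_matrix does internally.

import numpy

def dynamic_matrix(features, win, frame_number, dimension):
    # out[t, d] = sum_k win[k] * features[t + k - 1, d]; reversing the window turns
    # numpy.convolve into a cross-correlation, and mode='same' keeps the frame count
    out = numpy.zeros((frame_number, dimension))
    for d in range(dimension):
        out[:, d] = numpy.convolve(features[:, d], numpy.asarray(win)[::-1], mode='same')
    return out

delta_win = [-0.5, 0.0, 0.5]    # velocity window: 0.5 * (x[t+1] - x[t-1])
acc_win   = [1.0, -2.0, 1.0]    # acceleration window: x[t+1] - 2*x[t] + x[t-1]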
Example #56
0
    def prepare_data(self, in_file_list_dict, out_file_list, in_dimension_dict,
                     out_dimension_dict):

        logger = logging.getLogger("acoustic_comp")

        stream_start_index = {}
        stream_dim_index = 0
        for stream_name in out_dimension_dict.keys():
            if stream_name not in stream_start_index:
                stream_start_index[stream_name] = stream_dim_index

            stream_dim_index += out_dimension_dict[stream_name]

        io_funcs = BinaryIOCollection()

        for i in range(self.file_number):
            out_file_name = out_file_list[i]
            #if os.path.isfile(out_file_name):
            #    logger.info('processing file %4d of %4d : %s exists' % (i+1, self.file_number, out_file_name))
            #    continue

            logger.info('processing file %4d of %4d : %s' %
                        (i + 1, self.file_number, out_file_name))

            out_data_matrix = None
            out_frame_number = 0

            for k in range(self.data_stream_number):
                data_stream_name = self.data_stream_list[k]

                in_file_name = in_file_list_dict[data_stream_name][i]

                in_feature_dim = in_dimension_dict[data_stream_name]
                features, frame_number = io_funcs.load_binary_file_frame(
                    in_file_name, in_feature_dim)

                if k == 0:
                    out_frame_number = frame_number
                    out_data_matrix = numpy.zeros(
                        (out_frame_number, self.out_dimension))

                if frame_number > out_frame_number:
                    features = features[0:out_frame_number, ]
                    frame_number = out_frame_number

                try:
                    assert out_frame_number == frame_number
                except AssertionError:
                    logger.critical(
                        'the frame number of data stream %s is not consistent with others: current %d others %d'
                        % (data_stream_name, out_frame_number, frame_number))
                    raise

                dim_index = stream_start_index[data_stream_name]

                if data_stream_name in ['lf0', 'F0']:  ## F0 added for GlottHMM
                    features, vuv_vector = self.interpolate_f0(features)

                    ### if vuv information to be recorded, store it in corresponding column
                    if self.record_vuv:
                        out_data_matrix[0:out_frame_number,
                                        stream_start_index['vuv']:
                                        stream_start_index['vuv'] +
                                        1] = vuv_vector

                out_data_matrix[0:out_frame_number, dim_index:dim_index +
                                in_feature_dim] = features
                dim_index = dim_index + in_feature_dim

                if self.compute_dynamic[data_stream_name]:
                    print(features.shape, out_file_name)
                    delta_features = self.compute_dynamic_matrix(
                        features, self.delta_win, frame_number, in_feature_dim)
                    acc_features = self.compute_dynamic_matrix(
                        features, self.acc_win, frame_number, in_feature_dim)

                    out_data_matrix[0:out_frame_number, dim_index:dim_index +
                                    in_feature_dim] = delta_features
                    dim_index = dim_index + in_feature_dim

                    out_data_matrix[0:out_frame_number, dim_index:dim_index +
                                    in_feature_dim] = acc_features

            ### write data to file
            io_funcs.array_to_binary_file(out_data_matrix, out_file_name)
            logger.debug(' wrote %d frames of features', out_frame_number)
Example #57
0
def generate_wav(
        data, gen_dir, base, sptk_dir, world_dir, norm_info_file,
        do_post_filtering=True, mgc_dim=60, fl=1024, sr=16000):

    io_funcs = BinaryIOCollection()
    file_name = os.path.join(gen_dir, base + ".cmp")

    fid = open(norm_info_file, 'rb')
    cmp_info = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_info = cmp_info.reshape((2, -1))
    cmp_mean = cmp_info[0, ]
    cmp_std = cmp_info[1, ]

    data = data * cmp_std + cmp_mean

    io_funcs.array_to_binary_file(data, file_name)

    # This code was adapted from Merlin. I should add the license.

    out_dimension_dict = {'bap': 1, 'lf0': 1, 'mgc': 60, 'vuv': 1}
    stream_start_index = {}
    file_extension_dict = {
        'mgc': '.mgc', 'bap': '.bap', 'lf0': '.lf0',
        'dur': '.dur', 'cmp': '.cmp'}
    gen_wav_features = ['mgc', 'lf0', 'bap']

    dimension_index = 0
    for feature_name in out_dimension_dict.keys():
        stream_start_index[feature_name] = dimension_index
        dimension_index += out_dimension_dict[feature_name]

    dir_name = os.path.dirname(file_name)
    file_id = os.path.splitext(os.path.basename(file_name))[0]
    features, frame_number = io_funcs.load_binary_file_frame(file_name, 63)

    for feature_name in gen_wav_features:

        current_features = features[
            :, stream_start_index[feature_name]:
            stream_start_index[feature_name] +
            out_dimension_dict[feature_name]]

        gen_features = current_features

        if feature_name in ['lf0', 'F0']:
            if 'vuv' in stream_start_index.keys():
                vuv_feature = features[
                    :, stream_start_index['vuv']:stream_start_index['vuv'] + 1]

                for i in range(frame_number):
                    if vuv_feature[i, 0] < 0.5:
                        gen_features[i, 0] = -1.0e+10  # self.inf_float

        new_file_name = os.path.join(
            dir_name, file_id + file_extension_dict[feature_name])

        io_funcs.array_to_binary_file(gen_features, new_file_name)

    pf_coef = 1.4
    fw_alpha = 0.58
    co_coef = 511

    sptk_path = {
        'SOPR': sptk_dir + 'sopr',
        'FREQT': sptk_dir + 'freqt',
        'VSTAT': sptk_dir + 'vstat',
        'MGC2SP': sptk_dir + 'mgc2sp',
        'MERGE': sptk_dir + 'merge',
        'BCP': sptk_dir + 'bcp',
        'MC2B': sptk_dir + 'mc2b',
        'C2ACR': sptk_dir + 'c2acr',
        'MLPG': sptk_dir + 'mlpg',
        'VOPR': sptk_dir + 'vopr',
        'B2MC': sptk_dir + 'b2mc',
        'X2X': sptk_dir + 'x2x',
        'VSUM': sptk_dir + 'vsum'}

    world_path = {
        'ANALYSIS': world_dir + 'analysis',
        'SYNTHESIS': world_dir + 'synth'}

    fw_coef = fw_alpha
    fl_coef = fl

    files = {'sp': base + '.sp',
             'mgc': base + '.mgc',
             'f0': base + '.f0',
             'lf0': base + '.lf0',
             'ap': base + '.ap',
             'bap': base + '.bap',
             'wav': base + '.wav'}

    mgc_file_name = files['mgc']

    cur_dir = os.getcwd()
    os.chdir(gen_dir)

    #  post-filtering
    if do_post_filtering:
        line = "echo 1 1 "
        for i in range(2, mgc_dim):
            line = line + str(pf_coef) + " "

        run_process(
            '{line} | {x2x} +af > {weight}'
            .format(
                line=line, x2x=sptk_path['X2X'],
                weight=os.path.join(gen_dir, 'weight')))

        run_process(
            '{freqt} -m {order} -a {fw} -M {co} -A 0 < {mgc} | '
            '{c2acr} -m {co} -M 0 -l {fl} > {base_r0}'
            .format(
                freqt=sptk_path['FREQT'], order=mgc_dim - 1,
                fw=fw_coef, co=co_coef, mgc=files['mgc'],
                c2acr=sptk_path['C2ACR'], fl=fl_coef,
                base_r0=files['mgc'] + '_r0'))

        run_process(
            '{vopr} -m -n {order} < {mgc} {weight} | '
            '{freqt} -m {order} -a {fw} -M {co} -A 0 | '
            '{c2acr} -m {co} -M 0 -l {fl} > {base_p_r0}'
            .format(
                vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight'),
                freqt=sptk_path['FREQT'], fw=fw_coef, co=co_coef,
                c2acr=sptk_path['C2ACR'], fl=fl_coef,
                base_p_r0=files['mgc'] + '_p_r0'))

        run_process(
            '{vopr} -m -n {order} < {mgc} {weight} | '
            '{mc2b} -m {order} -a {fw} | '
            '{bcp} -n {order} -s 0 -e 0 > {base_b0}'
            .format(
                vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight'),
                mc2b=sptk_path['MC2B'], fw=fw_coef,
                bcp=sptk_path['BCP'], base_b0=files['mgc'] + '_b0'))

        run_process(
            '{vopr} -d < {base_r0} {base_p_r0} | '
            '{sopr} -LN -d 2 | {vopr} -a {base_b0} > {base_p_b0}'
            .format(
                vopr=sptk_path['VOPR'],
                base_r0=files['mgc'] + '_r0',
                base_p_r0=files['mgc'] + '_p_r0',
                sopr=sptk_path['SOPR'],
                base_b0=files['mgc'] + '_b0',
                base_p_b0=files['mgc'] + '_p_b0'))

        run_process(
            '{vopr} -m -n {order} < {mgc} {weight} | '
            '{mc2b} -m {order} -a {fw} | '
            '{bcp} -n {order} -s 1 -e {order} | '
            '{merge} -n {order2} -s 0 -N 0 {base_p_b0} | '
            '{b2mc} -m {order} -a {fw} > {base_p_mgc}'
            .format(
                vopr=sptk_path['VOPR'], order=mgc_dim - 1,
                mgc=files['mgc'],
                weight=os.path.join(gen_dir, 'weight'),
                mc2b=sptk_path['MC2B'], fw=fw_coef,
                bcp=sptk_path['BCP'],
                merge=sptk_path['MERGE'], order2=mgc_dim - 2,
                base_p_b0=files['mgc'] + '_p_b0',
                b2mc=sptk_path['B2MC'],
                base_p_mgc=files['mgc'] + '_p_mgc'))

        mgc_file_name = files['mgc'] + '_p_mgc'

    # Vocoder WORLD

    run_process(
        '{sopr} -magic -1.0E+10 -EXP -MAGIC 0.0 {lf0} | '
        '{x2x} +fd > {f0}'
        .format(
            sopr=sptk_path['SOPR'], lf0=files['lf0'],
            x2x=sptk_path['X2X'], f0=files['f0']))

    run_process(
        '{sopr} -c 0 {bap} | {x2x} +fd > {ap}'.format(
            sopr=sptk_path['SOPR'], bap=files['bap'],
            x2x=sptk_path['X2X'], ap=files['ap']))

    run_process(
        '{mgc2sp} -a {alpha} -g 0 -m {order} -l {fl} -o 2 {mgc} | '
        '{sopr} -d 32768.0 -P | {x2x} +fd > {sp}'.format(
            mgc2sp=sptk_path['MGC2SP'], alpha=fw_alpha,
            order=mgc_dim - 1, fl=fl, mgc=mgc_file_name,
            sopr=sptk_path['SOPR'], x2x=sptk_path['X2X'], sp=files['sp']))

    run_process(
        '{synworld} {fl} {sr} {f0} {sp} {ap} {wav}'.format(
            synworld=world_path['SYNTHESIS'], fl=fl, sr=sr,
            f0=files['f0'], sp=files['sp'], ap=files['ap'],
            wav=files['wav']))

    run_process(
        'rm -f {ap} {sp} {f0} {bap} {lf0} {mgc} {mgc}_b0 {mgc}_p_b0 '
        '{mgc}_p_mgc {mgc}_p_r0 {mgc}_r0 {cmp} weight'.format(
            ap=files['ap'], sp=files['sp'], f0=files['f0'],
            bap=files['bap'], lf0=files['lf0'], mgc=files['mgc'],
            cmp=base + '.cmp'))
    os.chdir(cur_dir)