def __init__(self):
        """Populate synthesis settings and build the model helpers.

        Network sizes and file locations are read from the ``casia``
        configuration object; signal-level constants are fixed here.
        The TensorFlow model wrapper is built only after every setting
        it may read has been stored on the instance.
        """
        # --- acoustic signal settings ---
        sample_rate = 16000
        self.sr = sample_rate
        self.alpha = pysptk.util.mcepalpha(sample_rate)
        self.fftlen = 1024
        self.frame_period = 5

        # (left width, right width, coefficients) triples; the coefficient
        # sets look like the usual static / delta / delta-delta windows.
        static_window = (0, 0, np.array([1.0]))
        first_order_window = (1, 1, np.array([-0.5, 0.0, 0.5]))
        second_order_window = (1, 1, np.array([1.0, -2.0, 1.0]))
        self.windows = [static_window, first_order_window, second_order_window]

        # First column of each stream (judging by the names, offsets into
        # the n_out-wide acoustic vector).
        self.mgc_start_idx = 0
        self.lf0_start_idx = 180
        self.vuv_start_idx = 183
        self.bap_start_idx = 184

        # --- neural network configuration ---
        # Input width and hidden sizes come from ``casia``; earlier revisions
        # hard-coded 421 inputs and six hidden layers of 1024 units.
        self.n_in = casia.n_in
        self.hidden_layer_size = casia.hidden_layer_size
        self.n_out = 187
        self.hidden_layer_type = ['tanh'] * 6

        # --- paths and dataset size (all taken from ``casia``) ---
        # norm_info_file: acoustic mean/variance statistics
        # test_norm_path: normalised linguistic features
        self.norm_info_file = casia.norm_info_file
        self.test_norm_path = casia.test_norm_path
        self.model_dir = casia.model_dir
        self.training_num = casia.training_num

        # Built last so the settings above are already in place.
        self.tensorflow_models = self.load_tensorflow_model()
        self.sequence_model = True
        self.mlpg_algo = MLParameterGeneration()
class Predict_Syn(object):
    """Generate acoustic parameters and audio from a trained acoustic model.

    The class wires together a feed-forward TensorFlow model (built by
    ``TrainTensorflowModels``), an MLPG smoother and the WORLD vocoder:

    * ``load_prev_fea`` reads normalisation statistics and test inputs,
    * ``inference`` overwrites one utterance-embedding row and runs the net,
    * ``parms_gen`` de-normalises and smooths the prediction into f0/mgc/bap,
    * ``gen_wav`` turns those parameters into WAV bytes.

    Network sizes and file locations come from the ``casia`` configuration
    object.
    """

    def __init__(self):
        """Store synthesis settings and build the model and MLPG helpers."""
        # (left width, right width, coefficients) triples; the coefficient
        # sets look like the usual static / delta / delta-delta windows.
        self.windows = [
            (0, 0, np.array([1.0])),
            (1, 1, np.array([-0.5, 0.0, 0.5])),
            (1, 1, np.array([1.0, -2.0, 1.0])),
        ]

        # Basic parameters of the acoustic features.
        self.sr = 16000
        self.alpha = pysptk.util.mcepalpha(self.sr)
        self.fftlen = 1024
        self.frame_period = 5

        # First column of each stream inside the n_out-wide output vector
        # (see the slicing in ``parms_gen``).
        self.mgc_start_idx = 0
        self.lf0_start_idx = 180
        self.vuv_start_idx = 183
        self.bap_start_idx = 184

        # Configuration of the neural network; input width and hidden sizes
        # are taken from ``casia`` (previously hard-coded to 421 inputs and
        # six hidden layers of 1024 units).
        self.n_in = casia.n_in
        self.hidden_layer_size = casia.hidden_layer_size
        self.n_out = 187
        self.hidden_layer_type = [
            'tanh', 'tanh', 'tanh', 'tanh', 'tanh', 'tanh']

        # Paths and dataset size, all taken from ``casia``:
        # norm_info_file holds the acoustic mean/variance statistics,
        # test_norm_path holds normalised linguistic features.
        self.norm_info_file = casia.norm_info_file
        self.test_norm_path = casia.test_norm_path
        self.model_dir = casia.model_dir
        self.training_num = casia.training_num

        # Built after the settings above because it reads them.
        self.tensorflow_models = self.load_tensorflow_model()

        self.mlpg_algo = MLParameterGeneration()

    def load_tensorflow_model(self,):
        """Build the model wrapper and define its feed-forward graph.

        Returns:
            The ``TrainTensorflowModels`` instance after
            ``define_feedforward_model_utt()`` has been called on it.
        """
        tensorflow_models = TrainTensorflowModels(
            self.n_in, self.hidden_layer_size, self.n_out,
            self.hidden_layer_type, self.model_dir)
        tensorflow_models.define_feedforward_model_utt()
        return tensorflow_models

    def load_prev_fea(self,):
        """Load normalisation statistics and the test input features.

        Returns:
            Tuple ``(cmp_mean_vector, cmp_std_vector, test_lin_x,
            test_lab_x)``. The first two are row 0 and row 1 of the float32
            statistics file reshaped to two rows. ``test_lin_x`` is every
            input column except the last, and ``test_lab_x`` is a column
            filled with the constant 100.
        """
        # A context manager closes the file even if reading fails.
        with open(self.norm_info_file, 'rb') as fid:
            cmp_min_max = np.fromfile(fid, dtype=np.float32)
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_mean_vector = cmp_min_max[0, ]
        cmp_std_vector = cmp_min_max[1, ]

        io_funcs = BinaryIOCollection()
        inp_features, _frame_number = io_funcs.load_binary_file_frame(
            self.test_norm_path, self.n_in)
        # Split off the last column; its values are discarded and only its
        # row count is reused below.
        test_lin_x, test_lab_x = np.hsplit(inp_features, np.array([-1]))
        # Every frame is pointed at index 100, which is fed to the network
        # as the utterance index (presumably the embedding row that
        # ``inference`` overwrites -- confirm callers pass embed_index=100).
        test_lab_x = np.tile(np.array(100), (test_lab_x.shape[0], 1))
        return cmp_mean_vector, cmp_std_vector, test_lin_x, test_lab_x

    def inference(self, z1, z2, test_lin_x, test_lab_x, embed_index):
        """Run the network with one embedding row replaced by ``[z1, z2]``.

        The latest checkpoint in ``self.tensorflow_models.ckpt_dir`` is
        restored, row ``embed_index`` of the ``utt-embeddings:0`` tensor is
        overwritten, and the output layer is evaluated on the given inputs.

        Returns:
            Tuple ``(v1_array, y_predict)``: the modified embedding table and
            the network output.
        """
        models = self.tensorflow_models
        with models.graph.as_default():
            # NOTE(review): a new Saver and a new assign op are created on
            # every call; in graph-mode TensorFlow this appears to grow the
            # graph with each call -- worth confirming and hoisting.
            new_saver = tf.train.Saver()
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                latest_ckpt = tf.train.latest_checkpoint(
                    models.ckpt_dir, latest_filename=None)
                new_saver.restore(sess, latest_ckpt)
                v1 = sess.graph.get_tensor_by_name('utt-embeddings:0')
                v1_array = sess.run(v1)
                v1_array[embed_index] = [z1, z2]
                sess.run(tf.assign(v1, v1_array))
                y_predict = sess.run(
                    models.output_layer,
                    feed_dict={
                        models.input_lin_layer: test_lin_x,
                        models.utt_index_t: test_lab_x,
                        models.is_training_batch: False,
                    })
            return v1_array, y_predict

    def parms_gen(self, z1, z2, embed_index, test_lin_x, test_lab_x, cmp_mean_vector, cmp_std_vector):
        """Predict, de-normalise and smooth the acoustic parameters.

        Returns:
            Tuple ``(v1_array, y_predict, f0, mgc, bap)``. ``v1_array`` and
            ``y_predict`` come straight from ``inference``; ``mgc``, ``bap``
            and the log-f0 track are passed through MLPG. ``f0`` is zero
            where the voiced/unvoiced column is below 0.5 and
            ``exp(log-f0)`` elsewhere.
        """
        v1_array, y_predict = self.inference(
            z1, z2, test_lin_x, test_lab_x, embed_index)
        # Undo the mean/std normalisation of the network output.
        norm_features = y_predict * cmp_std_vector + cmp_mean_vector
        T = norm_features.shape[0]

        # Split acoustic features
        mgc = norm_features[:, :self.lf0_start_idx]
        lf0 = norm_features[:, self.lf0_start_idx: self.vuv_start_idx]
        vuv = norm_features[:, self.vuv_start_idx]
        bap = norm_features[:, self.bap_start_idx:]

        # The same per-dimension variance is repeated for every frame.
        cmp_var_vector = cmp_std_vector**2
        mgc_variances = np.tile(cmp_var_vector[:self.lf0_start_idx], (T, 1))
        mgc = self.mlpg_algo.generation(mgc, mgc_variances, 60)
        lf0_variances = np.tile(
            cmp_var_vector[self.lf0_start_idx:self.vuv_start_idx], (T, 1))
        lf0 = self.mlpg_algo.generation(lf0, lf0_variances, 1)
        bap_variances = np.tile(cmp_var_vector[self.bap_start_idx:], (T, 1))
        bap = self.mlpg_algo.generation(bap, bap_variances, 1)

        f0 = lf0.copy()
        f0[vuv < 0.5] = 0
        # Exponentiate only the non-zero entries so unvoiced frames stay 0.
        voiced = np.nonzero(f0)
        f0[voiced] = np.exp(f0[voiced])
        return v1_array, y_predict, f0, mgc, bap

    @staticmethod
    def _to_int16_pcm(waveform):
        """Peak-normalise ``waveform`` and convert it to int16 samples.

        The largest absolute sample is mapped to 32767, so neither polarity
        can exceed the int16 range. Empty and all-zero inputs yield empty and
        all-zero outputs instead of dividing by zero.
        """
        samples = np.asarray(waveform, dtype=np.float64)
        if samples.size == 0:
            return samples.astype(np.int16)
        peak = np.max(np.abs(samples))
        if peak == 0:
            return np.zeros(samples.shape, dtype=np.int16)
        return (samples / peak * 32767).astype(np.int16)

    def gen_wav(self, f0, mgc, bap, wav_path="gen.wav"):
        """Synthesise a waveform and return it as WAV bytes.

        Args:
            f0: fundamental-frequency track; flattened before synthesis.
            mgc: mel-cepstral coefficients passed to ``pysptk.mc2sp``.
            bap: band aperiodicity passed to
                ``pyworld.decode_aperiodicity``.
            wav_path: file the WAV is written to and read back from
                (defaults to ``"gen.wav"`` in the working directory).

        Returns:
            Tuple ``(contents, intensity)``: the raw bytes of the written WAV
            file and ``10 * log10`` of the per-frame sum of squared
            spectrogram values.
        """
        spectrogram = pysptk.mc2sp(mgc, fftlen=self.fftlen, alpha=self.alpha)
        aperiodicity = pyworld.decode_aperiodicity(
            bap.astype(np.float64), self.sr, self.fftlen)
        generated_waveform = pyworld.synthesize(
            f0.flatten().astype(np.float64),
            spectrogram.astype(np.float64),
            aperiodicity.astype(np.float64),
            self.sr, self.frame_period)
        # Scaling by the absolute peak (not the positive maximum times 32768)
        # keeps every sample inside the int16 range.
        x2 = self._to_int16_pcm(generated_waveform)
        wavfile.write(wav_path, self.sr, x2)
        with open(wav_path, 'rb') as fd:
            contents = fd.read()
        intensity = 10 * np.log10(np.sum(spectrogram**2, axis=1))
        return contents, intensity

    def load_casia_color(self, path="/home/gyzhang/merlin/egs/casia/s1/experiments/liuchang/acoustic_model/data/metadata.tsv"):
        """Map each metadata row to a numeric emotion id.

        The file at ``path`` is tab-separated with one header line; each
        following non-blank line must have exactly three fields, the third
        being an emotion name.

        Returns:
            Float array of length ``self.training_num`` where entry ``k`` is
            the emotion id of the ``k``-th data row (rows beyond the file
            length stay 0).

        Raises:
            KeyError: if an emotion name is not one of the six known labels.
            ValueError: if a non-blank row does not have three fields.
        """
        emotion_dict = {'happy': 0, "sad": 1, "angry": 2,
                        "neutral": 3, "fear": 4, "surprise": 5}
        # Palette noted by the original author (not used by this method):
        # green, white, black, blue, magenta, yellow, red
        colors = np.zeros(self.training_num)
        with open(path, 'r') as fid:
            file_lines = fid.readlines()
        # Skip the header and ignore blank lines (e.g. a trailing newline).
        data_rows = [row.strip() for row in file_lines[1:] if row.strip()]
        for num, row in enumerate(data_rows):
            _, _, emotion = row.split("\t")
            colors[num] = emotion_dict[emotion]
        return colors
# Example #3
# 0
def dnn_generation(valid_file_list,
                   nnets_file_name,
                   n_ins,
                   n_outs,
                   out_file_list,
                   target_mean_vector,
                   target_std_vector,
                   out_dimension_dict,
                   file_extension_dict,
                   vocoder='straight'):
    """Generate vocoder parameter files from a pickled mixture-output model.

    For every input file the model's most probable mixture component is
    selected per frame, its mean and variance are de-normalised, each
    vocoder stream is smoothed with MLPG, and the result is written next to
    the corresponding entry of ``out_file_list``.

    Args:
        valid_file_list: paths of float32 binary input feature files.
        nnets_file_name: path of the pickled model; it must provide
            ``parameter_prediction_mix``, ``parameter_prediction`` and
            ``parameter_prediction_sigma``.
        n_ins: number of input features per frame.
        n_outs: number of output features per mixture component.
        out_file_list: output paths, index-aligned with ``valid_file_list``;
            only the directory and the base name without extension are used.
        target_mean_vector: mean used to de-normalise the predictions.
        target_std_vector: standard deviation used to de-normalise them.
        out_dimension_dict: stream name -> stream width. Stream offsets are
            accumulated in the dict's iteration order, so that order must
            match the layout of the output vector.
        file_extension_dict: stream name -> file extension appended to the
            output base name.
        vocoder: ``'straight'`` or ``'glotthmm'``; selects which streams are
            generated.

    Raises:
        SystemExit: if ``vocoder`` is not one of the supported names.
    """
    logger = logging.getLogger("dnn_generation")
    logger.debug('Starting dnn_generation')

    # Value written into the first F0 column of unvoiced frames.
    inf_float = -1.0e+10

    if vocoder == 'straight':
        gen_wav_features = ['mgc', 'lf0', 'bap']
    elif vocoder == 'glotthmm':
        gen_wav_features = ['F0', 'Gain', 'HNR', 'LSF',
                            'LSFsource']  ## TODO: take this from config
    else:
        sys.exit('unsupported vocoder %s !' % (vocoder))

    # Running offset of each stream inside the n_outs-wide output vector.
    stream_start_index = {}
    dimension_index = 0
    for feature_name, feature_dim in out_dimension_dict.items():
        stream_start_index[feature_name] = dimension_index
        dimension_index += feature_dim

    # NOTE(security): unpickling runs arbitrary code from the file; only
    # load model files from a trusted source.
    with open(nnets_file_name, 'rb') as model_file:
        dnn_model = cPickle.load(model_file)

    file_number = len(valid_file_list)
    io_funcs = BinaryIOCollection()

    for file_index in range(file_number):
        logger.info('generating %4d of %4d: %s',
                    file_index + 1, file_number, valid_file_list[file_index])
        with open(valid_file_list[file_index], 'rb') as fid_lab:
            features = numpy.fromfile(fid_lab, dtype=numpy.float32)
        # Drop trailing values that do not fill a whole frame. Floor division
        # keeps the slice bound an integer under true division as well.
        features = features[:(n_ins * (features.size // n_ins))]
        features = features.reshape((-1, n_ins))

        frame_number = features.shape[0]

        test_set_x = theano.shared(
            numpy.asarray(features, dtype=theano.config.floatX))

        mean_matrix = numpy.tile(target_mean_vector, (frame_number, 1))
        std_matrix = numpy.tile(target_std_vector, (frame_number, 1))

        # Index of the highest-weighted mixture component for each frame.
        predicted_mix = dnn_model.parameter_prediction_mix(
            test_set_x=test_set_x)
        max_index = numpy.argmax(predicted_mix, axis=1)

        temp_predicted_mu = dnn_model.parameter_prediction(
            test_set_x=test_set_x)
        temp_predicted_sigma = dnn_model.parameter_prediction_sigma(
            test_set_x=test_set_x)
        predicted_mu = numpy.zeros((temp_predicted_mu.shape[0], n_outs))
        predicted_sigma = numpy.zeros((temp_predicted_sigma.shape[0], n_outs))
        # Keep only the n_outs-wide block of the selected component.
        for kk in range(temp_predicted_mu.shape[0]):
            block_start = max_index[kk] * n_outs
            block_end = block_start + n_outs
            predicted_mu[kk, :] = temp_predicted_mu[kk, block_start:block_end]
            predicted_sigma[kk, :] = temp_predicted_sigma[
                kk, block_start:block_end]

        # De-normalise the mean, and the variance via its square root.
        predicted_mu = predicted_mu * std_matrix + mean_matrix
        predicted_sigma = ((predicted_sigma**0.5) * std_matrix)**2

        dir_name = os.path.dirname(out_file_list[file_index])
        file_id = os.path.splitext(
            os.path.basename(out_file_list[file_index]))[0]

        mlpg = MLParameterGenerationFast()
        for feature_name in gen_wav_features:
            stream_start = stream_start_index[feature_name]
            stream_end = stream_start + out_dimension_dict[feature_name]
            current_features = predicted_mu[:, stream_start:stream_end]
            current_sigma = predicted_sigma[:, stream_start:stream_end]

            # A third of the stream width is passed as the static dimension;
            # floor division keeps it an integer.
            gen_features = mlpg.generation(
                current_features, current_sigma,
                out_dimension_dict[feature_name] // 3)

            if feature_name in ['lf0', 'F0']:
                if 'vuv' in stream_start_index:
                    vuv_feature = predicted_mu[:, stream_start_index['vuv']:
                                               stream_start_index['vuv'] + 1]
                    # A dedicated index avoids clobbering the file index.
                    for frame_idx in range(frame_number):
                        if vuv_feature[frame_idx, 0] < 0.5:
                            gen_features[frame_idx, 0] = inf_float
            new_file_name = os.path.join(
                dir_name, file_id + file_extension_dict[feature_name])

            io_funcs.array_to_binary_file(gen_features, new_file_name)