Example #1
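This first example looks like the inference-time build() of a Tacotron-style
text-to-speech model with a bank of learned style tokens: a CBHG text
encoder, one attention module over the encoder output and a second over the
style-token bank, and a GRU decoder unrolled with tf.while_loop for a fixed
MAX_OUT_STEPS frames, feeding each step's last predicted mel frame back in
as the next input.
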
    def build(self, inp, inp_mask):

        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]

        ### Encoder [ begin ]
        with tf.variable_scope("encoder"):
            with tf.variable_scope("embedding"):
                embed_inp = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)(inp)

            with tf.variable_scope("changeToVarible"):
                self.single_style_token = tf.get_variable(
                    'style_token', (1, styles_kind, style_dim),
                    dtype=tf.float32)
                self.style_token = tf.tile(self.single_style_token,
                                           (batch_size, 1, 1))

            with tf.variable_scope("pre-net"):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    embed_inp, 256, tf.nn.relu),
                                               training=self.training)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    pre_ed_inp, 128, tf.nn.relu),
                                               training=self.training)

            with tf.variable_scope("CBHG"):
                # batch major
                encoder_output = CBHG(16,
                                      (128, 128))(pre_ed_inp,
                                                  sequence_length=inp_mask,
                                                  is_training=self.training,
                                                  time_major=False)

        with tf.variable_scope("attention"):
            att_module = AttentionModule(ATT_RNN_SIZE,
                                         encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)
        with tf.variable_scope("attention_style"):
            att_module_style = AttentionModule(STYLE_ATT_RNN_SIZE,
                                               self.style_token,
                                               time_major=False)

        with tf.variable_scope("decoder"):
            with tf.variable_scope("attentionRnn"):
                att_cell = GRUCell(ATT_RNN_SIZE)
            with tf.variable_scope("acoustic_module"):
                aco_cell = MultiRNNCell(
                    [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])

            ### prepare decoder states and output TensorArrays
            reduced_time_steps = tf.div(MAX_OUT_STEPS, self.r)  # r frames per step
            att_cell_state = att_cell.init_state(batch_size, tf.float32)
            aco_cell_state = aco_cell.zero_state(batch_size, tf.float32)
            state_tup = tuple([att_cell_state, aco_cell_state])
            output_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                      dtype=tf.float32)
            weight_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_indic = tf.zeros([batch_size, OUTPUT_MEL_DIM])

            time = tf.constant(0, dtype=tf.int32)
            cond = lambda time, *_: tf.less(time, reduced_time_steps)

            def body(time, indic, output_ta, alpha_ta, alpha_style_ta,
                     weight_ta, state_tup):
                with tf.variable_scope("att-rnn"):
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        indic, 256, tf.nn.relu),
                                                     training=self.training)
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        pre_ed_indic, 128, tf.nn.relu),
                                                     training=self.training)
                    att_cell_out, att_cell_state = att_cell(
                        tf.concat([pre_ed_indic], axis=-1), state_tup[0])
                with tf.variable_scope("attention"):
                    query = att_cell_state[0]  # att_cell_out
                    context, alpha = att_module(query)
                    alpha_ta = alpha_ta.write(time, alpha)
                with tf.variable_scope("attention_style"):
                    context_style, alpha_style = att_module_style(query)
                    alpha_style_ta = alpha_style_ta.write(time, alpha_style)
                with tf.variable_scope("weighting"):
                    weighting = add_layer(query,
                                          query.shape[-1],
                                          1,
                                          'weighting_w',
                                          'weighting_b',
                                          activation_function=tf.nn.sigmoid)

                    weight_ta = weight_ta.write(time, weighting)
                with tf.variable_scope("acoustic_module"):
                    # Blend the content context with the sigmoid-gated,
                    # tanh-squashed style context.
                    weighting_context = context + weighting * tf.nn.tanh(
                        context_style)

                    aco_input = tf.layers.dense(
                        tf.concat([att_cell_out, weighting_context], axis=-1),
                        DEC_RNN_SIZE)
                    aco_cell_out, aco_cell_state = aco_cell(
                        aco_input, state_tup[1])
                    # Predict r mel frames per step; feed the last frame back
                    # as the next step's decoder input.
                    dense_out = tf.reshape(
                        tf.layers.dense(aco_cell_out, OUTPUT_MEL_DIM * self.r),
                        shape=(batch_size, self.r, OUTPUT_MEL_DIM))
                    output_ta = output_ta.write(time, dense_out)
                    new_indic = dense_out[:, -1]
                state_tup = tuple([att_cell_state, aco_cell_state])

                return tf.add(
                    time, 1
                ), new_indic, output_ta, alpha_ta, alpha_style_ta, weight_ta, state_tup

            ### run loop
            _, _, output_mel_ta, final_alpha_ta, final_alpha_style_ta, final_weight_ta, *_ = tf.while_loop(
                cond, body, [
                    time, init_indic, output_ta, alpha_ta, alpha_style_ta,
                    weight_ta, state_tup
                ])

        ### reshape time-major loop outputs back to batch major
        with tf.variable_scope("output"):
            output_mel = tf.reshape(output_mel_ta.stack(),
                                    shape=(reduced_time_steps, batch_size,
                                           OUTPUT_MEL_DIM * self.r))
            output_mel = tf.reshape(tf.transpose(output_mel, perm=(1, 0, 2)),
                                    shape=(batch_size, MAX_OUT_STEPS,
                                           OUTPUT_MEL_DIM))
            self.out_mel = output_mel

            with tf.variable_scope("post-net"):
                output_post = CBHG(8, (256, OUTPUT_MEL_DIM))(
                    output_mel,
                    sequence_length=None,
                    is_training=self.training,
                    time_major=False)
                output_spec = tf.layers.dense(output_post, OUTPUT_SPEC_DIM)
                self.out_stftm = output_spec

            final_alpha = tf.reshape(final_alpha_ta.stack(),
                                     shape=(reduced_time_steps, batch_size,
                                            input_time_steps))
            self.final_alpha = tf.transpose(final_alpha,
                                            perm=(1, 0, 2))  # batch major

            final_alpha_style = tf.reshape(final_alpha_style_ta.stack(),
                                           shape=(reduced_time_steps,
                                                  batch_size, styles_kind))
            self.final_alpha_style = tf.transpose(final_alpha_style,
                                                  perm=(1, 0,
                                                        2))  # batch major

            final_weight_ta = tf.reshape(final_weight_ta.stack(),
                                         shape=(reduced_time_steps, batch_size,
                                                1))
            self.final_weight_ta = tf.transpose(final_weight_ta,
                                                perm=(1, 0, 2))  # batch major
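
The core of the decoder loop above is the gated blend of the two attention
contexts. Below is a minimal NumPy sketch of that computation; the helper
name and all sizes are illustrative assumptions, not the project's real
configuration.

import numpy as np

def blend_contexts(query, context, context_style, w, b):
    # Scalar sigmoid gate computed from the attention-RNN query, mirroring
    # add_layer(..., activation_function=tf.nn.sigmoid) in the loop body.
    gate = 1.0 / (1.0 + np.exp(-(query @ w + b)))   # (batch, 1)
    # context + gate * tanh(context_style), as in the decoder body above.
    return context + gate * np.tanh(context_style)

rng = np.random.default_rng(0)
batch, query_dim, ctx_dim = 2, 4, 3                 # made-up sizes
out = blend_contexts(rng.normal(size=(batch, query_dim)),
                     rng.normal(size=(batch, ctx_dim)),
                     rng.normal(size=(batch, ctx_dim)),
                     rng.normal(size=(query_dim, 1)),
                     np.zeros(1))
print(out.shape)  # (2, 3)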
Example #2
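This second example appears to be the training-time counterpart of the same
model: the decoder is teacher-forced from the ground-truth mel frames via a
TensorArray, the loop length follows the target length instead of a fixed
maximum, and the graph ends with L1 losses on the mel and linear
spectrograms plus TensorBoard summaries.
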
    def build(self, inp, inp_mask, mel_gtruth, spec_gtruth):
        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]
        output_time_steps = tf.shape(mel_gtruth)[1]

        ### Encoder [ begin ]
        with tf.variable_scope("encoder"):

            with tf.variable_scope("embedding"):
                embed_inp = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)(inp)

            with tf.variable_scope("changeToVarible"):

                self.single_style_token = tf.get_variable(
                    'style_token', (1, styles_kind, style_dim),
                    dtype=tf.float32)
                self.style_token = tf.tile(self.single_style_token,
                                           (batch_size, 1, 1))

            with tf.variable_scope("pre-net"):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    embed_inp, 256, tf.nn.relu),
                                               training=self.training)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    pre_ed_inp, 128, tf.nn.relu),
                                               training=self.training)

            with tf.variable_scope("CBHG"):
                # batch major
                encoder_output = CBHG(16,
                                      (128, 128))(pre_ed_inp,
                                                  sequence_length=inp_mask,
                                                  is_training=self.training,
                                                  time_major=False)

        with tf.variable_scope("attention"):
            att_module = AttentionModule(ATT_RNN_SIZE,
                                         encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)
        with tf.variable_scope("attention_style"):
            att_module_style = AttentionModule(STYLE_ATT_RNN_SIZE,
                                               self.style_token,
                                               time_major=False)

        with tf.variable_scope("decoder"):
            with tf.variable_scope("attentionRnn"):
                att_cell = GRUCell(ATT_RNN_SIZE)
            with tf.variable_scope("acoustic_module"):
                aco_cell = MultiRNNCell(
                    [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])

            ### prepare decoder states and output TensorArrays
            reduced_time_steps = tf.div(output_time_steps, self.r)  # r frames per step
            att_cell_state = att_cell.init_state(batch_size, tf.float32)
            aco_cell_state = aco_cell.zero_state(batch_size, tf.float32)
            state_tup = tuple([att_cell_state, aco_cell_state])
            output_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                      dtype=tf.float32)
            weight_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            # Teacher forcing: prepend r zero frames to the time-major ground
            # truth, so step t reads the last ground-truth frame of the
            # previous r-frame group (all zeros at t = 0).
            indic_ta = tf.TensorArray(size=self.r + output_time_steps,
                                      dtype=tf.float32)
            time_major_mel_gtruth = tf.transpose(mel_gtruth, perm=(1, 0, 2))
            indic_array = tf.concat([
                tf.zeros([self.r, batch_size, OUTPUT_MEL_DIM]),
                time_major_mel_gtruth
            ],
                                    axis=0)
            indic_ta = indic_ta.unstack(indic_array)

            time = tf.constant(0, dtype=tf.int32)
            cond = lambda time, *_: tf.less(time, reduced_time_steps)

            def body(time, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                     state_tup):
                with tf.variable_scope("att-rnn"):
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        indic_ta.read(self.r * time + self.r - 1), 256,
                        tf.nn.relu),
                                                     training=self.training)
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        pre_ed_indic, 128, tf.nn.relu),
                                                     training=self.training)
                    att_cell_out, att_cell_state = att_cell(
                        tf.concat([pre_ed_indic], axis=-1), state_tup[0])
                with tf.variable_scope("attention"):
                    query = att_cell_state[0]  # att_cell_out
                    context, alpha = att_module(query)
                    alpha_ta = alpha_ta.write(time, alpha)
                with tf.variable_scope("attention_style"):
                    context_style, alpha_style = att_module_style(query)
                    alpha_style_ta = alpha_style_ta.write(time, alpha_style)
                with tf.variable_scope("weighting"):
                    weighting = add_layer(query,
                                          query.shape[-1],
                                          1,
                                          'weighting_w',
                                          'weighting_b',
                                          activation_function=tf.nn.sigmoid)
                    weight_ta = weight_ta.write(time, weighting)

                with tf.variable_scope("acoustic_module"):
                    # weighting0 = tf.reshape(weighting[:, 0], (BATCH_SIZE, 1))
                    # weighting1 = tf.reshape(weighting[:, 1], (BATCH_SIZE, 1))
                    weighting_context = context + weighting * tf.nn.tanh(
                        context_style)
                    # print(weighting_context)
                    aco_input = tf.layers.dense(
                        tf.concat([att_cell_out, weighting_context], axis=-1),
                        DEC_RNN_SIZE)
                    aco_cell_out, aco_cell_state = aco_cell(
                        aco_input, state_tup[1])
                    # Predict r mel frames per step (kept flattened as
                    # r * OUTPUT_MEL_DIM until the final reshape).
                    dense_out = tf.layers.dense(aco_cell_out,
                                                OUTPUT_MEL_DIM * self.r)
                    output_ta = output_ta.write(time, dense_out)
                state_tup = tuple([att_cell_state, aco_cell_state])

                return tf.add(
                    time, 1
                ), output_ta, alpha_ta, alpha_style_ta, weight_ta, state_tup

            ### run loop
            _, output_mel_ta, final_alpha_ta, final_alpha_style_ta, final_weight_ta, *_ = tf.while_loop(
                cond, body, [
                    time, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                    state_tup
                ])
        ### reshape time-major loop outputs back to batch major
        with tf.variable_scope("output"):
            output_mel = tf.reshape(output_mel_ta.stack(),
                                    shape=(reduced_time_steps, batch_size,
                                           OUTPUT_MEL_DIM * self.r))
            output_mel = tf.reshape(tf.transpose(output_mel, perm=(1, 0, 2)),
                                    shape=(batch_size, output_time_steps,
                                           OUTPUT_MEL_DIM))
            self.out_mel = output_mel

            with tf.variable_scope("post-net"):
                output_post = CBHG(8, (256, OUTPUT_MEL_DIM))(
                    output_mel,
                    sequence_length=None,
                    is_training=self.training,
                    time_major=False)
                output_spec = tf.layers.dense(output_post, OUTPUT_SPEC_DIM)
                self.out_stftm = output_spec

            final_alpha = tf.reshape(final_alpha_ta.stack(),
                                     shape=(reduced_time_steps, batch_size,
                                            input_time_steps))
            final_alpha = tf.transpose(final_alpha,
                                       perm=(1, 0, 2))  # batch major

            final_alpha_style = tf.reshape(final_alpha_style_ta.stack(),
                                           shape=(reduced_time_steps,
                                                  batch_size, styles_kind))
            final_alpha_style = tf.transpose(final_alpha_style,
                                             perm=(1, 0, 2))  # batch major

            final_weight_ta = tf.reshape(final_weight_ta.stack(),
                                         shape=(reduced_time_steps, batch_size,
                                                1))
            final_weight_ta = tf.transpose(final_weight_ta,
                                           perm=(1, 0, 2))  # batch major
            self.weighting = final_weight_ta

        with tf.variable_scope("loss_and_metric"):
            self.loss_mel = tf.reduce_mean(tf.abs(mel_gtruth - output_mel))
            self.loss_spec = tf.reduce_mean(tf.abs(spec_gtruth - output_spec))
            self.loss = self.loss_mel + self.loss_spec
            self.alpha_img = tf.expand_dims(final_alpha, -1)
            self.alpha_style_img = tf.expand_dims(final_alpha_style, -1)
            self.weight_img = tf.expand_dims(final_weight_ta, -1)

            self.sums = []
            self.sums.append(
                tf.summary.image("train/alpha", self.alpha_img[:2]))
            self.sums.append(
                tf.summary.image("train/alpha_style",
                                 self.alpha_style_img[:2]))
            self.sums.append(
                tf.summary.image("train/weight", self.weight_img[:2]))
            self.sums.append(tf.summary.scalar("train/loss", self.loss))
            self.sums.append(
                tf.summary.scalar("train/style_0_0",
                                  self.single_style_token[0][0][0]))
            self.sums.append(
                tf.summary.scalar("train/style_0_100",
                                  self.single_style_token[0][0][100]))
            self.sums.append(
                tf.summary.scalar("train/style_5_100",
                                  self.single_style_token[0][5][100]))
            self.sums.append(
                tf.summary.histogram("train/style_vec",
                                     self.single_style_token))

            self.pred_audio_holder = tf.placeholder(shape=(None, None),
                                                    dtype=tf.float32,
                                                    name='pred_audio')
            self.pred_audio_summary = tf.summary.audio('pred_audio_summary',
                                                       self.pred_audio_holder,
                                                       sample_rate=sr,
                                                       max_outputs=12)
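
Both examples finish with the same "reduction factor" trick: the loop emits
r frames per step, and the stacked time-major TensorArray is transposed and
reshaped back to one mel frame per time step. A standalone NumPy sketch with
made-up sizes:

import numpy as np

reduced_time_steps, batch_size, r, mel_dim = 4, 2, 5, 80   # made-up sizes
# Stand-in for output_mel_ta.stack(): (reduced_T, B, r * D).
stacked = np.zeros((reduced_time_steps, batch_size, r * mel_dim),
                   dtype=np.float32)

# (reduced_T, B, r*D) -> (B, reduced_T, r*D) -> (B, reduced_T*r, D),
# matching the transpose + reshape in the "output" scope.
batch_major = stacked.transpose(1, 0, 2).reshape(
    batch_size, reduced_time_steps * r, mel_dim)
print(batch_major.shape)  # (2, 20, 80)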