Example #1
    def _create_iaf(self, inputs, iaf_idx, init):
        num_stages = self.hparams.num_stages
        num_layers = self.hparams.num_iaf_layers[iaf_idx]
        filter_length = self.hparams.filter_length
        width = self.hparams.width
        out_width = self.out_width
        deconv_width = self.hparams.deconv_width
        deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]
        use_weight_norm = self.use_weight_norm
        use_resize_conv = self.use_resize_conv
        upsample_act = self.upsample_act
        gate_width = width
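        # Final-layer initializer and bias for this flow, chosen by a helper
        # defined elsewhere in the repository.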
        final_init, final_bias = PWNHelper.manual_finit_or_not_fn(
            init, iaf_idx)

        mel = inputs['mel']
        x = inputs['x']

        iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

        mel_en = wavenet._deconv_stack(mel,
                                       deconv_width,
                                       deconv_config,
                                       act=upsample_act,
                                       use_resize_conv=use_resize_conv,
                                       name=iaf_name,
                                       use_weight_norm=use_weight_norm,
                                       init=init)

        l = masked.shift_right(x)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=filter_length,
                          name='{}/start_conv'.format(iaf_name),
                          use_weight_norm=use_weight_norm,
                          init=init)

        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(l,
                              num_filters=gate_width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='{}/dilated_conv_{:d}'.format(
                                  iaf_name, i + 1),
                              use_weight_norm=use_weight_norm,
                              init=init)
            c = masked.conv1d(mel_en,
                              num_filters=gate_width,
                              filter_length=1,
                              name='{}/mel_cond_{:d}'.format(iaf_name, i + 1),
                              use_weight_norm=use_weight_norm,
                              init=init)
            d = wavenet._condition(d, c)

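            # Gated activation unit: the first half of the channels (sigmoid)
            # gates the second half (tanh), as in the WaveNet paper.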
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            l += masked.conv1d(d,
                               num_filters=width,
                               filter_length=1,
                               name='{}/res_{:d}'.format(iaf_name, i + 1),
                               use_weight_norm=use_weight_norm,
                               init=init)

        l = tf.nn.relu(l)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=1,
                          name='{}/out1'.format(iaf_name),
                          use_weight_norm=use_weight_norm,
                          init=init)
        c = masked.conv1d(mel_en,
                          num_filters=width,
                          filter_length=1,
                          name='{}/mel_cond_out1'.format(iaf_name),
                          use_weight_norm=use_weight_norm,
                          init=init)
        l = wavenet._condition(l, c)
        l = tf.nn.relu(l)

        mean = masked.conv1d(l,
                             num_filters=out_width // 2,
                             filter_length=1,
                             name='{}/out2_mean'.format(iaf_name),
                             use_weight_norm=use_weight_norm,
                             init=final_init)
        scale_params = masked.conv1d(
            l,
            num_filters=out_width // 2,
            filter_length=1,
            name='{}/out2_scale'.format(iaf_name),
            use_weight_norm=use_weight_norm,
            init=final_init,
            biases_initializer=tf.constant_initializer(final_bias))

        scale, log_scale = PWNHelper.scale_log_scale_fn(scale_params)
        new_x = x * scale + mean

        if DETAIL_LOG:
            tf.summary.scalar('scale_{}'.format(iaf_idx),
                              tf.reduce_mean(scale))
            tf.summary.scalar('log_scale_{}'.format(iaf_idx),
                              tf.reduce_mean(log_scale))
            tf.summary.scalar('mean_{}'.format(iaf_idx), tf.reduce_mean(mean))

        return {
            'x': new_x,
            'mean': mean,
            'scale': scale,
            'log_scale': log_scale
        }
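
The helpers `PWNHelper.scale_log_scale_fn` and `PWNHelper.manual_finit_or_not_fn` are defined elsewhere in the repository. As a sketch only, `scale_log_scale_fn` could follow the log-scale clipping that Example #3 applies explicitly:

    def scale_log_scale_fn(scale_params):
        # Interpret the conv output as log_scale; clip it so exp() stays in a
        # numerically safe range, then recover the positive scale.
        log_scale = tf.clip_by_value(scale_params, -9.0, 7.0)
        scale = tf.exp(log_scale)
        return scale, log_scale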
Example #2
    def feed_forward(self, inputs, init=False):
        """Build the graph for this configuration.

        Args:
          inputs: A dict of inputs. For training, should contain 'mel' and
            'wav_scaled'.
          init: If True, run data-dependent initialization (used with
            weight normalization).

        Returns:
          A dict of outputs containing the mel 'encoding' and the output
          distribution parameters 'out_params'.
        """
        use_weight_norm = self.use_weight_norm
        num_stages = self.hparams.num_stages
        num_layers = self.hparams.num_layers
        filter_length = self.hparams.filter_length
        width = self.hparams.width
        skip_width = self.hparams.skip_width
        out_width = self.out_width
        use_dropout = self.use_dropout
        use_as_teacher = self.use_as_teacher
        # In the parallel WaveNet paper, the gate width is the same as the
        # residual width, not double it.
        gate_width = 2 * width if self.double_gate_width else width
        dropout_training = not use_as_teacher

        ###
        # The transposed convolution stack for the mel features.
        ###
        # wavenet inputs <- trans_conv (l2, s2) <- trans_conv (l1, s1) <- mel_ceps
        # win_len: l1 * s2 + (l2 - s2); win_shift: s1 * s2
        # (l1, s1) = (40, 10), (l2, s2) = (80, 20) is a suitable configuration;
        # it is almost consistent with the mel analysis frame shift (200) and
        # frame length (800).
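        # For example, with (40, 10) and (80, 20): win_shift = 10 * 20 = 200,
        # matching the mel frame shift exactly, and
        # win_len = 40 * 20 + (80 - 20) = 860, close to the 800-sample frame.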
        mel = inputs['mel']
        ds_dict = self.deconv_stack({'mel': mel}, init=init)
        mel_en = ds_dict['encoding']

        x_scaled = inputs['wav_scaled']
        x_scaled = tf.expand_dims(x_scaled, 2)

        ###
        # The WaveNet Decoder.
        ###
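        # Shift the waveform right by one step so the prediction at time t
        # depends only on samples strictly before t.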
        l = masked.shift_right(x_scaled)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=filter_length,
                          name='conv_start',
                          use_weight_norm=use_weight_norm,
                          init=init)
        if use_dropout:
            l = tf.layers.dropout(l,
                                  rate=0.2,
                                  training=dropout_training,
                                  name='conv_dropout')

        # Set up skip connections.
        s = masked.conv1d(l,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start',
                          use_weight_norm=use_weight_norm,
                          init=init)

        ###
        # Residual blocks with skip connections.
        ###
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(l,
                              num_filters=gate_width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilated_conv_%d' % (i + 1),
                              use_weight_norm=use_weight_norm,
                              init=init)
            c = masked.conv1d(mel_en,
                              num_filters=gate_width,
                              filter_length=1,
                              name='mel_cond_%d' % (i + 1),
                              use_weight_norm=use_weight_norm,
                              init=init)
            d = _condition(d, c)

            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            l += masked.conv1d(d,
                               num_filters=width,
                               filter_length=1,
                               name='res_%d' % (i + 1),
                               use_weight_norm=use_weight_norm,
                               init=init)
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1),
                               use_weight_norm=use_weight_norm,
                               init=init)

            if use_dropout:
                l = tf.layers.dropout(l,
                                      rate=0.2,
                                      training=dropout_training,
                                      name='res_dropout_%d' % (i + 1))

        s = tf.nn.relu(s)
        s = masked.conv1d(s,
                          num_filters=skip_width,
                          filter_length=1,
                          name='out1',
                          use_weight_norm=use_weight_norm,
                          init=init)
        c = masked.conv1d(mel_en,
                          num_filters=skip_width,
                          filter_length=1,
                          name='mel_cond_out1',
                          use_weight_norm=use_weight_norm,
                          init=init)
        s = _condition(s, c)
        s = tf.nn.relu(s)
        out = masked.conv1d(s,
                            num_filters=out_width,
                            filter_length=1,
                            name='out2',
                            use_weight_norm=use_weight_norm,
                            init=init)

        return {'encoding': mel_en, 'out_params': out}
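
`_condition` (referenced as `wavenet._condition` in the IAF examples) is defined elsewhere. Since the mel encoding has already been upsampled to the waveform rate by the transposed-convolution stack, a minimal sketch, assuming time-aligned [batch, time, channels] tensors, is a plain element-wise addition:

    def _condition(x, encoding):
        # x and encoding are both [batch, time, channels] and time-aligned,
        # so conditioning reduces to adding the conditioning activations.
        return x + encoding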
Example #3
    def _create_iaf(self, inputs, iaf_idx):
        num_stages = self.hparams.num_stages
        num_layers = self.hparams.num_iaf_layers[iaf_idx]
        filter_length = self.hparams.filter_length
        width = self.hparams.width
        out_width = self.out_width
        deconv_width = self.hparams.deconv_width
        deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]
        use_log_scale = getattr(self.hparams, 'use_log_scale', True)

        mel = inputs['mel']
        x = inputs['x']

        iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

        mel_en = wavenet._deconv_stack(
            mel, deconv_width, deconv_config, name=iaf_name)

        l = masked.shift_right(x)
        l = masked.conv1d(l, num_filters=width, filter_length=filter_length,
                          name='{}/start_conv'.format(iaf_name))

        for i in range(num_layers):
            dilation = 2 ** (i % num_stages)
            d = masked.conv1d(
                l,
                num_filters=2 * width,
                filter_length=filter_length,
                dilation=dilation,
                name='{}/dilated_conv_{:d}'.format(iaf_name, i + 1))
            c = masked.conv1d(
                mel_en,
                num_filters=2 * width,
                filter_length=1,
                name='{}/mel_cond_{:d}'.format(iaf_name, i + 1))
            d = wavenet._condition(d, c)

            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            l += masked.conv1d(d, num_filters=width, filter_length=1,
                               name='{}/res_{:d}'.format(iaf_name, i + 1))

        l = tf.nn.relu(l)
        l = masked.conv1d(l, num_filters=width, filter_length=1,
                          name='{}/out1'.format(iaf_name))
        c = masked.conv1d(mel_en, num_filters=width, filter_length=1,
                          name='{}/mel_cond_out1'.format(iaf_name))
        l = wavenet._condition(l, c)
        l = tf.nn.relu(l)

        # Keep the scale in a reasonably small range when use_log_scale=True.
        final_kernel_init = (tf.truncated_normal_initializer(0., 0.01) if use_log_scale
                             else tf.uniform_unit_scaling_initializer(1.0))
        out = masked.conv1d(l, num_filters=out_width, filter_length=1,
                            name='{}/out2'.format(iaf_name),
                            kernel_initializer=final_kernel_init)
        mean, scale_params = tf.split(out, num_or_size_splits=2, axis=2)
        if use_log_scale:
            log_scale = tf.clip_by_value(scale_params, -9.0, 7.0)
            scale = tf.exp(log_scale)
        else:
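            # softplus enforces positivity; the clip bounds match the
            # log-scale branch: [exp(-9), exp(7)].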
            scale_params = tf.nn.softplus(scale_params)
            scale = tf.clip_by_value(scale_params, tf.exp(-9.0), tf.exp(7.0))
            log_scale = tf.log(scale)
        new_x = x * scale + mean

        if DETAIL_LOG:
            tf.summary.scalar('scale_{}'.format(iaf_idx), tf.reduce_mean(scale))
            tf.summary.scalar('log_scale_{}'.format(iaf_idx), tf.reduce_mean(log_scale))
            tf.summary.scalar('mean_{}'.format(iaf_idx), tf.reduce_mean(mean))

        return {'x': new_x,
                'mean': mean,
                'scale': scale,
                'log_scale': log_scale}
Example #4
    def _create_iaf(self, inputs, iaf_idx):
        num_stages = self.hparams.num_stages
        num_layers = self.hparams.num_iaf_layers[iaf_idx]
        filter_length = self.hparams.filter_length
        width = self.hparams.width
        out_width = self.out_width
        deconv_width = self.hparams.deconv_width
        deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]

        mel = inputs['mel']
        x = inputs['x']

        iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

        mel_en = wavenet._deconv_stack(mel,
                                       deconv_width,
                                       deconv_config,
                                       name=iaf_name)

        l = masked.shift_right(x)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=filter_length,
                          name='{}/start_conv'.format(iaf_name))

        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(l,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='{}/dilated_conv_{:d}'.format(
                                  iaf_name, i + 1))
            c = masked.conv1d(mel_en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='{}/mel_cond_{:d}'.format(iaf_name, i + 1))
            d = wavenet._condition(d, c)

            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            l += masked.conv1d(d,
                               num_filters=width,
                               filter_length=1,
                               name='{}/res_{:d}'.format(iaf_name, i + 1))

        l = tf.nn.relu(l)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=1,
                          name='{}/out1'.format(iaf_name))
        c = masked.conv1d(mel_en,
                          num_filters=width,
                          filter_length=1,
                          name='{}/mel_cond_out1'.format(iaf_name))
        l = wavenet._condition(l, c)
        l = tf.nn.relu(l)
        out = masked.conv1d(l,
                            num_filters=out_width,
                            filter_length=1,
                            name='{}/out2'.format(iaf_name))
        mean, scale = tf.split(out, num_or_size_splits=2, axis=2)
        scale = tf.clip_by_value(scale, tf.exp(-7.0), tf.exp(7.0))
        new_x = x * scale + mean
        return {'x': new_x, 'mean': mean, 'scale': scale}
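
`masked.shift_right`, used at the start of every example, implements the causal shift. A self-contained sketch, assuming a [batch, time, channels] input (an assumption, not the repository's actual helper), could be:

    def shift_right(x):
        # Prepend one zero timestep and drop the last one, so the output at
        # time t holds the input at time t - 1 (causality for the decoder).
        x_padded = tf.pad(x, [[0, 0], [1, 0], [0, 0]])
        return x_padded[:, :-1, :]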
Example #5
    def feed_forward(self, inputs):
        """Build the graph for this configuration.

        Args:
          inputs: A dict of inputs. For training, should contain 'mel' and 'wav'.

        Returns:
          A dict of outputs containing the 'real_targets', the 'cate_targets',
          the mel 'encoding', and the output distribution parameters 'out_params'.
        """
        num_stages = self.hparams.num_stages
        num_layers = self.hparams.num_layers
        filter_length = self.hparams.filter_length
        width = self.hparams.width
        skip_width = self.hparams.skip_width
        use_mu_law = self.use_mu_law
        quant_chann = self.quant_chann
        out_width = self.out_width

        ###
        # The transposed convolution stack for the mel features.
        ###
        # wavenet inputs <- trans_conv (l2, s2) <- trans_conv (l1, s1) <- mel_ceps
        # win_len: l1 * s2 + (l2 - s2); win_shift: s1 * s2
        # (l1, s1) = (40, 10), (l2, s2) = (80, 20) is a suitable configuration;
        # it is almost consistent with the mel analysis frame shift (200) and
        # frame length (800).
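        # With (40, 10) and (80, 20): win_shift = 200 and win_len = 860, as
        # worked out in Example #2.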
        mel = inputs['mel']
        ds_dict = self.deconv_stack({'mel': mel})
        mel_en = ds_dict['encoding']

        ###
        # Encode the source with 8-bit mu-law, or just use the 16-bit signal.
        ###
        x = inputs['wav']
        if use_mu_law:
            x_quantized = utils.mu_law(x)
            x_scaled = tf.cast(x_quantized, tf.float32) / (quant_chann / 2.)
            real_targets = x_scaled
            cate_targets = tf.cast(x_quantized, tf.int32) + tf.cast(
                quant_chann / 2., tf.int32)
        else:
            x_quantized = utils.cast_quantize(x, quant_chann)
            x_scaled = x
            real_targets = x
            cate_targets = tf.cast(x_quantized, tf.int32) + tf.cast(
                quant_chann / 2., tf.int32)
        x_scaled = tf.expand_dims(x_scaled, 2)

        ###
        # The WaveNet Decoder.
        ###
        l = masked.shift_right(x_scaled)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=filter_length,
                          name='startconv')

        # Set up skip connections.
        s = masked.conv1d(l,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(l,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilated_conv_%d' % (i + 1))
            c = masked.conv1d(mel_en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='mel_cond_%d' % (i + 1))
            d = _condition(d, c)

            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            l += masked.conv1d(d,
                               num_filters=width,
                               filter_length=1,
                               name='res_%d' % (i + 1))
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))

        s = tf.nn.relu(s)
        s = masked.conv1d(s,
                          num_filters=skip_width,
                          filter_length=1,
                          name='out1')
        c = masked.conv1d(mel_en,
                          num_filters=skip_width,
                          filter_length=1,
                          name='mel_cond_out1')
        s = _condition(s, c)
        s = tf.nn.relu(s)
        # When using MoL (mixture of logistics) loss, the model always predicts
        # log_scale; this initializer keeps log_scale in a reasonably small
        # range to speed up convergence.
        final_kernel_init = (tf.truncated_normal_initializer(0.0, 0.01)
                             if self.loss_type == 'mol' else
                             tf.uniform_unit_scaling_initializer(1.0))
        out = masked.conv1d(s,
                            num_filters=out_width,
                            filter_length=1,
                            name='out2',
                            kernel_initializer=final_kernel_init)

        return {
            'real_targets': real_targets,
            'cate_targets': cate_targets,
            'encoding': mel_en,
            'out_params': out
        }
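
`utils.mu_law` is the standard mu-law companding from the WaveNet paper. A sketch of an 8-bit version consistent with the `quant_chann / 2.` scaling above (an assumption about the actual repository helper):

    def mu_law(x, mu=255):
        # Compand x in [-1, 1] into roughly 256 integer levels in [-128, 128].
        out = tf.sign(x) * tf.log(1.0 + mu * tf.abs(x)) / tf.log(1.0 + mu)
        return tf.floor(out * 128)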