Example #1

# Context assumed by the examples below (not shown on the original page):
# the TensorFlow 1.x graph-mode API and a WaveNet helper module providing
# mu_law, inv_mu_law, conv1d, pool1d, shift_right, linear and
# causal_linear. The exact import path is an assumption.
import tensorflow as tf
from cadl import wavenet_utils as wnu  # assumed helper module
def create_wavenet_autoencoder(n_stages, n_layers_per_stage, n_hidden,
                               batch_size, n_skip, filter_length,
                               bottleneck_width, hop_length, n_quantization,
                               sample_rate):
    """Summary

    Parameters
    ----------
    n_stages : TYPE
        Description
    n_layers_per_stage : TYPE
        Description
    n_hidden : TYPE
        Description
    batch_size : TYPE
        Description
    n_skip : TYPE
        Description
    filter_length : TYPE
        Description
    bottleneck_width : TYPE
        Description
    hop_length : TYPE
        Description
    n_quantization : TYPE
        Description
    sample_rate : TYPE
        Description

    Returns
    -------
    TYPE
        Description
    """
    offset = n_quantization / 2.0
    # With filter_length=2, the receptive field of the dilated stack is
    # roughly n_stages * 2**n_layers_per_stage samples; train on
    # sequences twice that long.
    sequence_length = 2**n_layers_per_stage * 2 * n_stages

    # Encode the source with 8-bit Mu-Law.
    X = tf.placeholder(
        name='X', shape=[batch_size, sequence_length], dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
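    # X_quantized lies in [-offset, offset); dividing by offset rescales
    # the signal to [-1, 1] for the encoder input.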
    X_scaled = tf.cast(X_quantized / offset, tf.float32)
    X_scaled = tf.expand_dims(X_scaled, 2)

    # The Non-Causal Temporal Encoder.
    en = wnu.conv1d(
        X=X_scaled,
        causal=False,
        num_filters=n_hidden,
        filter_length=filter_length,
        name='ae_startconv')

    # Residual blocks with skip connections.
    for i in range(n_stages * n_layers_per_stage):
        dilation = 2**(i % n_layers_per_stage)
        d = tf.nn.relu(en)
        d = wnu.conv1d(
            d,
            causal=False,
            num_filters=n_hidden,
            filter_length=filter_length,
            dilation=dilation,
            name='ae_dilatedconv_%d' % (i + 1))
        d = tf.nn.relu(d)
        en += wnu.conv1d(
            d,
            num_filters=n_hidden,
            filter_length=1,
            name='ae_res_%d' % (i + 1))

    en = wnu.conv1d(
        en, num_filters=bottleneck_width, filter_length=1, name='ae_bottleneck')

    en = wnu.pool1d(en, hop_length, name='ae_pool', mode='avg')
    encoding = en

    # The WaveNet Decoder.
    l = wnu.shift_right(X_scaled)
    l = wnu.conv1d(
        l, num_filters=n_hidden, filter_length=filter_length, name='startconv')

    # Set up skip connections.
    s = wnu.conv1d(l, num_filters=n_skip, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(n_stages * n_layers_per_stage):
        dilation = 2**(i % n_layers_per_stage)
        d = wnu.conv1d(
            l,
            num_filters=2 * n_hidden,
            filter_length=filter_length,
            dilation=dilation,
            name='dilatedconv_%d' % (i + 1))
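        # condition() (defined alongside these helpers) is assumed to
        # upsample the pooled encoding in time and add it to the layer
        # activations.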
        d = condition(d,
                      wnu.conv1d(
                          en,
                          num_filters=2 * n_hidden,
                          filter_length=1,
                          name='cond_map_%d' % (i + 1)))
        # Gated activation: half the channels gate (sigmoid) the other
        # half (tanh).
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh
        l += wnu.conv1d(
            d, num_filters=n_hidden, filter_length=1, name='res_%d' % (i + 1))
        s += wnu.conv1d(
            d, num_filters=n_skip, filter_length=1, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = wnu.conv1d(s, num_filters=n_skip, filter_length=1, name='out1')
    s = condition(s,
                  wnu.conv1d(
                      en,
                      num_filters=n_skip,
                      filter_length=1,
                      name='cond_map_out1'))
    s = tf.nn.relu(s)

    # Compute the logits and get the loss.
    logits = wnu.conv1d(
        s, num_filters=n_quantization, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])
    probs = tf.nn.softmax(logits, name='softmax')
    synthesis = tf.reshape(
        wnu.inv_mu_law(
            tf.cast(tf.argmax(probs, 1), tf.float32) - offset, n_quantization),
        [-1, sequence_length])
    labels = tf.cast(tf.reshape(X_quantized, [-1]), tf.int32) + int(offset)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels, name='nll'),
        0,
        name='loss')

    tf.summary.audio("synthesis", synthesis, sample_rate=sample_rate)
    tf.summary.histogram("probs", probs)
    tf.summary.histogram("input_quantized", X_quantized)
    tf.summary.histogram("logits", logits)
    tf.summary.histogram("labels", labels)
    tf.summary.histogram("synthesis", synthesis)
    tf.summary.scalar("loss", loss)
    summaries = tf.summary.merge_all()

    return {
        'X': X,
        'quantized': X_quantized,
        'encoding': encoding,
        'probs': probs,
        'synthesis': synthesis,
        'summaries': summaries,
        'loss': loss
    }
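
A minimal sketch of building the autoencoder, taking one training step, and
inspecting the bottleneck encoding. Assumptions: TensorFlow 1.x; the
hyperparameter values are illustrative; random noise stands in for a real
[batch_size, sequence_length] batch of audio in [-1, 1].

import numpy as np
import tensorflow as tf

net = create_wavenet_autoencoder(
    n_stages=10, n_layers_per_stage=9, n_hidden=128, batch_size=4,
    n_skip=128, filter_length=2, bottleneck_width=16, hop_length=512,
    n_quantization=256, sample_rate=16000)
opt = tf.train.AdamOptimizer(1e-4).minimize(net['loss'])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # sequence_length = 2**9 * 2 * 10 = 10240 for these settings.
    batch = np.random.uniform(-1, 1, (4, 10240)).astype(np.float32)
    loss, enc, _ = sess.run([net['loss'], net['encoding'], opt],
                            feed_dict={net['X']: batch})
    # enc has shape [4, 10240 // 512, 16], i.e.
    # [batch_size, sequence_length // hop_length, bottleneck_width].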
Example #2
def create_wavenet(n_stages=10,
                   n_layers_per_stage=9,
                   n_hidden=200,
                   batch_size=32,
                   n_skip=100,
                   filter_length=2,
                   shift=True,
                   n_quantization=256,
                   sample_rate=16000):
    """Summary

    Parameters
    ----------
    n_stages : int, optional
        Description
    n_layers_per_stage : int, optional
        Description
    n_hidden : int, optional
        Description
    batch_size : int, optional
        Description
    n_skip : int, optional
        Description
    filter_length : int, optional
        Description
    shift : bool, optional
        Description
    n_quantization : int, optional
        Description
    sample_rate : int, optional
        Description

    Returns
    -------
    TYPE
        Description
    """
    offset = n_quantization / 2.0
    sequence_length = 2**n_layers_per_stage * 2 * n_stages

    # Encode the source with 8-bit Mu-Law.
    X = tf.placeholder(
        name='X', shape=[batch_size, sequence_length], dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
    X_onehot = tf.expand_dims(X_quantized, 2)
    if shift:
        X_onehot = wnu.shift_right(X_onehot)

    h = wnu.conv1d(
        X=X_onehot,
        num_filters=n_hidden,
        filter_length=filter_length,
        name='startconv')

    # Set up skip connections.
    s = wnu.conv1d(X=h, num_filters=n_skip, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(n_stages * n_layers_per_stage):
        dilation = 2**(i % n_layers_per_stage)

        # dilated masked cnn
        d = wnu.conv1d(
            X=h,
            num_filters=2 * n_hidden,
            filter_length=filter_length,
            dilation=dilation,
            name='dilatedconv_%d' % (i + 1))

        # gated cnn
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d = tf.sigmoid(d[:, :, :m]) * tf.tanh(d[:, :, m:])

        # residuals
        h += wnu.conv1d(
            X=d, num_filters=n_hidden, filter_length=1, name='res_%d' % (i + 1))

        # skips
        s += wnu.conv1d(
            X=d, num_filters=n_skip, filter_length=1, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = wnu.conv1d(X=s, num_filters=n_skip, filter_length=1, name='out1')
    s = tf.nn.relu(s)
    logits = tf.clip_by_value(
        wnu.conv1d(
            X=s,
            num_filters=n_quantization,
            filter_length=1,
            name='logits_preclip') + offset,
        0.0,
        n_quantization - 1.0,
        name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])
    labels = tf.cast(tf.reshape(X_quantized + offset, [-1]), tf.int32)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels, name='nll'),
        0,
        name='loss')

    probs = tf.nn.softmax(logits, name='softmax')
    synthesis = tf.reshape(
        wnu.inv_mu_law(
            tf.cast(tf.argmax(probs, 1), tf.float32) - offset, n_quantization),
        [-1, sequence_length])

    tf.summary.audio("synthesis", synthesis, sample_rate=sample_rate)
    tf.summary.histogram("probs", probs)
    tf.summary.histogram("input_quantized", X_quantized)
    tf.summary.histogram("logits", logits)
    tf.summary.histogram("labels", labels)
    tf.summary.histogram("synthesis", synthesis)
    tf.summary.scalar("loss", loss)
    summaries = tf.summary.merge_all()

    return {
        'X': X,
        'quantized': X_quantized,
        'probs': probs,
        'synthesis': synthesis,
        'summaries': summaries,
        'loss': loss
    }
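
A minimal sketch of training this decoder-only model while logging the
merged summaries. Assumptions: TensorFlow 1.x; audio_batches is a
hypothetical iterable yielding float32 arrays of shape
[batch_size, sequence_length] (with the defaults, [32, 10240]) in [-1, 1].

import tensorflow as tf

net = create_wavenet()  # default hyperparameters
opt = tf.train.AdamOptimizer(1e-4).minimize(net['loss'])
writer = tf.summary.FileWriter('./logs', tf.get_default_graph())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step, batch in enumerate(audio_batches):  # hypothetical loader
        _, loss, summaries = sess.run(
            [opt, net['loss'], net['summaries']],
            feed_dict={net['X']: batch})
        writer.add_summary(summaries, global_step=step)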
Example #3
def create_generation_model(n_stages=5, n_layers_per_stage=10,
                            n_hidden=256, batch_size=1, n_skip=128,
                            n_quantization=256, filter_length=2,
                            onehot=False):
    """Summary

    Parameters
    ----------
    n_stages : int, optional
        Description
    n_layers_per_stage : int, optional
        Description
    n_hidden : int, optional
        Description
    batch_size : int, optional
        Description
    n_skip : int, optional
        Description
    n_quantization : int, optional
        Description
    filter_length : int, optional
        Description
    onehot : bool, optional
        Description

    Returns
    -------
    TYPE
        Description
    """
    offset = n_quantization / 2.0

    # Encode the source with 8-bit Mu-Law.
    X = tf.placeholder(name='X', shape=[None, None], dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
    if onehot:
        X_onehot = tf.one_hot(
            tf.cast(X_quantized + offset, tf.int32),
            n_quantization)
    else:
        X_onehot = tf.expand_dims(X_quantized, 2)

    push_ops, init_ops = [], []
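    # For incremental generation each causal layer keeps a buffer of its
    # past activations: init_ops are assumed to fill these buffers with
    # zeros, and push_ops to advance them by one sample per step.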
    h, init, push = wnu.causal_linear(
        X=X_onehot,
        n_inputs=n_quantization if onehot else 1,  # one-hot depth matches n_quantization
        n_outputs=n_hidden,
        name='startconv',
        rate=1,
        batch_size=batch_size,
        filter_length=filter_length)
    init_ops.extend(init)
    push_ops.extend(push)

    # Set up skip connections.
    s = wnu.linear(h, n_hidden, n_skip, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(n_stages * n_layers_per_stage):
        dilation = 2**(i % n_layers_per_stage)

        # dilated masked cnn
        d, init, push = wnu.causal_linear(
            X=h,
            n_inputs=n_hidden,
            n_outputs=n_hidden * 2,
            name='dilatedconv_%d' % (i + 1),
            rate=dilation,
            batch_size=batch_size,
            filter_length=filter_length)
        init_ops.extend(init)
        push_ops.extend(push)

        # gated cnn
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d = tf.sigmoid(d[:, :, :m]) * tf.tanh(d[:, :, m:])

        # residuals
        h += wnu.linear(d, n_hidden, n_hidden, name='res_%d' % (i + 1))

        # skips
        s += wnu.linear(d, n_hidden, n_skip, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = wnu.linear(s, n_skip, n_skip, name='out1')
    s = tf.nn.relu(s)
    logits = tf.clip_by_value(
        wnu.linear(s, n_skip, n_quantization, name='logits_preclip') + offset,
        0.0, n_quantization - 1.0,
        name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])
    probs = tf.nn.softmax(logits, name='softmax')
    synthesis = tf.reshape(
        wnu.inv_mu_law(tf.cast(tf.argmax(probs, 1), tf.float32) - offset,
                       n_quantization),
        [-1, 1])

    return {
        'X': X,
        'init_ops': init_ops,
        'push_ops': push_ops,
        'probs': probs,
        'synthesis': synthesis
    }
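
A minimal autoregressive sampling sketch. Assumptions: TensorFlow 1.x; a
checkpoint trained with create_wavenet under matching hyperparameters and
layer names; ckpt_path is a hypothetical checkpoint path. Note that the
graph's synthesis op decodes the argmax class, so this loop generates
greedily; sampling from net['probs'] instead would add variety.

import numpy as np
import tensorflow as tf

net = create_generation_model(batch_size=1)
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, ckpt_path)  # hypothetical checkpoint path
    sess.run(net['init_ops'])       # zero the per-layer sample buffers
    sample = np.zeros((1, 1), dtype=np.float32)
    audio = []
    for _ in range(16000):          # e.g. one second at 16 kHz
        # Each step feeds one sample, advances the buffers (push_ops),
        # and reads out the next predicted sample.
        synth, _ = sess.run([net['synthesis'], net['push_ops']],
                            feed_dict={net['X']: sample})
        sample = synth.reshape(1, 1)
        audio.append(sample[0, 0])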