def create_wavenet_autoencoder(n_stages, n_layers_per_stage, n_hidden,
                               batch_size, n_skip, filter_length,
                               bottleneck_width, hop_length, n_quantization,
                               sample_rate):
    """Build the training graph of a WaveNet autoencoder.

    The graph has two parts: an encoder made of non-causal dilated residual
    convolutions followed by a 1x1 bottleneck convolution and average
    pooling, and a gated decoder in which every layer (and the output
    layer) is combined with a 1x1 projection of the pooled encoding through
    ``condition``.

    Parameters
    ----------
    n_stages : int
        Number of times the dilation schedule is repeated.
    n_layers_per_stage : int
        Number of layers per stage; layer ``k`` of a stage uses dilation
        ``2**k``.
    n_hidden : int
        Number of filters of the residual convolutions.
    batch_size : int
        First dimension of the input placeholder.
    n_skip : int
        Number of filters of the skip convolutions.
    filter_length : int
        Filter length of the start and dilated convolutions.
    bottleneck_width : int
        Number of filters of the encoder's bottleneck convolution.
    hop_length : int
        Forwarded to ``wnu.pool1d``; presumably the pooling window/stride
        -- confirm against ``wnu.pool1d``.
    n_quantization : int
        Number of mu-law quantization levels, i.e. output classes.
    sample_rate : int
        Sample rate attached to the audio summary.

    Returns
    -------
    dict
        Graph endpoints under the keys ``'X'`` (input placeholder of shape
        ``[batch_size, 2**n_layers_per_stage * 2 * n_stages]``),
        ``'quantized'``, ``'encoding'``, ``'probs'``, ``'synthesis'``,
        ``'summaries'`` (result of ``tf.summary.merge_all()``) and
        ``'loss'``.
    """
    offset = n_quantization / 2.0
    sequence_length = 2**n_layers_per_stage * 2 * n_stages
    n_layers = n_stages * n_layers_per_stage

    # Mu-law encode the source, then divide by half the quantization range
    # before feeding the network.
    X = tf.placeholder(
        name='X', shape=[batch_size, sequence_length], dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
    X_scaled = tf.cast(X_quantized / offset, tf.float32)
    X_scaled = tf.expand_dims(X_scaled, 2)

    # The non-causal temporal encoder.
    en = wnu.conv1d(
        X=X_scaled,
        causal=False,
        num_filters=n_hidden,
        filter_length=filter_length,
        name='ae_startconv')

    # Encoder residual blocks: relu -> dilated conv -> relu -> 1x1 conv.
    for i in range(n_layers):
        dilation = 2**(i % n_layers_per_stage)
        d = tf.nn.relu(en)
        d = wnu.conv1d(
            d,
            causal=False,
            num_filters=n_hidden,
            filter_length=filter_length,
            dilation=dilation,
            name='ae_dilatedconv_%d' % (i + 1))
        d = tf.nn.relu(d)
        en += wnu.conv1d(
            d,
            num_filters=n_hidden,
            filter_length=1,
            name='ae_res_%d' % (i + 1))

    en = wnu.conv1d(
        en, num_filters=bottleneck_width, filter_length=1,
        name='ae_bottleneck')
    en = wnu.pool1d(en, hop_length, name='ae_pool', mode='avg')
    encoding = en

    # The WaveNet decoder. The input is passed through wnu.shift_right
    # first; presumably a one-step delay so a sample is never used to
    # predict itself -- confirm against wnu.shift_right.
    h = wnu.shift_right(X_scaled)
    h = wnu.conv1d(
        h, num_filters=n_hidden, filter_length=filter_length,
        name='startconv')

    # Set up skip connections.
    s = wnu.conv1d(h, num_filters=n_skip, filter_length=1, name='skip_start')

    # Decoder residual blocks with skip connections.
    for i in range(n_layers):
        dilation = 2**(i % n_layers_per_stage)
        d = wnu.conv1d(
            h,
            num_filters=2 * n_hidden,
            filter_length=filter_length,
            dilation=dilation,
            name='dilatedconv_%d' % (i + 1))
        # Combine the layer with a per-layer projection of the encoding.
        d = condition(d,
                      wnu.conv1d(
                          en,
                          num_filters=2 * n_hidden,
                          filter_length=1,
                          name='cond_map_%d' % (i + 1)))

        # Gated activation: first half of the channels gates the second.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        h += wnu.conv1d(
            d, num_filters=n_hidden, filter_length=1,
            name='res_%d' % (i + 1))
        s += wnu.conv1d(
            d, num_filters=n_skip, filter_length=1,
            name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = wnu.conv1d(s, num_filters=n_skip, filter_length=1, name='out1')
    s = condition(s,
                  wnu.conv1d(
                      en,
                      num_filters=n_skip,
                      filter_length=1,
                      name='cond_map_out1'))
    s = tf.nn.relu(s)

    # Compute the logits and get the loss.
    logits = wnu.conv1d(
        s, num_filters=n_quantization, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])
    probs = tf.nn.softmax(logits, name='softmax')
    # Most likely class, shifted back by `offset` and mu-law decoded.
    synthesis = tf.reshape(
        wnu.inv_mu_law(
            tf.cast(tf.argmax(probs, 1), tf.float32) - offset,
            n_quantization), [-1, sequence_length])

    # Shift the quantized input by `offset` to obtain class labels
    # (assumes wnu.mu_law output lies in [-offset, offset) -- confirm).
    labels = tf.cast(tf.reshape(X_quantized, [-1]), tf.int32) + int(offset)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels, name='nll'),
        0,
        name='loss')

    tf.summary.audio("synthesis", synthesis, sample_rate=sample_rate)
    tf.summary.histogram("probs", probs)
    tf.summary.histogram("input_quantized", X_quantized)
    tf.summary.histogram("logits", logits)
    tf.summary.histogram("labels", labels)
    tf.summary.histogram("synthesis", synthesis)
    tf.summary.scalar("loss", loss)
    summaries = tf.summary.merge_all()

    return {
        'X': X,
        'quantized': X_quantized,
        'encoding': encoding,
        'probs': probs,
        'synthesis': synthesis,
        'summaries': summaries,
        'loss': loss
    }
def create_wavenet(n_stages=10, n_layers_per_stage=9, n_hidden=200,
                   batch_size=32, n_skip=100, filter_length=2, shift=True,
                   n_quantization=256, sample_rate=16000):
    """Build the training graph of an unconditioned WaveNet.

    The mu-law encoded input goes through a start convolution and then a
    stack of gated dilated convolutions with residual and skip
    connections. The summed skip outputs are mapped to ``n_quantization``
    logits per sample and trained with sparse softmax cross-entropy
    against the quantized input.

    Parameters
    ----------
    n_stages : int, optional
        Number of times the dilation schedule is repeated.
    n_layers_per_stage : int, optional
        Number of layers per stage; layer ``k`` of a stage uses dilation
        ``2**k``.
    n_hidden : int, optional
        Number of filters of the residual convolutions.
    batch_size : int, optional
        First dimension of the input placeholder.
    n_skip : int, optional
        Number of filters of the skip convolutions.
    filter_length : int, optional
        Filter length of the start and dilated convolutions.
    shift : bool, optional
        If True, the network input is passed through ``wnu.shift_right``.
    n_quantization : int, optional
        Number of mu-law quantization levels, i.e. output classes.
    sample_rate : int, optional
        Sample rate attached to the audio summary.

    Returns
    -------
    dict
        Graph endpoints under the keys ``'X'`` (input placeholder of shape
        ``[batch_size, 2**n_layers_per_stage * 2 * n_stages]``),
        ``'quantized'``, ``'probs'``, ``'synthesis'``, ``'summaries'``
        (result of ``tf.summary.merge_all()``) and ``'loss'``.
    """
    half_range = n_quantization / 2.0
    sequence_length = 2**n_layers_per_stage * 2 * n_stages
    total_layers = n_stages * n_layers_per_stage

    # Mu-law encode the source and add a trailing channel axis.
    X = tf.placeholder(
        name='X', shape=[batch_size, sequence_length], dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
    net_input = tf.expand_dims(X_quantized, 2)
    if shift:
        net_input = wnu.shift_right(net_input)

    residual = wnu.conv1d(
        X=net_input,
        num_filters=n_hidden,
        filter_length=filter_length,
        name='startconv')

    # Accumulator for the skip connections.
    skip = wnu.conv1d(
        X=residual, num_filters=n_skip, filter_length=1, name='skip_start')

    # Gated residual blocks; layers are numbered from 1 in the op names.
    for layer in range(1, total_layers + 1):
        dilation = 2**((layer - 1) % n_layers_per_stage)

        # Dilated convolution producing both halves of the gate.
        gate_input = wnu.conv1d(
            X=residual,
            num_filters=2 * n_hidden,
            filter_length=filter_length,
            dilation=dilation,
            name='dilatedconv_%d' % layer)

        # Gated activation: sigmoid of the first half of the channels
        # times tanh of the second half.
        n_channels = gate_input.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gate = tf.sigmoid(gate_input[:, :, :half])
        filt = tf.tanh(gate_input[:, :, half:])
        gated = gate * filt

        # Residual path.
        residual = residual + wnu.conv1d(
            X=gated,
            num_filters=n_hidden,
            filter_length=1,
            name='res_%d' % layer)

        # Skip path.
        skip = skip + wnu.conv1d(
            X=gated,
            num_filters=n_skip,
            filter_length=1,
            name='skip_%d' % layer)

    skip = tf.nn.relu(skip)
    skip = wnu.conv1d(
        X=skip, num_filters=n_skip, filter_length=1, name='out1')
    skip = tf.nn.relu(skip)

    # NOTE(review): the logits themselves are shifted by half the
    # quantization range and clipped to [0, n_quantization - 1] before the
    # softmax -- confirm this is intended.
    preclip = wnu.conv1d(
        X=skip,
        num_filters=n_quantization,
        filter_length=1,
        name='logits_preclip')
    logits = tf.clip_by_value(
        preclip + half_range, 0.0, n_quantization - 1.0, name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])

    # Shift the quantized input by half the range to obtain class labels.
    labels = tf.cast(tf.reshape(X_quantized + half_range, [-1]), tf.int32)
    per_sample_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='nll')
    loss = tf.reduce_mean(per_sample_nll, 0, name='loss')

    probs = tf.nn.softmax(logits, name='softmax')
    # Most likely class, shifted back and mu-law decoded.
    best_class = tf.cast(tf.argmax(probs, 1), tf.float32)
    synthesis = tf.reshape(
        wnu.inv_mu_law(best_class - half_range, n_quantization),
        [-1, sequence_length])

    tf.summary.audio("synthesis", synthesis, sample_rate=sample_rate)
    tf.summary.histogram("probs", probs)
    tf.summary.histogram("input_quantized", X_quantized)
    tf.summary.histogram("logits", logits)
    tf.summary.histogram("labels", labels)
    tf.summary.histogram("synthesis", synthesis)
    tf.summary.scalar("loss", loss)
    summaries = tf.summary.merge_all()

    return {
        'X': X,
        'quantized': X_quantized,
        'probs': probs,
        'synthesis': synthesis,
        'summaries': summaries,
        'loss': loss
    }
def create_generation_model(n_stages=5, n_layers_per_stage=10, n_hidden=256,
                            batch_size=1, n_skip=128, n_quantization=256,
                            filter_length=2, onehot=False):
    """Build the sampling graph of a WaveNet based on ``wnu.causal_linear``.

    The layer stack mirrors the training network (start layer, gated
    dilated layers with residual and skip connections, output layers), but
    uses ``wnu.causal_linear`` / ``wnu.linear`` instead of convolutions and
    collects the init and push ops every ``wnu.causal_linear`` layer
    returns.

    Parameters
    ----------
    n_stages : int, optional
        Number of times the dilation schedule is repeated.
    n_layers_per_stage : int, optional
        Number of layers per stage; layer ``k`` of a stage uses rate
        ``2**k``.
    n_hidden : int, optional
        Number of residual channels.
    batch_size : int, optional
        Forwarded to every ``wnu.causal_linear`` layer.
    n_skip : int, optional
        Number of skip channels.
    n_quantization : int, optional
        Number of mu-law quantization levels, i.e. output classes. Also
        the one-hot depth when ``onehot`` is True.
    filter_length : int, optional
        Forwarded to every ``wnu.causal_linear`` layer.
    onehot : bool, optional
        If True, feed the network a one-hot encoding of the quantized
        input (``n_quantization`` channels); otherwise feed the quantized
        value itself as a single channel.

    Returns
    -------
    dict
        Graph endpoints under the keys ``'X'`` (placeholder of shape
        ``[None, None]``), ``'init_ops'``, ``'push_ops'``, ``'probs'`` and
        ``'synthesis'`` (reshaped to ``[-1, 1]``).
    """
    offset = n_quantization / 2.0

    # Mu-law encode the source.
    X = tf.placeholder(name='X', shape=[None, None], dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
    if onehot:
        X_onehot = tf.one_hot(
            tf.cast(X_quantized + offset, tf.int32), n_quantization)
        # The one-hot depth is n_quantization, so the first layer must
        # expect exactly that many input channels (previously hard-coded
        # to 256, which broke any other quantization level).
        n_inputs = n_quantization
    else:
        X_onehot = tf.expand_dims(X_quantized, 2)
        n_inputs = 1

    # Ops returned by the causal layers; presumably they initialize and
    # advance per-layer state between generation steps -- confirm against
    # wnu.causal_linear.
    push_ops, init_ops = [], []

    h, init, push = wnu.causal_linear(
        X=X_onehot,
        n_inputs=n_inputs,
        n_outputs=n_hidden,
        name='startconv',
        rate=1,
        batch_size=batch_size,
        filter_length=filter_length)
    init_ops.extend(init)
    push_ops.extend(push)

    # Set up skip connections.
    s = wnu.linear(h, n_hidden, n_skip, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(n_stages * n_layers_per_stage):
        dilation = 2**(i % n_layers_per_stage)

        # Dilated causal layer producing both halves of the gate.
        d, init, push = wnu.causal_linear(
            X=h,
            n_inputs=n_hidden,
            n_outputs=n_hidden * 2,
            name='dilatedconv_%d' % (i + 1),
            rate=dilation,
            batch_size=batch_size,
            filter_length=filter_length)
        init_ops.extend(init)
        push_ops.extend(push)

        # Gated activation: sigmoid of the first half of the channels
        # times tanh of the second half.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d = tf.sigmoid(d[:, :, :m]) * tf.tanh(d[:, :, m:])

        # Residuals.
        h += wnu.linear(d, n_hidden, n_hidden, name='res_%d' % (i + 1))

        # Skips.
        s += wnu.linear(d, n_hidden, n_skip, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = wnu.linear(s, n_skip, n_skip, name='out1')
    s = tf.nn.relu(s)

    # NOTE(review): the logits themselves are shifted by `offset` and
    # clipped to [0, n_quantization - 1] before the softmax -- confirm
    # this is intended.
    logits = tf.clip_by_value(
        wnu.linear(s, n_skip, n_quantization, name='logits_preclip') + offset,
        0.0,
        n_quantization - 1.0,
        name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])
    probs = tf.nn.softmax(logits, name='softmax')
    # Most likely class, shifted back by `offset` and mu-law decoded.
    synthesis = tf.reshape(
        wnu.inv_mu_law(
            tf.cast(tf.argmax(probs, 1), tf.float32) - offset,
            n_quantization), [-1, 1])

    return {
        'X': X,
        'init_ops': init_ops,
        'push_ops': push_ops,
        'probs': probs,
        'synthesis': synthesis
    }
def create_wavenet_autoencoder(n_stages, n_layers_per_stage, n_hidden,
                               batch_size, n_skip, filter_length,
                               bottleneck_width, hop_length, n_quantization,
                               sample_rate):
    """Build the training graph of a WaveNet autoencoder.

    The graph has two parts: an encoder made of non-causal dilated residual
    convolutions followed by a 1x1 bottleneck convolution and average
    pooling, and a gated decoder in which every layer (and the output
    layer) is combined with a 1x1 projection of the pooled encoding through
    ``condition``.

    Parameters
    ----------
    n_stages : int
        Number of times the dilation schedule is repeated.
    n_layers_per_stage : int
        Number of layers per stage; layer ``k`` of a stage uses dilation
        ``2**k``.
    n_hidden : int
        Number of filters of the residual convolutions.
    batch_size : int
        First dimension of the input placeholder.
    n_skip : int
        Number of filters of the skip convolutions.
    filter_length : int
        Filter length of the start and dilated convolutions.
    bottleneck_width : int
        Number of filters of the encoder's bottleneck convolution.
    hop_length : int
        Forwarded to ``wnu.pool1d``; presumably the pooling window/stride
        -- confirm against ``wnu.pool1d``.
    n_quantization : int
        Number of mu-law quantization levels, i.e. output classes.
    sample_rate : int
        Sample rate attached to the audio summary.

    Returns
    -------
    dict
        Graph endpoints under the keys ``'X'`` (input placeholder of shape
        ``[batch_size, 2**n_layers_per_stage * 2 * n_stages]``),
        ``'quantized'``, ``'encoding'``, ``'probs'``, ``'synthesis'``,
        ``'summaries'`` (result of ``tf.summary.merge_all()``) and
        ``'loss'``.
    """
    offset = n_quantization / 2.0
    sequence_length = 2**n_layers_per_stage * 2 * n_stages
    n_layers = n_stages * n_layers_per_stage

    # Mu-law encode the source, then divide by half the quantization range
    # before feeding the network.
    X = tf.placeholder(name='X',
                       shape=[batch_size, sequence_length],
                       dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
    X_scaled = tf.cast(X_quantized / offset, tf.float32)
    X_scaled = tf.expand_dims(X_scaled, 2)

    # The non-causal temporal encoder.
    en = wnu.conv1d(X=X_scaled,
                    causal=False,
                    num_filters=n_hidden,
                    filter_length=filter_length,
                    name='ae_startconv')

    # Encoder residual blocks: relu -> dilated conv -> relu -> 1x1 conv.
    for i in range(n_layers):
        dilation = 2**(i % n_layers_per_stage)
        d = tf.nn.relu(en)
        d = wnu.conv1d(d,
                       causal=False,
                       num_filters=n_hidden,
                       filter_length=filter_length,
                       dilation=dilation,
                       name='ae_dilatedconv_%d' % (i + 1))
        d = tf.nn.relu(d)
        en += wnu.conv1d(d,
                         num_filters=n_hidden,
                         filter_length=1,
                         name='ae_res_%d' % (i + 1))

    en = wnu.conv1d(en,
                    num_filters=bottleneck_width,
                    filter_length=1,
                    name='ae_bottleneck')
    en = wnu.pool1d(en, hop_length, name='ae_pool', mode='avg')
    encoding = en

    # The WaveNet decoder. The input is passed through wnu.shift_right
    # first; presumably a one-step delay so a sample is never used to
    # predict itself -- confirm against wnu.shift_right.
    h = wnu.shift_right(X_scaled)
    h = wnu.conv1d(h,
                   num_filters=n_hidden,
                   filter_length=filter_length,
                   name='startconv')

    # Set up skip connections.
    s = wnu.conv1d(h, num_filters=n_skip, filter_length=1, name='skip_start')

    # Decoder residual blocks with skip connections.
    for i in range(n_layers):
        dilation = 2**(i % n_layers_per_stage)
        d = wnu.conv1d(h,
                       num_filters=2 * n_hidden,
                       filter_length=filter_length,
                       dilation=dilation,
                       name='dilatedconv_%d' % (i + 1))
        # Combine the layer with a per-layer projection of the encoding.
        d = condition(
            d,
            wnu.conv1d(en,
                       num_filters=2 * n_hidden,
                       filter_length=1,
                       name='cond_map_%d' % (i + 1)))

        # Gated activation: first half of the channels gates the second.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        h += wnu.conv1d(d,
                        num_filters=n_hidden,
                        filter_length=1,
                        name='res_%d' % (i + 1))
        s += wnu.conv1d(d,
                        num_filters=n_skip,
                        filter_length=1,
                        name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = wnu.conv1d(s, num_filters=n_skip, filter_length=1, name='out1')
    s = condition(
        s,
        wnu.conv1d(en,
                   num_filters=n_skip,
                   filter_length=1,
                   name='cond_map_out1'))
    s = tf.nn.relu(s)

    # Compute the logits and get the loss.
    logits = wnu.conv1d(s,
                        num_filters=n_quantization,
                        filter_length=1,
                        name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])
    probs = tf.nn.softmax(logits, name='softmax')
    # Most likely class, shifted back by `offset` and mu-law decoded.
    synthesis = tf.reshape(
        wnu.inv_mu_law(
            tf.cast(tf.argmax(probs, 1), tf.float32) - offset,
            n_quantization), [-1, sequence_length])

    # Shift the quantized input by `offset` to obtain class labels
    # (assumes wnu.mu_law output lies in [-offset, offset) -- confirm).
    labels = tf.cast(tf.reshape(X_quantized, [-1]), tf.int32) + int(offset)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='nll'),
                          0,
                          name='loss')

    tf.summary.audio("synthesis", synthesis, sample_rate=sample_rate)
    tf.summary.histogram("probs", probs)
    tf.summary.histogram("input_quantized", X_quantized)
    tf.summary.histogram("logits", logits)
    tf.summary.histogram("labels", labels)
    tf.summary.histogram("synthesis", synthesis)
    tf.summary.scalar("loss", loss)
    summaries = tf.summary.merge_all()

    return {
        'X': X,
        'quantized': X_quantized,
        'encoding': encoding,
        'probs': probs,
        'synthesis': synthesis,
        'summaries': summaries,
        'loss': loss
    }
def create_wavenet(n_stages=10, n_layers_per_stage=9, n_hidden=200,
                   batch_size=32, n_skip=100, filter_length=2, shift=True,
                   n_quantization=256, sample_rate=16000):
    """Build the training graph of an unconditioned WaveNet.

    The mu-law encoded input goes through a start convolution and then a
    stack of gated dilated convolutions with residual and skip
    connections. The summed skip outputs are mapped to ``n_quantization``
    logits per sample and trained with sparse softmax cross-entropy
    against the quantized input.

    Parameters
    ----------
    n_stages : int, optional
        Number of times the dilation schedule is repeated.
    n_layers_per_stage : int, optional
        Number of layers per stage; layer ``k`` of a stage uses dilation
        ``2**k``.
    n_hidden : int, optional
        Number of filters of the residual convolutions.
    batch_size : int, optional
        First dimension of the input placeholder.
    n_skip : int, optional
        Number of filters of the skip convolutions.
    filter_length : int, optional
        Filter length of the start and dilated convolutions.
    shift : bool, optional
        If True, the network input is passed through ``wnu.shift_right``.
    n_quantization : int, optional
        Number of mu-law quantization levels, i.e. output classes.
    sample_rate : int, optional
        Sample rate attached to the audio summary.

    Returns
    -------
    dict
        Graph endpoints under the keys ``'X'`` (input placeholder of shape
        ``[batch_size, 2**n_layers_per_stage * 2 * n_stages]``),
        ``'quantized'``, ``'probs'``, ``'synthesis'``, ``'summaries'``
        (result of ``tf.summary.merge_all()``) and ``'loss'``.
    """
    half_range = n_quantization / 2.0
    sequence_length = 2**n_layers_per_stage * 2 * n_stages
    total_layers = n_stages * n_layers_per_stage

    # Mu-law encode the source and add a trailing channel axis.
    X = tf.placeholder(name='X',
                       shape=[batch_size, sequence_length],
                       dtype=tf.float32)
    X_quantized = wnu.mu_law(X, n_quantization)
    net_input = tf.expand_dims(X_quantized, 2)
    if shift:
        net_input = wnu.shift_right(net_input)

    residual = wnu.conv1d(X=net_input,
                          num_filters=n_hidden,
                          filter_length=filter_length,
                          name='startconv')

    # Accumulator for the skip connections.
    skip = wnu.conv1d(X=residual,
                      num_filters=n_skip,
                      filter_length=1,
                      name='skip_start')

    # Gated residual blocks; layers are numbered from 1 in the op names.
    for layer in range(1, total_layers + 1):
        dilation = 2**((layer - 1) % n_layers_per_stage)

        # Dilated convolution producing both halves of the gate.
        gate_input = wnu.conv1d(X=residual,
                                num_filters=2 * n_hidden,
                                filter_length=filter_length,
                                dilation=dilation,
                                name='dilatedconv_%d' % layer)

        # Gated activation: sigmoid of the first half of the channels
        # times tanh of the second half.
        n_channels = gate_input.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gate = tf.sigmoid(gate_input[:, :, :half])
        filt = tf.tanh(gate_input[:, :, half:])
        gated = gate * filt

        # Residual path.
        residual = residual + wnu.conv1d(X=gated,
                                         num_filters=n_hidden,
                                         filter_length=1,
                                         name='res_%d' % layer)

        # Skip path.
        skip = skip + wnu.conv1d(X=gated,
                                 num_filters=n_skip,
                                 filter_length=1,
                                 name='skip_%d' % layer)

    skip = tf.nn.relu(skip)
    skip = wnu.conv1d(X=skip,
                      num_filters=n_skip,
                      filter_length=1,
                      name='out1')
    skip = tf.nn.relu(skip)

    # NOTE(review): the logits themselves are shifted by half the
    # quantization range and clipped to [0, n_quantization - 1] before the
    # softmax -- confirm this is intended.
    preclip = wnu.conv1d(X=skip,
                         num_filters=n_quantization,
                         filter_length=1,
                         name='logits_preclip')
    logits = tf.clip_by_value(preclip + half_range,
                              0.0,
                              n_quantization - 1.0,
                              name='logits')
    logits = tf.reshape(logits, [-1, n_quantization])

    # Shift the quantized input by half the range to obtain class labels.
    labels = tf.cast(tf.reshape(X_quantized + half_range, [-1]), tf.int32)
    per_sample_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='nll')
    loss = tf.reduce_mean(per_sample_nll, 0, name='loss')

    probs = tf.nn.softmax(logits, name='softmax')
    # Most likely class, shifted back and mu-law decoded.
    best_class = tf.cast(tf.argmax(probs, 1), tf.float32)
    synthesis = tf.reshape(
        wnu.inv_mu_law(best_class - half_range, n_quantization),
        [-1, sequence_length])

    tf.summary.audio("synthesis", synthesis, sample_rate=sample_rate)
    tf.summary.histogram("probs", probs)
    tf.summary.histogram("input_quantized", X_quantized)
    tf.summary.histogram("logits", logits)
    tf.summary.histogram("labels", labels)
    tf.summary.histogram("synthesis", synthesis)
    tf.summary.scalar("loss", loss)
    summaries = tf.summary.merge_all()

    return {
        'X': X,
        'quantized': X_quantized,
        'probs': probs,
        'synthesis': synthesis,
        'summaries': summaries,
        'loss': loss
    }