def compute_wavenet_encoder_features(content, style,
                                     checkpoint_path='./model.ckpt-200000'):
    """Return encoder activations for `content` and Gram matrices for `style`.

    Builds the non-causal temporal encoder in a fresh graph pinned to the CPU,
    restores its variables from `checkpoint_path`, and runs both inputs through
    the same placeholder.

    Args:
      content: Array indexed as [frame, sample]; its first two dimensions fix
        the shape of the input placeholder.
      style: Array fed through the same placeholder, so it must match the
        shape of `content`.
      checkpoint_path: Checkpoint prefix passed to `tf.train.Saver.restore`.
        Defaults to the previously hard-coded './model.ckpt-200000'.

    Returns:
      A tuple `(content_features, style_features)`. `content_features` holds
      the residual-stream activation after each of the 30 encoder layers for
      `content`. `style_features` holds, for each of those layers, the Gram
      matrix of the `style` activations divided by `n_samples * n_frames`.
    """
    ae_hop_length = 512
    ae_bottleneck_width = 16
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    n_frames = content.shape[0]
    n_samples = content.shape[1]
    content_tf = np.ascontiguousarray(content)
    style_tf = np.ascontiguousarray(style)

    g = tf.Graph()
    layers = []
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")
        # Encode the source with 8-bit Mu-Law.
        x_quantized = mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)

        en = masked.conv1d(x_scaled,
                           causal=False,
                           num_filters=ae_width,
                           filter_length=ae_filter_length,
                           name='ae_startconv')
        for num_layer in range(ae_num_layers):
            dilation = 2**(num_layer % ae_num_stages)
            d = tf.nn.relu(en)
            d = masked.conv1d(d,
                              causal=False,
                              num_filters=ae_width,
                              filter_length=ae_filter_length,
                              dilation=dilation,
                              name='ae_dilatedconv_%d' % (num_layer + 1))
            d = tf.nn.relu(d)
            en += masked.conv1d(d,
                                num_filters=ae_width,
                                filter_length=1,
                                name='ae_res_%d' % (num_layer + 1))
            # Every residual-stream state is a feature layer.
            layers.append(en)

        # The bottleneck and pooling ops are still added to the graph, as
        # before, although their output is not fetched below.
        en = masked.conv1d(en,
                           num_filters=ae_bottleneck_width,
                           filter_length=1,
                           name='ae_bottleneck')
        en = masked.pool1d(en, ae_hop_length, name='ae_pool', mode='avg')

        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        content_features = sess.run(layers, feed_dict={x: content_tf})
        styles = sess.run(layers, feed_dict={x: style_tf})

    style_features = []
    for layer, style_feature in zip(layers, styles):
        # The static channel count is already a scalar; no product is needed.
        n_features = layer.shape.as_list()[-1]
        features = np.reshape(style_feature, (-1, n_features))
        style_gram = np.matmul(features.T, features) / (n_samples * n_frames)
        style_features.append(style_gram)
    return content_features, style_features
def encode(self, inputs, reuse=False):
    """Build the non-causal temporal encoder and return its un-pooled output.

    Args:
      inputs: Waveform tensor; passed directly to `utils.mu_law`.
      reuse: Accepted for interface compatibility; not read by this method.

    Returns:
      A dict with 'x_quantized' (the output of `utils.mu_law`) and 'encoding'
      (the output of the 'ae_bottleneck' convolution; no pooling is applied).
    """
    ae_num_stages = self.ae_num_stages
    ae_num_layers = self.ae_num_layers
    ae_filter_length = self.ae_filter_length
    ae_width = self.ae_width
    ae_bottleneck_width = self.ae_bottleneck_width

    # Encode the source with 8-bit Mu-Law.
    x = inputs
    tf.logging.info("x shape: %s", str(x.shape.as_list()))
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    # `range` instead of the Python-2-only `xrange`, so this also runs on
    # Python 3 and matches the other graph builders.
    for num_layer in range(ae_num_layers):
        dilation = 2**(num_layer % ae_num_stages)
        d = tf.nn.relu(en)
        d = masked.conv1d(
            d,
            causal=False,
            num_filters=ae_width,
            filter_length=ae_filter_length,
            dilation=dilation,
            name='ae_dilatedconv_%d' % (num_layer + 1))
        d = tf.nn.relu(d)
        en += masked.conv1d(
            d,
            num_filters=ae_width,
            filter_length=1,
            name='ae_res_%d' % (num_layer + 1))

    en = masked.conv1d(
        en,
        num_filters=ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')

    # Pooling is optional and intentionally skipped here:
    # en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')

    return {
        'x_quantized': x_quantized,
        'encoding': en,
    }
def build(self, inputs, is_training):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the
      'encoding', the 'quantized_input', and whatever metrics we want to track
      for eval.

    NOTE(review): the body visible here stops after the first encoder
    convolution ('ae_startconv') and contains no return statement, so as
    written it returns None. Presumably the rest of the method is missing;
    confirm against the full source.
    """
    del is_training  # Explicitly discarded; never read below.
    # Decoder hyper-parameters (not used by the visible part of the body).
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    # Encoder hyper-parameters; only the width and filter length are used by
    # the visible part of the body.
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    # Divide the quantized values by 128 and append a trailing axis.
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')
def build(self, inputs):
    """Assemble the incremental WaveNet decoder graph.

    The conditioning signal is supplied at run time through a float32
    placeholder named 'encoding' of shape [batch_size, 16].

    Args:
      inputs: A dict of inputs; the 'wav' entry is read.

    Returns:
      A dict with the 'init_ops' and 'push_ops' collected from every
      `utils.causal_linear` call, the softmax 'predictions', the 'encoding'
      placeholder and the mu-law 'quantized_input'.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    num_z = 16

    wav = inputs['wav']
    batch_size = self.batch_size

    # Encode the source with 8-bit Mu-Law.
    x_quantized = utils.mu_law(wav)
    x_scaled = tf.expand_dims(tf.cast(x_quantized, tf.float32) / 128.0, 2)

    encoding = tf.placeholder(
        name='encoding', shape=[batch_size, num_z], dtype=tf.float32)
    cond = tf.expand_dims(encoding, 1)

    init_ops = []
    push_ops = []

    ###
    # The WaveNet Decoder.
    ###
    hidden, inits, pushs = utils.causal_linear(
        x=x_scaled,
        n_inputs=1,
        n_outputs=width,
        name='startconv',
        rate=1,
        batch_size=batch_size,
        filter_length=filter_length)
    init_ops.extend(inits)
    push_ops.extend(pushs)

    # Set up skip connections.
    skip = utils.linear(hidden, width, skip_width, name='skip_start')

    # Residual blocks with skip connections.
    for layer_idx in range(num_layers):
        layer_no = layer_idx + 1

        # Dilated masked cnn.
        gated, inits, pushs = utils.causal_linear(
            x=hidden,
            n_inputs=width,
            n_outputs=width * 2,
            name='dilatedconv_%d' % layer_no,
            rate=2**(layer_idx % num_stages),
            batch_size=batch_size,
            filter_length=filter_length)
        init_ops.extend(inits)
        push_ops.extend(pushs)

        # Local conditioning.
        gated += utils.linear(
            cond, num_z, width * 2, name='cond_map_%d' % layer_no)

        # Gated cnn: sigmoid on the first half of the channels, tanh on the
        # second half.
        n_channels = gated.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gated = tf.sigmoid(gated[:, :, :half]) * tf.tanh(gated[:, :, half:])

        # Residuals.
        hidden += utils.linear(gated, width, width, name='res_%d' % layer_no)

        # Skips.
        skip += utils.linear(
            gated, width, skip_width, name='skip_%d' % layer_no)

    skip = tf.nn.relu(skip)
    out1 = utils.linear(skip, skip_width, skip_width, name='out1')
    cond_out1 = utils.linear(cond, num_z, skip_width, name='cond_map_out1')
    skip = tf.nn.relu(out1 + cond_out1)

    ###
    # Compute the logits and the predictions.
    ###
    logits = utils.linear(skip, skip_width, 256, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    return {
        'init_ops': init_ops,
        'push_ops': push_ops,
        'predictions': probs,
        'encoding': encoding,
        'quantized_input': x_quantized,
    }
def build(self,
          inputs,
          is_training,
          rescale_inputs=True,
          include_decoder=True,
          use_reduce_mean_to_pool=False):
    """Build the autoencoder graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Forwarded as `is_training` to every `masked.conv1d` call.
      rescale_inputs: When true, the model consumes the mu-law quantized and
        rescaled waveform; otherwise it consumes the raw 'wav' tensor with a
        trailing axis added.
      include_decoder: When false, only the encoder is built and the method
        returns early with just the 'encoding'.
      use_reduce_mean_to_pool: When true, pooling is done with a reshape and
        `tf.reduce_mean` instead of `masked.pool1d`.

    Returns:
      `{'encoding': ...}` when `include_decoder` is false. Otherwise a dict
      with 'predictions', 'loss', 'eval' (containing 'nll'),
      'quantized_input' and 'encoding'.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    def conv(tensor, **kwargs):
        """Call masked.conv1d with this build's `is_training` flag."""
        return masked.conv1d(tensor, is_training=is_training, **kwargs)

    # Encode the source with 8-bit Mu-Law.
    wav = inputs['wav']
    x_quantized = utils.mu_law(wav)
    x_scaled = tf.expand_dims(tf.cast(x_quantized, tf.float32) / 128.0, 2)
    raw = tf.expand_dims(wav, 2)
    model_input = x_scaled if rescale_inputs else raw

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = conv(model_input,
              causal=False,
              num_filters=ae_width,
              filter_length=ae_filter_length,
              name='ae_startconv')

    for layer_idx in range(ae_num_layers):
        layer_no = layer_idx + 1
        branch = tf.nn.relu(en)
        branch = conv(branch,
                      causal=False,
                      num_filters=ae_width,
                      filter_length=ae_filter_length,
                      dilation=2**(layer_idx % ae_num_stages),
                      name='ae_dilatedconv_%d' % layer_no)
        branch = tf.nn.relu(branch)
        en += conv(branch,
                   num_filters=ae_width,
                   filter_length=1,
                   name='ae_res_%d' % layer_no)

    en = conv(en,
              num_filters=self.ae_bottleneck_width,
              filter_length=1,
              name='ae_bottleneck')

    if not use_reduce_mean_to_pool:
        en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')
    else:
        # Depending on the accelerator used for training, masked.pool1d may
        # lead to out of memory error. reduce_mean is equivalent to
        # masked.pool1d when the stride is the same as the window length
        # (which is the case here).
        batch_size, unused_length, depth = en.shape.as_list()
        en = tf.reshape(en, [batch_size, -1, self.ae_hop_length, depth])
        en = tf.reduce_mean(en, axis=2)
    encoding = en

    if not include_decoder:
        return {'encoding': encoding}

    ###
    # The WaveNet Decoder.
    ###
    hidden = masked.shift_right(model_input)
    hidden = conv(hidden,
                  num_filters=width,
                  filter_length=filter_length,
                  name='startconv')

    # Set up skip connections.
    skip = conv(hidden,
                num_filters=skip_width,
                filter_length=1,
                name='skip_start')

    # Residual blocks with skip connections.
    for layer_idx in range(num_layers):
        layer_no = layer_idx + 1
        gated = conv(hidden,
                     num_filters=2 * width,
                     filter_length=filter_length,
                     dilation=2**(layer_idx % num_stages),
                     name='dilatedconv_%d' % layer_no)
        gated = self._condition(
            gated,
            conv(en,
                 num_filters=2 * width,
                 filter_length=1,
                 name='cond_map_%d' % layer_no))

        # Gate: sigmoid on the first half of the channels, tanh on the rest.
        n_channels = gated.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gate = tf.sigmoid(gated[:, :, :half])
        signal = tf.tanh(gated[:, :, half:])
        gated = gate * signal

        hidden += conv(gated,
                       num_filters=width,
                       filter_length=1,
                       name='res_%d' % layer_no)
        skip += conv(gated,
                     num_filters=skip_width,
                     filter_length=1,
                     name='skip_%d' % layer_no)

    skip = tf.nn.relu(skip)
    skip = conv(skip, num_filters=skip_width, filter_length=1, name='out1')
    skip = self._condition(
        skip,
        conv(en,
             num_filters=skip_width,
             filter_length=1,
             name='cond_map_out1'))
    skip = tf.nn.relu(skip)

    ###
    # Compute the logits and get the loss.
    ###
    logits = conv(skip, num_filters=256, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Labels are the flattened quantized samples shifted by +128.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=x_indices, name='nll')
    loss = tf.reduce_mean(nll, 0, name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }
def build(self, inputs):
    """Build the sample-by-sample decoder graph for this configuration.

    Args:
      inputs: A dict of inputs. The 'wav' entry is read.

    Returns:
      A dict of outputs that includes the 'predictions', the 'init_ops', the
      'push_ops', the 'encoding' placeholder and the 'quantized_input'.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    num_z = 16

    init_ops, push_ops = [], []

    def register(outputs):
        """Record the init/push ops of a causal_linear call; return its tensor."""
        tensor, inits, pushs = outputs
        init_ops.extend(inits)
        push_ops.extend(pushs)
        return tensor

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    batch_size = self.batch_size
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    # The encoding is fed in from outside through this placeholder.
    encoding = tf.placeholder(
        name='encoding', shape=[batch_size, num_z], dtype=tf.float32)
    en = tf.expand_dims(encoding, 1)

    ###
    # The WaveNet Decoder.
    ###
    net = register(
        utils.causal_linear(
            x=x_scaled,
            n_inputs=1,
            n_outputs=width,
            name='startconv',
            rate=1,
            batch_size=batch_size,
            filter_length=filter_length))

    # Set up skip connections.
    skips = utils.linear(net, width, skip_width, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
        suffix = i + 1
        dilation = 2**(i % num_stages)

        # Dilated masked cnn.
        block = register(
            utils.causal_linear(
                x=net,
                n_inputs=width,
                n_outputs=width * 2,
                name='dilatedconv_%d' % suffix,
                rate=dilation,
                batch_size=batch_size,
                filter_length=filter_length))

        # Local conditioning.
        block += utils.linear(en, num_z, width * 2, name='cond_map_%d' % suffix)

        # Gated cnn.
        assert block.get_shape().as_list()[2] % 2 == 0
        split = block.get_shape().as_list()[2] // 2
        sigmoid_part = tf.sigmoid(block[:, :, :split])
        tanh_part = tf.tanh(block[:, :, split:])
        block = sigmoid_part * tanh_part

        # Residuals.
        net += utils.linear(block, width, width, name='res_%d' % suffix)

        # Skips.
        skips += utils.linear(block, width, skip_width, name='skip_%d' % suffix)

    skips = tf.nn.relu(skips)
    skips = (utils.linear(skips, skip_width, skip_width, name='out1') +
             utils.linear(en, num_z, skip_width, name='cond_map_out1'))
    skips = tf.nn.relu(skips)

    ###
    # Compute the logits and the predictions.
    ###
    logits = tf.reshape(
        utils.linear(skips, skip_width, 256, name='logits'), [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    outputs = {
        'init_ops': init_ops,
        'push_ops': push_ops,
        'predictions': probs,
        'encoding': encoding,
        'quantized_input': x_quantized,
    }
    return outputs
def build(self, inputs, is_training):
    """Build the WaveNet autoencoder graph for this configuration.

    Args:
      inputs: A dict of inputs. The 'wav' entry is read.
      is_training: Whether we are training or not. Discarded immediately; not
        used in this config.

    Returns:
      A dict with the softmax 'predictions', the scalar 'loss', an 'eval' dict
      holding the same value under 'nll', the mu-law 'quantized_input' and the
      pooled 'encoding'.
    """
    del is_training

    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    wav = inputs['wav']
    x_quantized = utils.mu_law(wav)
    x_scaled = tf.expand_dims(tf.cast(x_quantized, tf.float32) / 128.0, 2)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    for layer_no in range(1, ae_num_layers + 1):
        residual = masked.conv1d(
            tf.nn.relu(en),
            causal=False,
            num_filters=ae_width,
            filter_length=ae_filter_length,
            dilation=2**((layer_no - 1) % ae_num_stages),
            name='ae_dilatedconv_%d' % layer_no)
        en += masked.conv1d(
            tf.nn.relu(residual),
            num_filters=ae_width,
            filter_length=1,
            name='ae_res_%d' % layer_no)

    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')
    encoding = en

    ###
    # The WaveNet Decoder.
    ###
    stream = masked.shift_right(x_scaled)
    stream = masked.conv1d(
        stream,
        num_filters=width,
        filter_length=filter_length,
        name='startconv')

    # Set up skip connections.
    skip_sum = masked.conv1d(
        stream, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for layer_no in range(1, num_layers + 1):
        unit = masked.conv1d(
            stream,
            num_filters=2 * width,
            filter_length=filter_length,
            dilation=2**((layer_no - 1) % num_stages),
            name='dilatedconv_%d' % layer_no)
        cond_map = masked.conv1d(
            en,
            num_filters=2 * width,
            filter_length=1,
            name='cond_map_%d' % layer_no)
        unit = self._condition(unit, cond_map)

        # Split the channels in two: sigmoid gate times tanh signal.
        depth = unit.get_shape().as_list()[2]
        assert depth % 2 == 0
        half = depth // 2
        unit = tf.sigmoid(unit[:, :, :half]) * tf.tanh(unit[:, :, half:])

        stream += masked.conv1d(
            unit, num_filters=width, filter_length=1, name='res_%d' % layer_no)
        skip_sum += masked.conv1d(
            unit,
            num_filters=skip_width,
            filter_length=1,
            name='skip_%d' % layer_no)

    skip_sum = tf.nn.relu(skip_sum)
    skip_sum = masked.conv1d(
        skip_sum, num_filters=skip_width, filter_length=1, name='out1')
    cond_out = masked.conv1d(
        en, num_filters=skip_width, filter_length=1, name='cond_map_out1')
    skip_sum = tf.nn.relu(self._condition(skip_sum, cond_out))

    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(
        skip_sum, num_filters=256, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Labels are the flattened quantized samples shifted by +128.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    per_sample_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=x_indices, name='nll')
    loss = tf.reduce_mean(per_sample_nll, 0, name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }
def build(self, inputs, is_training):
    """Build the graph, with either a real encoder or a fed-in encoding.

    When `self.encoding` is truthy the non-causal encoder is built on the
    input. Otherwise the encoding is a float32 placeholder named 'ae_pool'
    with the fixed shape [1, 125, 16].

    Parameters
    ----------
    inputs
        A dict of inputs. The 'wav' entry is read.
    is_training
        Whether we are training or not. Discarded immediately; not used in
        this config.

    Returns
    -------
    A dict with 'predictions', 'loss', 'eval' (holding the loss under
    'nll'), 'quantized_input' and 'encoding'.
    """
    del is_training

    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    source = inputs['wav']
    x_quantized = utils.mu_law(source)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    if not self.encoding:
        # No encoder: the caller feeds the encoding directly.
        en = tf.placeholder(
            name='ae_pool', shape=[1, 125, 16], dtype=tf.float32)
        encoding = en
    else:
        ###
        # The Non-Causal Temporal Encoder.
        ###
        en = masked.conv1d(
            x_scaled,
            causal=False,
            num_filters=ae_width,
            filter_length=ae_filter_length,
            name='ae_startconv')

        for idx in range(ae_num_layers):
            tag = idx + 1
            ae_dilation = 2**(idx % ae_num_stages)
            hidden = tf.nn.relu(en)
            hidden = masked.conv1d(
                hidden,
                causal=False,
                num_filters=ae_width,
                filter_length=ae_filter_length,
                dilation=ae_dilation,
                name='ae_dilatedconv_%d' % tag)
            hidden = tf.nn.relu(hidden)
            en += masked.conv1d(
                hidden,
                num_filters=ae_width,
                filter_length=1,
                name='ae_res_%d' % tag)

        en = masked.conv1d(
            en,
            num_filters=self.ae_bottleneck_width,
            filter_length=1,
            name='ae_bottleneck')
        en = masked.pool1d(
            en, self.ae_hop_length, name='ae_pool', mode='avg')
        encoding = en

    ###
    # The WaveNet Decoder.
    ###
    trunk = masked.shift_right(x_scaled)
    trunk = masked.conv1d(
        trunk,
        num_filters=width,
        filter_length=filter_length,
        name='startconv')

    # Set up skip connections.
    skip_acc = masked.conv1d(
        trunk, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for idx in range(num_layers):
        tag = idx + 1
        dilation = 2**(idx % num_stages)
        act = masked.conv1d(
            trunk,
            num_filters=2 * width,
            filter_length=filter_length,
            dilation=dilation,
            name='dilatedconv_%d' % tag)
        act = self._condition(
            act,
            masked.conv1d(
                en,
                num_filters=2 * width,
                filter_length=1,
                name='cond_map_%d' % tag))

        channels = act.get_shape().as_list()[2]
        assert channels % 2 == 0
        mid = channels // 2
        sig = tf.sigmoid(act[:, :, :mid])
        tnh = tf.tanh(act[:, :, mid:])
        act = sig * tnh

        trunk += masked.conv1d(
            act, num_filters=width, filter_length=1, name='res_%d' % tag)
        skip_acc += masked.conv1d(
            act, num_filters=skip_width, filter_length=1,
            name='skip_%d' % tag)

    skip_acc = tf.nn.relu(skip_acc)
    skip_acc = masked.conv1d(
        skip_acc, num_filters=skip_width, filter_length=1, name='out1')
    skip_acc = self._condition(
        skip_acc,
        masked.conv1d(
            en,
            num_filters=skip_width,
            filter_length=1,
            name='cond_map_out1'))
    skip_acc = tf.nn.relu(skip_acc)

    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(
        skip_acc, num_filters=256, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Labels are the flattened quantized samples shifted by +128.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=x_indices, name='nll')
    loss = tf.reduce_mean(xent, 0, name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }
def build(self, inputs, is_training):
    """Build the variational autoencoder graph for this configuration.

    The pooled encoder output is split by `self._gaussian_parameters`, a
    sample is drawn with `self._sample_gaussian`, and that sample conditions
    the decoder. When `self.aux > 0` an auxiliary head predicts the quantized
    samples from the encoding alone.

    Args:
      inputs: A dict of inputs. The 'wav' entry is read.
      is_training: Whether we are training or not. Discarded immediately; not
        used in this config.

    Returns:
      A dict with 'predictions', a 'loss' dict ('kl', 'rec', 'aux'; 'aux' is
      the integer 0 when the auxiliary head is disabled), an 'eval' dict
      ('kl', 'rec'), the 'quantized_input' and the sampled 'encoding'.
    """
    del is_training

    num_stages = self.num_stages
    num_layers = self.num_layers
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = self.ae_num_stages
    ae_num_layers = self.ae_num_layers
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    wav = inputs['wav']
    x_quantized = utils.mu_law(wav)
    x_scaled = tf.expand_dims(tf.cast(x_quantized, tf.float32) / 128.0, 2)
    if self.iw > 1:
        x_scaled = self._duplicate(x_scaled, self.iw)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    for layer_idx in range(ae_num_layers):
        layer_no = layer_idx + 1
        branch = tf.nn.relu(en)
        branch = masked.conv1d(
            branch,
            causal=False,
            num_filters=ae_width,
            filter_length=ae_filter_length,
            dilation=2**(layer_idx % ae_num_stages),
            name='ae_dilatedconv_%d' % layer_no)
        branch = tf.nn.relu(branch)
        en += masked.conv1d(
            branch,
            num_filters=ae_width,
            filter_length=1,
            name='ae_res_%d' % layer_no)

    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')

    # Divide the encoding into "mean" and "variance".
    mn, v = self._gaussian_parameters(en)

    # Flatten "mean" and "variance" over their last two axes.
    m_shape = mn.get_shape().as_list()
    v_shape = v.get_shape().as_list()
    mn = tf.reshape(mn, (-1, m_shape[-2] * m_shape[-1]))
    v = tf.reshape(v, (-1, v_shape[-2] * v_shape[-1]))

    # Reparameterization trick, then back to the original embedding shape.
    en = self._sample_gaussian(mn, v)
    en = tf.reshape(en, (-1, m_shape[-2], m_shape[-1]))
    encoding = en

    ###
    # The WaveNet Decoder.
    ###
    # Each mask entry is drawn from a Bernoulli with probs=self.dropout and
    # multiplies the shifted decoder input.
    mask_probs = tf.to_float(self.dropout)
    bernoulli = tf.distributions.Bernoulli(probs=mask_probs, dtype=tf.float32)
    dropout_mask = bernoulli.sample(sample_shape=tf.shape(x_scaled))
    hidden = tf.math.multiply(masked.shift_right(x_scaled), dropout_mask)
    hidden = masked.conv1d(
        hidden,
        num_filters=width,
        filter_length=filter_length,
        name='startconv')

    # Set up skip connections.
    skip = masked.conv1d(
        hidden, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for layer_idx in range(num_layers):
        layer_no = layer_idx + 1
        gated = masked.conv1d(
            hidden,
            num_filters=2 * width,
            filter_length=filter_length,
            dilation=2**(layer_idx % num_stages),
            name='dilatedconv_%d' % layer_no)
        gated = self._condition(
            gated,
            masked.conv1d(
                en,
                num_filters=2 * width,
                filter_length=1,
                name='cond_map_%d' % layer_no))

        n_channels = gated.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gate = tf.sigmoid(gated[:, :, :half])
        signal = tf.tanh(gated[:, :, half:])
        gated = gate * signal

        hidden += masked.conv1d(
            gated, num_filters=width, filter_length=1,
            name='res_%d' % layer_no)
        skip += masked.conv1d(
            gated,
            num_filters=skip_width,
            filter_length=1,
            name='skip_%d' % layer_no)

    skip = tf.nn.relu(skip)
    skip = masked.conv1d(
        skip, num_filters=skip_width, filter_length=1, name='out1')
    skip = self._condition(
        skip,
        masked.conv1d(
            en, num_filters=skip_width, filter_length=1,
            name='cond_map_out1'))
    skip = tf.nn.relu(skip)

    use_aux = self.aux > 0
    if use_aux:
        # Auxiliary head: predict the samples from the encoding alone by
        # tiling each encoding step across the decoder time axis.
        en_logits = masked.conv1d(
            en, num_filters=skip_width, filter_length=1, name='cond_map_rec')
        enc_mb, enc_length, enc_channels = en_logits.get_shape().as_list()
        mb, length, channels = skip.get_shape().as_list()
        assert enc_mb == mb
        assert enc_channels == channels

        en_logits = tf.nn.relu(en_logits)
        en_logits = tf.reshape(en_logits, [mb, enc_length, 1, channels])
        # The reshape is built only to read off the static repeat count.
        _, _, reps, _ = tf.reshape(
            skip, [mb, enc_length, -1, channels]).get_shape().as_list()
        en_logits = tf.tile(en_logits, [1, 1, reps, 1])
        en_logits = tf.reshape(en_logits, [mb, length, channels])
        en_logits = masked.conv1d(
            en_logits, num_filters=256, filter_length=1, name='en_logits')
        en_logits = tf.reshape(en_logits, [-1, 256])
        # Not returned; the op is still added to the graph as 'en_softmax'.
        en_probs = tf.nn.softmax(en_logits, name='en_softmax')

    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(
        skip, num_filters=256, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Labels are the flattened quantized samples shifted by +128.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128

    rec = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
        0,
        name='loss')

    if use_aux:
        aux = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=en_logits, labels=x_indices, name='en_nll'),
            0,
            name='aux')
    else:
        aux = 0

    kl = tf.reduce_mean(
        self._kl_normal(mn, v, tf.zeros(1), tf.ones(1)), name='kl')

    return {
        'predictions': probs,
        'loss': {
            'kl': kl,
            'rec': rec,
            'aux': aux
        },
        'eval': {
            'kl': kl,
            'rec': rec
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }
def compute_wavenet_decoder_features(content, style,
                                     checkpoint_path='./model.ckpt-200000'):
    """Return decoder skip activations for `content` and Gram matrices for `style`.

    Builds the decoder's gated convolution stack (without any conditioning
    input) in a fresh graph pinned to the CPU, restores its variables from
    `checkpoint_path`, and runs both inputs through the same placeholder.

    Args:
      content: Array indexed as [frame, sample]; its first two dimensions fix
        the shape of the input placeholder.
      style: Array fed through the same placeholder, so it must match the
        shape of `content`.
      checkpoint_path: Checkpoint prefix passed to `tf.train.Saver.restore`.
        Defaults to the previously hard-coded './model.ckpt-200000'.

    Returns:
      A tuple `(content_features, style_features)`. `content_features` holds
      the accumulated skip tensor after each of the 30 layers for `content`.
      `style_features` holds, for each of those layers, the Gram matrix of the
      `style` activations divided by `n_samples * n_frames`.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256

    n_frames = content.shape[0]
    n_samples = content.shape[1]
    content_tf = np.ascontiguousarray(content)
    style_tf = np.ascontiguousarray(style)

    g = tf.Graph()
    layers = []
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")
        # Encode the source with 8-bit Mu-Law.
        x_quantized = mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)

        layer = x_scaled
        layer = masked.conv1d(layer,
                              num_filters=width,
                              filter_length=filter_length,
                              name='startconv')

        # Set up skip connections.
        s = masked.conv1d(layer,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(layer,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1))
            # Gate: sigmoid on the first half of the channels, tanh on the
            # second half.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            layer += masked.conv1d(d,
                                   num_filters=width,
                                   filter_length=1,
                                   name='res_%d' % (i + 1))
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))
            # Every running skip sum is a feature layer.
            layers.append(s)

        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        content_features = sess.run(layers, feed_dict={x: content_tf})
        styles = sess.run(layers, feed_dict={x: style_tf})

    style_features = []
    for feature_layer, style_feature in zip(layers, styles):
        # The static channel count is already a scalar; no product is needed.
        n_features = feature_layer.shape.as_list()[-1]
        features = np.reshape(style_feature, (-1, n_features))
        style_gram = np.matmul(features.T, features) / (n_samples * n_frames)
        style_features.append(style_gram)
    return content_features, style_features
def build(self, inputs, is_training):
    """Build the graph for this configuration, tracing every step to stdout.

    Each graph-construction step is bracketed by `print` calls that dump the
    tensors going in and out, so calling this method writes a long trace to
    standard output.

    The inline `shape=` comments are the original author's notes on the shapes
    seen for one particular input (a 'wav' of shape (1, 6144)); nothing in
    the code enforces them.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the
      'encoding', the 'quantized_input', and an 'eval' dict holding the loss
      under 'nll'.
    """
    del is_training  # Explicitly discarded; never read below.
    # Decoder hyper-parameters.
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    # Encoder hyper-parameters.
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Author's note: pitch shape=(1,), wav shape=(1, 6144), key shape=(1,)
    print("@build, inputs: ", inputs)

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    print("@build, x: ", x)  # shape=(1, 6144)
    x_quantized = utils.mu_law(x)
    print("@build, x_quantized: ", x_quantized)  # shape=(1, 6144)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    print("@build, x_scaled@1: ", x_scaled)  # shape=(1, 6144)
    x_scaled = tf.expand_dims(x_scaled, 2)
    print("@build, x_scaled@2: ", x_scaled)  # shape=(1, 6144, 1)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    print("@build, ##Non-Causal Temporal Encoder...")
    print("\t create Layer ae_startconv")
    print("\t input[x_scaled] is: ", x_scaled)
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,  # ae_width = 128
        filter_length=ae_filter_length,
        name='ae_startconv')
    print("\t ae_startconv output [en] is:", en)  # shape=(1, 6144, 128)
    print("\t create Layer ae_startconv Done\n")

    # Residual encoder layers; the dilation cycles through
    # 2**0 .. 2**(ae_num_stages - 1).
    for num_layer in range(ae_num_layers):
        dilation = 2**(num_layer % ae_num_stages)
        print("\t create Layer relu")
        print("\t input[en] is: ", en)  # shape=(1, 6144, 128)
        d = tf.nn.relu(en)
        print("\t relu output [d] is:", d)
        print("\t create Layer relu Done\n")

        print("\t create Layer ae_dilatedconv_{}, dilation={}".format(
            num_layer + 1, dilation))
        print("\t input[d] is: ", d)
        d = masked.conv1d(
            d,
            causal=False,
            num_filters=ae_width,  # 128
            filter_length=ae_filter_length,
            dilation=dilation,
            name='ae_dilatedconv_%d' % (num_layer + 1))
        print("\t output [d] is:", d)
        print(
            "\t create Layer ae_dilatedconv_{}, dilation={} Done\n".format(
                num_layer + 1, dilation))

        print("\t create Layer relu")
        print("\t input[d] is: ", d)
        d = tf.nn.relu(d)
        print("\t relu output [d] is:", d)
        print("\t create Layer relu Done\n")

        print("\t create Layer ae_res_{}".format(num_layer + 1))
        print("\t input[en] is: ", en)
        print("\t input[d] is: ", d)
        # Residual connection: add the 1x1 convolution back onto the stream.
        en += masked.conv1d(
            d,
            num_filters=ae_width,  # 128
            filter_length=1,
            name='ae_res_%d' % (num_layer + 1))
        print("\t output [en] is:", en)  # shape=(1, 6144, 128)
        print("\t create Layer ae_res_{} Done\n".format(num_layer + 1))

    print("\t create Layer ae_bottleneck")
    print("\t input[en] is: ", en)  # shape=(1, 6144, 128)
    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,  # 16
        filter_length=1,
        name='ae_bottleneck')
    print("\t output[en] is: ", en)  # shape=(1, 6144, 16)
    print("\t create Layer ae_bottleneck Done\n")

    print("\t create ae_pool")
    print("\t input[en] is: ", en)  # shape=(1, 6144, 16)
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool',
                       mode='avg')  # ae_hop_length=512
    print("\t output[en] is: ", en)  # shape=(1, 12, 16); 6144 / 512 = 12
    print("\t create ae_pool Done\n")

    # Author's note: the encoding is the 'feature vector', (125, 16) for every
    # 4 seconds of audio; 125 = 4 x 16000 / 512.
    encoding = en
    print("\t ##Non-Causal Temporal Encoder output[en|encoding] is: ",
          encoding)
    print("@build, ##Non-Causal Temporal Encoder...Done\n")

    ###
    # The WaveNet Decoder.
    ###
    print("@build, ##The WaveNet Decoder...")
    print("\t input[x_scaled] is: ", x_scaled)  # shape=(1, 6144, 1)
    l = masked.shift_right(x_scaled)

    print("\t create startconv")
    print("\t input[l] is: ", l)  # shape=(1, 6144, 1)
    l = masked.conv1d(l,
                      num_filters=width,
                      filter_length=filter_length,
                      name='startconv')  # width=512
    print("\t output[l] is: ", l)  # shape=(1, 6144, 512)
    print("\t create startconv Done\n")

    # Set up skip connections.
    print("\t create skip_start")
    print("\t input[l] is: ", l)
    s = masked.conv1d(l,
                      num_filters=skip_width,
                      filter_length=1,
                      name='skip_start')  # skip_width=256
    print("\t output[s] is: ", s)  # shape=(1, 6144, 256)
    print("\t create skip_start Done\n")

    # Residual blocks with skip connections.
    for i in range(num_layers):
        dilation = 2**(i % num_stages)
        print("\t create dilatedconv_{}, dilation={}".format(
            i + 1, dilation))
        print("\t input[l] is: ", l)
        d = masked.conv1d(l,
                          num_filters=2 * width,
                          filter_length=filter_length,
                          dilation=dilation,
                          name='dilatedconv_%d' % (i + 1))
        print("\t output[d] is: ", d)  # shape=(1, 6144, 1024)
        print("\t create dilatedconv_{}, dilation={} Done\n".format(
            i + 1, dilation))

        print("\t create _condition for cond_map_{}".format(i + 1))
        print("\t input[d] is: ", d)
        print("\t input[en] is: ", en)
        # Combine the layer output with a 1x1 projection of the encoding.
        d = self._condition(
            d,
            masked.conv1d(en,
                          num_filters=2 * width,
                          filter_length=1,
                          name='cond_map_%d' % (i + 1)))
        print("\t output[d] is: ", d)
        print("\t create _condition for cond_map_{} Done\n".format(i + 1))

        # Gate: sigmoid on the first half of the channels times tanh on the
        # second half.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh
        print("\t d after some cacule:", d)  # shape=(1, 6144, 512)
        print("")

        print("\t create res_{}".format(i + 1))
        print("\t input[d] is: ", d)  # shape=(1, 6144, 512)
        print("\t input[l] is: ", l)  # shape=(1, 6144, 512)
        l += masked.conv1d(d,
                           num_filters=width,
                           filter_length=1,
                           name='res_%d' % (i + 1))  # width=512
        print("\t output[l] is: ", l)  # shape=(1, 6144, 512)
        print("\t create res_{} Done\n".format(i + 1))

        print("\t create skip_{}".format(i + 1))
        print("\t input[d] is: ", d)  # shape=(1, 6144, 512)
        print("\t input[s] is: ", s)  # shape=(1, 6144, 256)
        s += masked.conv1d(d,
                           num_filters=skip_width,
                           filter_length=1,
                           name='skip_%d' % (i + 1))  # skip_width=256
        print("\t output[s] is: ", s)  # shape=(1, 6144, 256)
        print("\t create skip_{} Done\n".format(i + 1))

    print("\t create Layer relu")
    print("\t input[s] is: ", s)  # shape=(1, 6144, 256)
    s = tf.nn.relu(s)
    print("\t output[s] is: ", s)  # shape=(1, 6144, 256)
    print("\t create Layer relu Done\n")

    print("\t create Layer out1")
    print("\t input[s] is: ", s)  # shape=(1, 6144, 256)
    s = masked.conv1d(s,
                      num_filters=skip_width,
                      filter_length=1,
                      name='out1')  # skip_width=256
    print("\t output[s] is: ", s)  # shape=(1, 6144, 256)
    print("\t create Layer out1 Done\n")

    print("\t create _condition for cond_map_out1")
    print("\t input[s] is: ", s)  # shape=(1, 6144, 256)
    print("\t input[en] is: ", en)
    s = self._condition(
        s,
        masked.conv1d(
            en,
            num_filters=skip_width,  # skip_width=256
            filter_length=1,
            name='cond_map_out1'))
    print("\t output[s] is: ", s)
    print("\t create _condition for cond_map_out1 Done\n")

    print("\t create Layer relu")
    print("\t input[s] is: ", s)  # shape=(1, 6144, 256)
    s = tf.nn.relu(s)
    print("\t output[s] is: ", s)  # shape=(1, 6144, 256)
    print("\t create Layer relu Done\n")
    print("@build, ##The WaveNet Decoder...Done")

    ###
    # Compute the logits and get the loss.
    ###
    print("@build, ##Compute the logits and get the loss...")
    print("\t input[s] is: ", s)  # shape=(1, 6144, 256)
    logits = masked.conv1d(s,
                           num_filters=256,
                           filter_length=1,
                           name='logits')
    print("\t output[logits] is: ", logits)  # shape=(1, 6144, 256)
    logits = tf.reshape(logits, [-1, 256])
    print("\t logits after reshape: ", logits)  # shape=(6144, 256)
    probs = tf.nn.softmax(logits, name='softmax')
    print("\t probs: ", probs)  # shape=(6144, 256)
    print("\t x_quantized: ", x_quantized)
    # Labels are the flattened quantized samples shifted by +128.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    print("\t x_indices", x_indices)  # shape=(6144,)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=x_indices, name='nll'),
                          0,
                          name='loss')
    print("@build, ##Compute the logits and get the loss...Done")

    print("@build, Done, return:")
    print("\t probs:", probs)  # shape=(6144, 256)
    print("\t loss:", loss)  # shape=()
    print("\t x_quantized:", x_quantized)  # shape=(1, 6144)
    print("\t encoding:", encoding)  # shape=(1, 12, 16)

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }