def cost(self):
    """
    Construct the cost function op for the negative sampling cost.

    Every embedded T-F vector in ``self.prediction`` is scored against the
    vectors of the speakers listed in ``self.I`` (gathered from
    ``self.speaker_vectors``, L2-normalized along axis 1 when
    ``self.normalize`` is set). The per-element cost is
    ``-log(sigmoid(self.y * dot))``; it is averaged over the speaker axis,
    then over the batch axis, then over all remaining elements.

    Side effects: registers variable summaries for every trainable variable
    whose name contains 'prediction', and a scalar summary named 'cost'.

    Returns:
        Scalar tensor holding the mean cost.
    """
    # Embedded T-F vectors produced by the network
    # (presumably [B, T, F, E] -- TODO confirm against the caller)
    embedding = self.prediction

    # Give I a trailing index axis so that it can be used with gather_nd
    I = tf.expand_dims(self.I, axis=2)

    # Normalize the speaker vectors (optional) and collect the vectors of
    # the speakers present in the batch
    if self.normalize:
        speaker_vectors = tf.nn.l2_normalize(self.speaker_vectors, 1)
    else:
        speaker_vectors = self.speaker_vectors
    Vspeakers = tf.gather_nd(speaker_vectors, I)

    # Insert two singleton axes on the speaker side and one on the embedding
    # side so that the product below broadcasts to a rank-5 tensor
    Vspeakers_broad = tf.expand_dims(Vspeakers, 1)
    Vspeakers_broad = tf.expand_dims(Vspeakers_broad, 1)
    embedding_broad = tf.expand_dims(embedding, 3)

    # Dot product between the embedding vectors and the speaker vectors
    dot = tf.reduce_sum(Vspeakers_broad * embedding_broad, 4)

    # Cost for every element: -log(sigmoid(z)) == softplus(-z).
    # The softplus form is used because log(sigmoid(z)) underflows to -inf
    # (and yields NaN gradients) for strongly negative z.
    cost = tf.nn.softplus(tf.negative(self.y * dot))

    # Average the cost over all speakers in the input
    cost = tf.reduce_mean(cost, 3)

    # Average the cost over all batches
    cost = tf.reduce_mean(cost, 0)

    # Summaries for the variables of the prediction network
    training_vars = tf.trainable_variables()
    for var in training_vars:
        if 'prediction' in var.name:
            variable_summaries(var)

    # Average the cost over all T-F elements. Here is where weighting to
    # account for gradient confidence can occur
    cost = tf.reduce_mean(cost)

    tf.summary.scalar('cost', cost)

    return cost
def front(self):
    """
    Build the front-end of the model: a 1-D convolution over the raw input
    (implemented with conv2d on a height-1 image), followed by optional
    max or average pooling.

    Attributes written: ``window_filter``, ``bases``, ``conv_filter``,
    ``T``, ``y``, ``p_hat``, ``sparse_constraint``; ``X`` in both pooling
    branches and ``argmax`` in the max-pooling branch only.

    Side effects: registers variable summaries for ``conv_filter`` and an
    image summary named 'front/output'; prints the argmax tensor when max
    pooling is enabled.

    Returns:
        ``self.y``, the front-end output transposed to
        [Btot, T_pool, N, 1].
    """
    # Front-End

    # x : [ Btot , 1, L , 1]
    # Equivalent to B_tot batches of image of height = 1, width = L and
    # 1 channel -> for Conv1D with Conv2D
    input_front = tf.reshape(self.x, [self.B_tot, 1, self.L, 1])

    # Filter [filter_height, filter_width, input_channels, output_channels]
    # = [1, W, 1, N], built as |window| (broadcast over N) times the bases
    self.window_filter = get_scope_variable(
        'window', 'w',
        shape=[self.window],
        initializer=tf.contrib.layers.xavier_initializer_conv2d())
    self.bases = get_scope_variable(
        'bases', 'bases',
        shape=[self.window, self.N],
        initializer=tf.contrib.layers.xavier_initializer_conv2d())
    self.conv_filter = tf.reshape(
        tf.abs(tf.expand_dims(self.window_filter, 1)) * self.bases,
        [1, self.window, 1, self.N])

    variable_summaries(self.conv_filter)

    # 1 Dimensional convolution along T axis with a window length =
    # self.window and N filters -> Create a [Btot, 1, T, N]
    self.T = tf.shape(input_front)[2]

    if self.with_max_pool:
        self.X = tf.nn.conv2d(input_front, self.conv_filter,
                              strides=[1, 1, 1, 1], padding="SAME",
                              name='Conv_STFT')
        self.y, self.argmax = tf.nn.max_pool_with_argmax(
            self.X, [1, 1, self.max_pool_value, 1],
            strides=[1, 1, self.max_pool_value, 1],
            padding="SAME", name='output')
        # Parenthesized form: valid on Python 3 and prints the same text as
        # the Python 2 print statement for a single argument.
        print(self.argmax)
    elif self.with_average_pool:
        self.X = tf.nn.conv2d(input_front, self.conv_filter,
                              strides=[1, 1, 1, 1], padding="SAME",
                              name='Conv_STFT')
        # NOTE(review): no `padding` argument is passed here, whereas the
        # other branches use padding="SAME" -- confirm that the layer's
        # default padding is what is wanted.
        self.y = tf.layers.average_pooling2d(
            self.X, (1, self.max_pool_value),
            strides=(1, self.max_pool_value), name='output')
    else:
        # No pooling: the down-sampling is done by the convolution stride
        self.y = tf.nn.conv2d(input_front, self.conv_filter,
                              strides=[1, 1, self.max_pool_value, 1],
                              padding="SAME", name='Conv_STFT')

    # Reshape to Btot batches of T x N images with 1 channel
    # [Btot, 1, T_pool, N] -> [Btot, T-pool, N, 1]
    self.y = tf.transpose(self.y, [0, 2, 3, 1], name='output')

    tf.summary.image('front/output', self.y, max_outputs=3)

    # Flatten every example and sum the absolute activations over the batch
    # axis; the result is compared with self.p through kl_div
    y_shape = tf.shape(self.y)
    y = tf.reshape(self.y, [self.B_tot, y_shape[1] * y_shape[2]])
    self.p_hat = tf.reduce_sum(tf.abs(y), 0)
    self.sparse_constraint = tf.reduce_sum(kl_div(self.p, self.p_hat))

    return self.y
def cost(self):
    """
    Build the training cost op of the Adapt model.

    A reconstruction loss between ``self.x_non_mix`` and ``self.back`` is
    computed first: directly when ``self.pretraining`` is set, otherwise by
    taking a minimum over all permutations of the ``self.S`` outputs.
    ``self.loss`` selects the squared-error term ('l2'), the SDR term
    ('sdr') or, for any other value, a combination of both. Optional
    sparsity, filter-regularization and overlapping penalties are then
    added according to ``self.beta``, ``self.l`` and ``self.overlap_coef``.

    Side effects: registers variable, audio and scalar summaries.

    Returns:
        The total cost tensor.
    """
    # Squared negative part of the front-end output. `nn` is only reported
    # as a summary below; it is not added to the returned cost.
    below_zero = tf.where(self.front < 0, self.front,
                          tf.zeros_like(self.front))
    nn = 10 * tf.reduce_mean(tf.reduce_sum(tf.square(below_zero), [1, 2, 3]))

    # L2 penalty on both convolution filters, weighted by self.l
    filters_l2 = (tf.nn.l2_loss(self.conv_filter_2) +
                  tf.nn.l2_loss(self.conv_filter))
    regularization = self.l * filters_l2

    # input_shape = [B, S, L]
    if self.pretraining:
        # Squared error: sum over L, sum over the speakers, mean over batches
        squared_error = tf.square(self.x_non_mix - self.back)
        l2 = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_sum(squared_error, axis=-1), -1), -1)

        sdr_improvement, sdr = self.sdr_improvement(self.x_non_mix,
                                                    self.back)
        # The first reduce_mean has no axis and therefore already reduces
        # every dimension; the second call leaves the value unchanged.
        sdr = tf.reduce_mean(sdr)
        sdr = tf.reduce_mean(sdr)

        if self.loss == 'l2':
            loss = l2
        elif self.loss == 'sdr':
            loss = sdr
        else:
            loss = l2 + sdr
    else:
        # Compute the loss over all possible permutations of the outputs,
        # e.g. with S = 3: (0, 1, 2), (0, 2, 1), (1, 0, 2), ...
        speaker_perms = list(permutations(range(self.S)))
        n_perms = len(speaker_perms)

        # Speaker index of every (batch, permutation, slot) triple
        perm_idx = tf.constant(speaker_perms)
        perm_idx = tf.reshape(perm_idx, [1, n_perms, self.S, 1])
        perm_idx = tf.tile(perm_idx, [self.B, 1, 1, 1])

        # Matching batch index
        batch_idx = tf.range(self.B, dtype=tf.int32)
        batch_idx = tf.reshape(batch_idx, shape=[self.B, 1, 1, 1])
        batch_idx = tf.tile(batch_idx, [1, n_perms, self.S, 1])

        # Matching permutation index
        perm_slot = tf.range(n_perms, dtype=tf.int32)
        perm_slot = tf.reshape(perm_slot, shape=[1, n_perms, 1, 1])
        perm_slot = tf.tile(perm_slot, [self.B, 1, self.S, 1])

        indicies = tf.concat([batch_idx, perm_slot, perm_idx], axis=3)

        # [B, P, S, L]: the outputs re-ordered according to each permutation
        tiled_back = tf.reshape(self.back, [self.B, 1, self.S, self.L])
        tiled_back = tf.tile(tiled_back, [1, n_perms, 1, 1])
        permuted_back = tf.gather_nd(tiled_back, indicies)

        X_nmr = tf.reshape(self.x_non_mix, [self.B, 1, self.S, self.L])

        # L2^2 norm over the L axis
        l2 = tf.reduce_sum(tf.square(X_nmr - permuted_back), axis=-1)
        # Minimum over the permutation axis -> [B, S]
        # NOTE(review): the minimum is taken per speaker slot, before the
        # sum over speakers -- confirm this is the intended objective.
        l2 = tf.reduce_min(l2, axis=1)
        l2 = tf.reduce_mean(tf.reduce_sum(l2, -1), -1)

        sdr_improvement, sdr = self.sdr_improvement(X_nmr, self.back, True)
        # Minimum over the permutation axis, sum over the speakers, then
        # mean over batches
        sdr = tf.reduce_min(sdr, 1)
        sdr = tf.reduce_mean(tf.reduce_sum(sdr, -1), -1)

        if self.loss == 'l2':
            loss = l2
        elif self.loss == 'sdr':
            loss = sdr
        else:
            loss = 1e-3 * l2 + sdr

    # Add the optional penalties to the reconstruction loss
    cost_value = loss
    if self.beta != 0.0:
        cost_value = cost_value + self.beta * self.sparse_constraint
    if self.l != 0.0:
        # NOTE(review): `regularization` already carries a factor self.l,
        # so the filter penalty is scaled by self.l twice -- confirm.
        cost_value = cost_value + self.l * regularization
    if self.overlap_coef != 0.0:
        cost_value = (cost_value +
                      self.overlap_coef * self.overlapping_constraint)

    variable_summaries(self.conv_filter_2)
    tf.summary.audio(name="audio/output/reconstructed",
                     tensor=tf.reshape(self.back, [-1, self.L]),
                     sample_rate=config.fs,
                     max_outputs=2)

    with tf.name_scope('loss_values'):
        tf.summary.scalar('l2_loss', l2)
        tf.summary.scalar('SDR', sdr)
        tf.summary.scalar('SDR_improvement', sdr_improvement)
        tf.summary.scalar('Non negativity loss', nn)
        tf.summary.scalar('sparsity', tf.reduce_mean(self.p_hat))
        tf.summary.scalar('sparsity_loss',
                          self.beta * self.sparse_constraint)
        tf.summary.scalar('L2_reg', self.l * regularization)
        tf.summary.scalar('loss', cost_value)
        tf.summary.scalar('overlapping', self.overlapping)
        tf.summary.scalar('overlapping_loss',
                          self.overlap_coef * self.overlapping_constraint)

    return cost_value
def cost(self):
    """
    Construct the cost function op for the negative sampling cost.

    The positive term scores every embedded T-F vector against the vectors
    of the speakers present in the batch, with cost
    ``-log(sigmoid(self.y * dot))``. When ``self.sampling`` is not None an
    additional term, weighted by ``self.ns_rate``, pushes the embeddings
    away from ``self.sampling`` other speaker vectors, chosen according to
    ``self.ns_method``:

    * 'k-nearest': the top-K speaker vectors (by dot product) of the
      dominant speaker of each T-F bin;
    * 'random': K randomly shuffled speakers that are not in the batch.

    Side effects: registers variable summaries for every trainable variable
    whose name contains 'prediction', and a scalar summary named 'cost'.

    Returns:
        Scalar tensor holding the mean cost.

    Raises:
        ValueError: if negative sampling is enabled and ``self.ns_method``
            is neither 'k-nearest' nor 'random'.
    """
    # Get the embedded T-F vectors from the network
    embedding = self.prediction  # [B, T, F, E]

    # Normalize the speaker vectors and collect the speaker vectors
    # corresponding to the speakers in batch
    if self.normalize:
        speaker_vectors = tf.nn.l2_normalize(self.speaker_vectors, 1)
    else:
        speaker_vectors = self.speaker_vectors

    # Needed with and without negative sampling, so computed once
    I = tf.expand_dims(self.I, axis=2)  # [B, S, 1]
    # Gathering the speaker_vectors [|S|, E]
    Vspeakers = tf.gather_nd(speaker_vectors, I)  # [B, S, E]

    if self.sampling is not None:
        # Get index of dominant speaker
        dominant = tf.argmax(self.y, -1)  # [B, T, F]
        dominant = tf.reshape(dominant, [self.B, -1, 1])  # [B, TF, 1]

        # NOTE(review): dominant_speaker_vector is not consumed anywhere
        # below, and tf.gather indexes self.I along its first axis --
        # confirm whether these ops are still needed.
        dominant_speaker = tf.gather(self.I, dominant)
        dominant_speaker_vector = tf.gather_nd(
            tf.expand_dims(speaker_vectors, 1), dominant_speaker)
        dominant_speaker_vector = tf.reshape(
            dominant_speaker_vector,
            [self.B, -1, self.F, self.embedding_size])
        dominant_speaker_vector = tf.expand_dims(
            dominant_speaker_vector, 3)  # [B, T, F, 1, E]

        if self.ns_method == 'k-nearest':
            # For each speaker vector get the K-neighbors
            with tf.name_scope('K-Neighbors'):
                # [B, S, 1, E]
                Vspeakers_ext = tf.expand_dims(Vspeakers, 2)
                # [1, 1, |S|, E]
                speaker_vectors_ext = tf.expand_dims(
                    tf.expand_dims(speaker_vectors, 0), 0)

                # dot product [B, S, |S|]
                prod_dot = tf.reduce_sum(
                    Vspeakers_ext * speaker_vectors_ext, 3)

                # K neighbors [B, S, K]
                _, k_neighbors = tf.nn.top_k(prod_dot, k=self.sampling,
                                             sorted=False)
                k_neighbors = tf.reshape(k_neighbors, [-1, 1])

                # K neighbors vectors [B, S, K, E]
                k_neighbors_vectors = tf.gather_nd(speaker_vectors,
                                                   k_neighbors)
                k_neighbors_vectors = tf.reshape(
                    k_neighbors_vectors,
                    [self.B, self.S, self.sampling, self.embedding_size])

                # (batch, dominant speaker) index pair for every T-F bin
                batch_range = tf.tile(
                    tf.reshape(tf.range(tf.cast(self.B, tf.int64),
                                        dtype=tf.int64),
                               shape=[self.B, 1, 1]),
                    [1, tf.shape(dominant)[1], 1])
                indices = tf.concat([batch_range, dominant], axis=2)

                # Gathered K-nearest neighbors on each tf bins for the
                # dominant [B, T, F, K, E]
                vectors_tf = tf.reshape(
                    tf.gather_nd(k_neighbors_vectors, indices),
                    [self.B, -1, self.F, self.sampling,
                     self.embedding_size])

        elif self.ns_method == 'random':
            # Select randomly K other vectors, except the one in the batch
            with tf.name_scope('Random'):
                ext_I = tf.cast(tf.expand_dims(self.I, 1), tf.int32)
                ranges = tf.cast(
                    tf.tile(
                        tf.reshape(tf.range(self.num_speakers),
                                   [1, self.num_speakers, 1]),
                        [self.B, 1, 1]),
                    tf.int32)

                # [B, |S|] boolean mask: True for speakers not in the batch
                indices_available = tf.logical_not(
                    tf.reduce_any(tf.equal(ext_I, ranges), -1))
                # Squeeze only the trailing axis: a bare tf.squeeze would
                # also drop the batch axis when B == 1 and no longer match
                # the rank of the mask.
                indices_available = tf.boolean_mask(
                    tf.squeeze(ranges, axis=[2]), indices_available)
                # [B, |S| - S]
                indices_available = tf.reshape(
                    indices_available,
                    [self.B, self.num_speakers - self.S])

                shuffled_indices = tf.map_fn(
                    lambda x: tf.random_shuffle(x, seed=42),
                    indices_available)
                rand_I = shuffled_indices[:, :self.sampling]  # [B, K]
                rand_I = tf.expand_dims(rand_I, 2)  # [B, K, 1]

                # Gathering the speaker_vectors [B, K, E]
                Vspeakers_other = tf.gather_nd(speaker_vectors, rand_I)
                vectors_tf = tf.reshape(
                    Vspeakers_other,
                    [self.B, 1, 1, self.sampling, self.embedding_size])

        else:
            # Fail with a clear message instead of hitting an unbound
            # `vectors_tf` further down.
            raise ValueError(
                "Unknown ns_method %r: expected 'k-nearest' or 'random'"
                % (self.ns_method,))

        # Additional term for the loss:
        # -log(sigmoid(-d)) == softplus(d), written with softplus because
        # log(sigmoid(.)) underflows to -inf for large d.
        embedding_ext = tf.expand_dims(embedding, 3)
        doto = tf.reduce_sum(vectors_tf * embedding_ext, -1)
        c = tf.nn.softplus(doto)  # [B, T, F, K]
        neg_sampl = tf.reduce_mean(c, -1)  # [B, T, F]

    # Expand the dimensions in preparation for broadcasting
    Vspeakers_broad = tf.expand_dims(Vspeakers, 1)
    Vspeakers_broad = tf.expand_dims(Vspeakers_broad, 1)
    embedding_broad = tf.expand_dims(embedding, 3)

    # Compute the dot product between the embedding vectors and speaker
    # vectors
    dot = tf.reduce_sum(Vspeakers_broad * embedding_broad, 4)

    # Compute the cost for every element:
    # -log(sigmoid(z)) == softplus(-z), the numerically stable form
    cost = tf.nn.softplus(tf.negative(self.y * dot))

    # Average the cost over all speakers in the input
    cost = tf.reduce_mean(cost, 3)

    if self.sampling is not None:
        cost += self.ns_rate * neg_sampl

    # Average the cost over all batches
    cost = tf.reduce_mean(cost, 0)

    # Summaries for the variables of the prediction network
    training_vars = tf.trainable_variables()
    for var in training_vars:
        if 'prediction' in var.name:
            variable_summaries(var)

    # Average the cost over all T-F elements. Here is where weighting to
    # account for gradient confidence can occur
    cost = tf.reduce_mean(cost)

    tf.summary.scalar('cost', cost)

    return cost