def cost(self):
        """
		Construct the cost function op for the negative sampling cost
		"""

        # Get the embedded T-F vectors from the network
        embedding = self.prediction

        # Add a trailing axis to I so it can index speaker_vectors via gather_nd
        I = tf.expand_dims(self.I, axis=2)

        # Normalize the speaker vectors and collect the speaker vectors
        # corresponding to the speakers in batch
        if self.normalize:
            speaker_vectors = tf.nn.l2_normalize(self.speaker_vectors, 1)
        else:
            speaker_vectors = self.speaker_vectors

        Vspeakers = tf.gather_nd(speaker_vectors, I)

        # Expand the dimensions in preparation for broadcasting
        Vspeakers_broad = tf.expand_dims(Vspeakers, 1)
        Vspeakers_broad = tf.expand_dims(Vspeakers_broad, 1)
        embedding_broad = tf.expand_dims(embedding, 3)

        # Compute the dot product between the embedding vectors and speaker
        # vectors
        dot = tf.reduce_sum(Vspeakers_broad * embedding_broad, 4)
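        # dot has shape [B, T, F, S]: one affinity score per T-F bin and speaker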

        # Compute the negative-sampling cost for every T-F element
        cost = -tf.log(tf.nn.sigmoid(self.y * dot))

        # Average the cost over all speakers in the input
        cost = tf.reduce_mean(cost, 3)

        # Average the cost over all batches
        cost = tf.reduce_mean(cost, 0)

        training_vars = tf.trainable_variables()
        for var in training_vars:
            if 'prediction' in var.name:
                variable_summaries(var)

        # Average the cost over all T-F elements.  Here is where weighting to
        # account for gradient confidence can occur
        cost = tf.reduce_mean(cost)

        tf.summary.scalar('cost', cost)

        #cost = cost + 0.001*self.adapt_front.l*reg

        # tf.summary.scalar('regularized', cost)
        return cost
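A minimal NumPy sketch (illustrative only; toy sizes, all names hypothetical) of the shape flow in the negative-sampling cost above:

import numpy as np

B, T, F, E, S = 2, 4, 5, 10, 3
embedding = np.random.randn(B, T, F, E)               # network output [B, T, F, E]
Vspeakers = np.random.randn(B, S, E)                  # gathered speaker vectors [B, S, E]
y = np.random.choice([-1.0, 1.0], size=(B, T, F, S))  # +1 target / -1 non-target labels

# Broadcast [B, 1, 1, S, E] against [B, T, F, 1, E] and sum over E
dot = np.sum(Vspeakers[:, None, None] * embedding[:, :, :, None], axis=4)

cost = -np.log(1.0 / (1.0 + np.exp(-y * dot)))  # -log sigmoid(y * dot), [B, T, F, S]
cost = cost.mean(axis=3).mean(axis=0).mean()    # speakers, batches, then T-F bins
print(cost)  # scalar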
    def front(self):
        # Front-End

        # x : [B_tot, 1, L, 1]
        # Equivalent to B_tot batches of height-1, width-L, single-channel images,
        # so a 1-D convolution can be done with Conv2D
        input_front = tf.reshape(self.x, [self.B_tot, 1, self.L, 1])

        # Filter [filter_height, filter_width, input_channels, output_channels] = [1, W, 1, N]
        self.window_filter = get_scope_variable(
            'window',
            'w',
            shape=[self.window],
            initializer=tf.contrib.layers.xavier_initializer_conv2d())
        self.bases = get_scope_variable(
            'bases',
            'bases',
            shape=[self.window, self.N],
            initializer=tf.contrib.layers.xavier_initializer_conv2d())
        self.conv_filter = tf.reshape(
            tf.abs(tf.expand_dims(self.window_filter, 1)) * self.bases,
            [1, self.window, 1, self.N])
        # self.conv_filter = get_scope_variable('filters_front','filters_front', shape=[1, self.window, 1, self.N])
        variable_summaries(self.conv_filter)

        # 1-D convolution along the T axis with a window of length self.window
        # and N = 256 filters -> creates a [B_tot, 1, T, N] tensor
        self.T = tf.shape(input_front)[2]

        if self.with_max_pool:
            self.X = tf.nn.conv2d(input_front,
                                  self.conv_filter,
                                  strides=[1, 1, 1, 1],
                                  padding="SAME",
                                  name='Conv_STFT')
            self.y, self.argmax = tf.nn.max_pool_with_argmax(
                self.X, [1, 1, self.max_pool_value, 1],
                strides=[1, 1, self.max_pool_value, 1],
                padding="SAME",
                name='output')
            print(self.argmax)
        elif self.with_average_pool:
            self.X = tf.nn.conv2d(input_front,
                                  self.conv_filter,
                                  strides=[1, 1, 1, 1],
                                  padding="SAME",
                                  name='Conv_STFT')
            # self.y = tf.nn.avg_pool(self.X, [1, 1, self.max_pool_value, 1], [1, 1, self.max_pool_value, 1], padding="SAME")
            self.y = tf.layers.average_pooling2d(
                self.X,
                (1, self.max_pool_value),
                strides=(1, self.max_pool_value),
                name='output')
        else:
            self.y = tf.nn.conv2d(input_front,
                                  self.conv_filter,
                                  strides=[1, 1, self.max_pool_value, 1],
                                  padding="SAME",
                                  name='Conv_STFT')

        # Reshape to B_tot batches of T_pool x N images with 1 channel
        # [B_tot, 1, T_pool, N] -> [B_tot, T_pool, N, 1]
        self.y = tf.transpose(self.y, [0, 2, 3, 1], name='output')

        tf.summary.image('front/output', self.y, max_outputs=3)
        y_shape = tf.shape(self.y)
        y = tf.reshape(self.y, [self.B_tot, y_shape[1] * y_shape[2]])
        self.p_hat = tf.reduce_sum(tf.abs(y), 0)
        self.sparse_constraint = tf.reduce_sum(kl_div(self.p, self.p_hat))

        return self.y
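A hedged standalone sketch (TF1 style, matching the code above; sizes are illustrative) of the Conv1D-via-Conv2D trick used in front(): height-1 "images" convolved with height-1 filters amount to a 1-D convolution along time.

import numpy as np
import tensorflow as tf

B_tot, L, window, N = 2, 1024, 32, 16
x = tf.constant(np.random.randn(B_tot, 1, L, 1), dtype=tf.float32)
filt = tf.constant(np.random.randn(1, window, 1, N), dtype=tf.float32)

X = tf.nn.conv2d(x, filt, strides=[1, 1, 1, 1], padding="SAME")

with tf.Session() as sess:
    print(sess.run(tf.shape(X)))  # [2, 1, 1024, 16] -> [B_tot, 1, T, N]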
    def cost(self):
        # Definition of cost for Adapt model
        # Regularisation
        # shape = [B_tot, T, N]
        # regularization = tf.nn.l2_loss(self.conv_filter_2) + tf.nn.l2_loss(self.conv_filter)

        # non_negativity_1 = tf.reduce_sum(tf.square(tf.where(self.conv_filter < 0, self.conv_filter, tf.zeros_like(self.conv_filter))))
        # non_negativity_2 = tf.reduce_sum(tf.square(tf.where(self.conv_filter_2 < 0, self.conv_filter_2, tf.zeros_like(self.conv_filter_2))))
        # nn = 0.5 * (non_negativity_1 + non_negativity_2)
        neg = tf.square(
            tf.where(self.front < 0, self.front, tf.zeros_like(self.front)))
        nn = 10 * tf.reduce_mean(tf.reduce_sum(neg, [1, 2, 3]))

        regularization = self.l * (tf.nn.l2_loss(self.conv_filter_2) +
                                   tf.nn.l2_loss(self.conv_filter))

        # input_shape = [B, S, L]
        # Squared L2 norm along the L axis:
        if self.pretraining:

            l2 = tf.reduce_sum(tf.square(self.x_non_mix - self.back), axis=-1)
            l2 = tf.reduce_sum(l2, -1)  # Sum over all the speakers
            l2 = tf.reduce_mean(l2, -1)  # Mean over batches

            sdr_improvement, sdr = self.sdr_improvement(
                self.x_non_mix, self.back)
            sdr = tf.reduce_mean(sdr)  # Mean over speakers and batches

            if self.loss == 'l2':
                loss = l2
            elif self.loss == 'sdr':
                loss = sdr
            else:
                loss = l2 + sdr

            # loss = loss + nn

        else:
            # Compute loss over all possible permutations

            # e.g. with 3 speakers: [0, 1, 2], [0, 2, 1], [1, 0, 2],
            # [1, 2, 0], [2, 0, 1], [2, 1, 0]
            perms = list(permutations(range(self.S)))
            length_perm = len(perms)
            perms = tf.reshape(tf.constant(perms), [1, length_perm, self.S, 1])
            perms = tf.tile(perms, [self.B, 1, 1, 1])

            batch_range = tf.tile(
                tf.reshape(tf.range(self.B, dtype=tf.int32),
                           shape=[self.B, 1, 1, 1]),
                [1, length_perm, self.S, 1])
            perm_range = tf.tile(
                tf.reshape(tf.range(length_perm, dtype=tf.int32),
                           shape=[1, length_perm, 1, 1]),
                [self.B, 1, self.S, 1])
            indices = tf.concat([batch_range, perm_range, perms], axis=3)
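            # Each (batch, permutation, source) triple above reorders the S
            # estimates once per permutation:
            # permuted_back[b, p, s] = back[b, perms[p][s]]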

            # [B, P, S, L]
            permuted_back = tf.gather_nd(
                tf.tile(tf.reshape(self.back, [self.B, 1, self.S, self.L]),
                        [1, length_perm, 1, 1]), indices)

            X_nmr = tf.reshape(self.x_non_mix, [self.B, 1, self.S, self.L])

            l2 = tf.reduce_sum(tf.square(X_nmr - permuted_back),
                               axis=-1)  # squared L2 norm
            l2 = tf.reduce_min(l2, axis=1)  # Min over permutations -> [B, S]
            l2 = tf.reduce_sum(l2, -1)  # Sum over speakers
            l2 = tf.reduce_mean(l2, -1)  # Mean over batches

            sdr_improvement, sdr = self.sdr_improvement(X_nmr, self.back, True)
            sdr = tf.reduce_min(sdr, 1)  # Min over permutations -> [B, S]
            sdr = tf.reduce_sum(sdr, -1)  # Sum over speakers
            sdr = tf.reduce_mean(sdr, -1)  # Mean over batches

            if self.loss == 'l2':
                loss = l2
            elif self.loss == 'sdr':
                loss = sdr
            else:
                loss = 1e-3 * l2 + sdr

        # Total cost: loss plus the optional penalty terms
        cost_value = loss
        if self.beta != 0.0:
            cost_value += self.beta * self.sparse_constraint
        if self.l != 0.0:
            cost_value += self.l * regularization
        if self.overlap_coef != 0.0:
            cost_value += self.overlap_coef * self.overlapping_constraint

        variable_summaries(self.conv_filter_2)

        tf.summary.audio(name="audio/output/reconstructed",
                         tensor=tf.reshape(self.back, [-1, self.L]),
                         sample_rate=config.fs,
                         max_outputs=2)

        with tf.name_scope('loss_values'):
            tf.summary.scalar('l2_loss', l2)
            tf.summary.scalar('SDR', sdr)
            tf.summary.scalar('SDR_improvement', sdr_improvement)
            tf.summary.scalar('non_negativity_loss', nn)
            tf.summary.scalar('sparsity', tf.reduce_mean(self.p_hat))
            tf.summary.scalar('sparsity_loss',
                              self.beta * self.sparse_constraint)
            tf.summary.scalar('L2_reg', self.l * regularization)
            tf.summary.scalar('loss', cost_value)
            tf.summary.scalar('overlapping', self.overlapping)
            tf.summary.scalar('overlapping_loss',
                              self.overlap_coef * self.overlapping_constraint)

        return cost_value
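A minimal NumPy sketch (illustrative only) of the permutation-invariant L2 term computed above: score every speaker ordering and keep the best one per batch item.

import numpy as np
from itertools import permutations

B, S, L = 2, 3, 8
x_non_mix = np.random.randn(B, S, L)  # reference sources
back = np.random.randn(B, S, L)       # separated estimates

perms = list(permutations(range(S)))  # S! candidate orderings
# [B, P]: squared-L2 error of each ordering, summed over speakers and samples
errs = np.stack([((x_non_mix - back[:, list(p)]) ** 2).sum(axis=(1, 2)) for p in perms],
                axis=1)
loss = errs.min(axis=1).mean()        # best permutation per item, mean over batch
print(loss)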
    def cost(self):
        """
		Construct the cost function op for the negative sampling cost
		"""

        # Get the embedded T-F vectors from the network
        embedding = self.prediction  # [B, T, F, E]

        # Normalize the speaker vectors and collect the speaker vectors
        # corresponding to the speakers in batch
        if self.normalize:
            speaker_vectors = tf.nn.l2_normalize(self.speaker_vectors, 1)
        else:
            speaker_vectors = self.speaker_vectors

        if self.sampling is None:
            I = tf.expand_dims(self.I, axis=2)  # [B, S, 1]
            # Gathering the speaker_vectors [|S|, E]
            Vspeakers = tf.gather_nd(speaker_vectors, I)  # [B, S, E]
        else:

            I = tf.expand_dims(self.I, axis=2)

            # Gathering the speaker_vectors [B, S, E]
            Vspeakers = tf.gather_nd(speaker_vectors, I)
            # Get index of dominant speaker
            dominant = tf.argmax(self.y, -1)  # [B, T, F]

            # [B, TF, 1]
            dominant = tf.reshape(dominant, [self.B, -1, 1])

            dominant_speaker = tf.gather(self.I, dominant)  # [B, TF]
            dominant_speaker_vector = tf.gather_nd(
                tf.expand_dims(speaker_vectors, 1),
                dominant_speaker)  # [B, TF, E]
            dominant_speaker_vector = tf.reshape(
                dominant_speaker_vector,
                [self.B, -1, self.F, self.embedding_size])
            dominant_speaker_vector = tf.expand_dims(dominant_speaker_vector,
                                                     3)  # [B, T, F, 1, E]

            if self.ns_method == 'k-nearest':
                # For each speaker vector get the K-neighbors
                with tf.name_scope('K-Neighbors'):

                    # [B, S, 1, E]
                    Vspeakers_ext = tf.expand_dims(Vspeakers, 2)

                    # [1, 1, |S|, E]
                    speaker_vectors_ext = tf.expand_dims(
                        tf.expand_dims(speaker_vectors, 0), 0)

                    # Dot product -> [B, S, |S|]
                    prod_dot = tf.reduce_sum(
                        Vspeakers_ext * speaker_vectors_ext, 3)

                    # K neighbors [B, S, K]
                    _, k_neighbors = tf.nn.top_k(prod_dot,
                                                 k=self.sampling,
                                                 sorted=False)

                    k_neighbors = tf.reshape(k_neighbors, [-1, 1])

                    # K neighbors vectors [B, S, K, E]
                    k_neighbors_vectors = tf.gather_nd(speaker_vectors,
                                                       k_neighbors)
                    k_neighbors_vectors = tf.reshape(
                        k_neighbors_vectors,
                        [self.B, self.S, self.sampling, self.embedding_size])

                    batch_range = tf.tile(
                        tf.reshape(tf.range(tf.cast(self.B, tf.int64),
                                            dtype=tf.int64),
                                   shape=[self.B, 1, 1]),
                        [1, tf.shape(dominant)[1], 1])
                    indices = tf.concat([batch_range, dominant], axis=2)

                    # Gathered K-nearest neighbors on each tf bins for the dominant
                    # [B, T, F, K, E]
                    vectors_tf = tf.reshape(
                        tf.gather_nd(k_neighbors_vectors, indices), [
                            self.B, -1, self.F, self.sampling,
                            self.embedding_size
                        ])
            elif self.ns_method == 'random':

                # Randomly select K other vectors, excluding the speakers in the batch
                with tf.name_scope('Random'):

                    ext_I = tf.cast(tf.expand_dims(self.I, 1), tf.int32)
                    ranges = tf.cast(
                        tf.tile(
                            tf.reshape(tf.range(self.num_speakers),
                                       [1, self.num_speakers, 1]),
                            [self.B, 1, 1]), tf.int32)

                    # [B, S] boolean mask
                    indices_available = tf.logical_not(
                        tf.reduce_any(tf.equal(ext_I, ranges), -1))

                    indices_available = tf.boolean_mask(
                        tf.squeeze(ranges), indices_available)
                    # [B, |S| - S]
                    indices_available = tf.reshape(
                        indices_available,
                        [self.B, self.num_speakers - self.S])

                    shuffled_indices = tf.map_fn(
                        lambda x: tf.random_shuffle(x, seed=42),
                        indices_available)
                    rand_I = shuffled_indices[:, :self.sampling]  # [B, K]

                    rand_I = tf.expand_dims(rand_I, 2)  # [B, K, 1]

                    # Gathering the speaker_vectors [B, K, E]
                    Vspeakers_other = tf.gather_nd(speaker_vectors, rand_I)
                    vectors_tf = tf.reshape(
                        Vspeakers_other,
                        [self.B, 1, 1, self.sampling, self.embedding_size])

            # Additional term for the loss
            embedding_ext = tf.expand_dims(embedding, 3)

            doto = tf.reduce_sum(vectors_tf * embedding_ext, -1)
            c = -tf.log(tf.nn.sigmoid(tf.negative(doto)))  # [B, T, F, K]
            neg_sampl = tf.reduce_mean(c, -1)  # [B, T, F]

        # Expand the dimensions in preparation for broadcasting
        Vspeakers_broad = tf.expand_dims(Vspeakers, 1)
        Vspeakers_broad = tf.expand_dims(Vspeakers_broad, 1)
        embedding_broad = tf.expand_dims(embedding, 3)

        # Compute the dot product between the embedding vectors and speaker
        # vectors
        dot = tf.reduce_sum(Vspeakers_broad * embedding_broad, 4)

        # Compute the negative-sampling cost for every T-F element
        cost = -tf.log(tf.nn.sigmoid(self.y * dot))

        # Average the cost over all speakers in the input
        cost = tf.reduce_mean(cost, 3)

        if self.sampling is not None:
            cost += self.ns_rate * neg_sampl

        # Average the cost over all batches
        cost = tf.reduce_mean(cost, 0)

        training_vars = tf.trainable_variables()
        for var in training_vars:
            if 'prediction' in var.name:
                variable_summaries(var)

        # Average the cost over all T-F elements.  Here is where weighting to
        # account for gradient confidence can occur
        cost = tf.reduce_mean(cost)

        tf.summary.scalar('cost', cost)

        #cost = cost + 0.001*self.adapt_front.l*reg

        # tf.summary.scalar('regularized', cost)
        return cost
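A hedged NumPy sketch of the 'k-nearest' negative sampling branch above: for each in-batch speaker vector, take the K most similar rows of the global speaker table by dot product (toy sizes; names are illustrative).

import numpy as np

B, S, num_speakers, E, K = 2, 2, 10, 4, 3
speaker_vectors = np.random.randn(num_speakers, E)  # global table [|S|, E]
Vspeakers = np.random.randn(B, S, E)                # in-batch vectors [B, S, E]

# [B, S, |S|]: similarity of every in-batch speaker against the whole table
prod_dot = np.einsum('bse,ne->bsn', Vspeakers, speaker_vectors)

# Indices of the K largest dot products (unsorted), like tf.nn.top_k(sorted=False)
k_neighbors = np.argpartition(-prod_dot, K, axis=2)[:, :, :K]  # [B, S, K]
k_neighbors_vectors = speaker_vectors[k_neighbors]             # [B, S, K, E]
print(k_neighbors_vectors.shape)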