Example No. 1
def create_model(self, args):
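    # Assumed context, not shown in this excerpt: module-level imports
    # (tensorflow as tf, librosa) and the project helpers `harmonic_stacking`
    # and `common`; `self` carries the input spectrogram, the training flag and
    # the window/context parameters.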
    spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        # offset = args.min_note - spectrogram_min_note
        spectrogram = harmonic_stacking(self, self.spectrogram,
                                        args.undertone_stacking,
                                        args.overtone_stacking)

    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))
    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)
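    # Number of spectrogram frames of extra temporal context on each side of
    # the annotated window; it is cropped from the network output below.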

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture == "bittner_improved":
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (5, 5),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual = layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (5, 5),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (9, 3),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (9, 3),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (5, 70),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer

            layer = residual

            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = tf.layers.conv2d(layer,
                                     1, (10, 1), (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture == "bittnerlike":

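            # The "bittnerlike" variant: the same residual scheme with frequency
            # kernel widths 5, 5, 3, 3 and 70, but each conv applies the
            # activation directly and regularization is delegated to
            # `common.regularization` instead of explicit batch norm + dropout.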
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 5), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual = layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 5), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 3), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 3), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 70), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx * 2 + 1, 1),
                                     (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_simple"):
            residual = None
            for i in range(args.stacks):
                layer = tf.layers.conv2d(layer,
                                         8 * args.capacity_multiplier,
                                         (args.conv_ctx, args.conv_range),
                                         (1, 1),
                                         "same",
                                         activation=None)

                layer = activation(layer)

                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer,
                                              args.harmonic_stacking,
                                              args.harmonic_stacking + 1)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx + 1, 1), (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_smooth"):
            residual = None
            ctx_end = 1
            dilations_start = 5
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end or i >= dilations_start else 1
                dil_rate = (1, 1) if i < dilations_start else (2**(
                    i - dilations_start), 1)
                layer = tf.layers.conv2d(layer,
                                         8 * args.capacity_multiplier,
                                         (conv_ctx, args.conv_range), (1, 1),
                                         "same",
                                         activation=None,
                                         dilation_rate=dil_rate)
                print(i, "kernel", (conv_ctx, args.conv_range), "dilation",
                      dil_rate)

                layer = activation(layer)

                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer,
                                              args.harmonic_stacking,
                                              args.harmonic_stacking + 1)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx, 1), (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        self.note_logits = tf.squeeze(layer_cut, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
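            # Voicing head: the gradient-stopped salience output concatenated with
            # the spectrogram is passed through convs whose frequency kernels and
            # strides correspond to one semitone (`note` bins) and one octave,
            # ending in a "valid" conv that collapses the frequency axis to a
            # single voicing logit per frame.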
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram],
                                      axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, note),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, octave),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(
                    voicing_layer,
                    ((0, 0),
                     (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx),
                     (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                1,
                (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]),
                (1, 1),
                "valid",
                activation=None,
                use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example No. 2
def create_model(self, args):
    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        # for spectrograms that start at a lower note than the output
        # spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
        # offset = args.min_note - spectrogram_min_note
        spectrogram = common.harmonic_stacking(
            self, self.spectrogram, args.spectrogram_undertone_stacking,
            args.spectrogram_overtone_stacking)

    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    # if args.specaugment_prob:
    # in_shape = tf.shape(spectrogram)
    # batch_size = in_shape[0]
    # freq_shape = (batch_size, self.bin_count)
    # drop_batch = tf.random.uniform((batch_size, 1))
    # drop_freq_bands = tf.random.uniform((batch_size, 1), maxval=self.bin_count)

    # band_size = tf.random.uniform((batch_size, 1), minval=5, maxval=15)

    # masking_fn = tf.where(np.abs(tf.tile(tf.expand_dims(tf.range(self.bin_count, dtype=tf.float32), 0), [
    #                         batch_size, 1])-drop_freq_bands) < band_size, tf.zeros(freq_shape), tf.ones(freq_shape))

    # mask = tf.where(tf.tile(tf.greater(drop_batch, args.specaugment_prob), [1, self.bin_count]), tf.ones(freq_shape), masking_fn)
    # mask = tf.tile(mask[:, tf.newaxis, :, tf.newaxis], [1, in_shape[1], 1, in_shape[3]])

    # tf.summary.image("spectrogram", spectrogram[:,:,:,1:2])
    # tf.summary.image("spec_mask", mask[:,:,:,:1])
    # spectrogram = spectrogram*tf.cond(self.is_training, lambda: mask, lambda: tf.ones_like(spectrogram))
    # tf.summary.image("spectrogram_masked", spectrogram[:,:,:,:1])

    print("spectrogram shape", spectrogram.shape)

    args_context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture.startswith("deep_hcnn"):
            assert len(args.conv_ctx) <= args.stacks
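            # Per-stack time kernel widths and dilations are read from
            # args.conv_ctx / args.dilations; missing entries repeat the last
            # value, and a negative first entry reverses the schedule.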
            # Prepare kernel sizes (time axis = audio context)
            args_ctx = np.abs(args.conv_ctx)
            args_dils = np.abs(args.dilations)
            ctxs = np.array([
                args_ctx[i] if i < len(args_ctx) else args_ctx[-1]
                for i in range(args.stacks)
            ])
            dils = np.array([
                args_dils[i] if i < len(args_dils) else args_dils[-1]
                for i in range(args.stacks)
            ])
            if args.conv_ctx[0] < 0:
                ctxs = np.array(list(reversed(ctxs)))
            if args.dilations[0] < 0:
                dils = np.array(list(reversed(dils)))
            print(ctxs)

            # Cut the unnecessary context
            needed_context_size = int(
                np.sum(np.ceil((ctxs - 1) / 2)) +
                np.ceil((args.last_conv_kernel[0] - 1) / 2))
            actual_context_size = args_context_size
            print("input context", args_context_size, "actual needed context",
                  needed_context_size)
            if args_context_size < needed_context_size:
                print(
                    "Warning: provided context is shorter than the needed context field of the network"
                )
            elif args_context_size > needed_context_size:
                if args.cut_context:
                    print("Cutting the unnecessary context {} --> ".format(
                        layer.shape),
                          end="")
                    diff = args_context_size - needed_context_size
                    layer = layer[:, diff:-diff, :, :]
                    actual_context_size -= diff
                    print(layer.shape, "context now:", actual_context_size)

            skip = None
            for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils):
                kernel = (conv_ctx, args.conv_range)
                print("add conv2d {} filters, {} kernel".format(
                    args.filters, kernel))
                layer = tf.layers.conv2d(layer,
                                         args.filters,
                                         kernel, (1, 1),
                                         "same",
                                         activation=None,
                                         dilation_rate=(dil, 1))

                layer = activation(layer)

                if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                    print("harmonic stacking {} --> ".format(layer.shape),
                          end="")
                    layer = common.harmonic_stacking(self, layer,
                                                     args.undertone_stacking,
                                                     args.overtone_stacking)
                    print(layer.shape)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

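                # Residual checkpoints: every `residual_hop` stacks (excluding the
                # last `residual_end` ones) the stored skip tensor is added to or
                # concatenated with the current activations.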
                if i < args.stacks - args.residual_end and i % args.residual_hop == 0:
                    if skip is None:
                        print(".- begin residual connection")
                    else:
                        if args.residual_op == "add":
                            print("|- adding residual connection")
                            layer += skip
                        if args.residual_op == "concat":
                            print("|- concatenating residual connection")
                            layer = tf.concat([skip, layer], -1)
                    skip = layer

            layer = tf.layers.conv2d(layer,
                                     1,
                                     args.last_conv_kernel, (1, 1),
                                     "same",
                                     activation=None)
            if actual_context_size > 0:
                layer = layer[:,
                              actual_context_size:-actual_context_size, :, :]

        self.note_logits = tf.squeeze(layer, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
            # Cut the unnecessary context
            voicing_layer = spectrogram
            if args_context_size > 0:
                voicing_layer = spectrogram[:, args_context_size:
                                            -args_context_size, :, :]

            if args.voicing_input == "only_salience":
                voicing_layer = tf.stop_gradient(layer)
            if args.voicing_input == "spectrogram_salience":
                voicing_layer = tf.concat(
                    [tf.stop_gradient(layer), voicing_layer], axis=-1)
            if args.voicing_input == "spectrogram_salience_train":
                voicing_layer = tf.concat([layer, voicing_layer], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, note), (1, 1),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, note), (1, note),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, octave), (1, 1),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, octave), (1, octave),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             1, (1, voicing_layer.shape[2]),
                                             (1, 1),
                                             "valid",
                                             activation=None,
                                             use_bias=True)
            print("last conv output", voicing_layer.shape)
            # print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example No. 3
def create_model(self, args):
    spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        # offset = args.min_note - spectrogram_min_note
        spectrogram = common.harmonic_stacking(self, self.spectrogram,
                                               args.undertone_stacking,
                                               args.overtone_stacking)

    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram
        if args.architecture.startswith("deep_simple"):
            residual = None
            for i in range(args.stacks):
                layer = tf.layers.conv2d(layer,
                                         8 * args.capacity_multiplier,
                                         (args.conv_ctx, args.conv_range),
                                         (1, 1),
                                         "same",
                                         activation=None)

                layer = activation(layer)

                if args.harmonic_stacking:
                    layer = common.harmonic_stacking(
                        self, layer, args.harmonic_stacking,
                        args.harmonic_stacking + 1)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx + 1, 1), (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]
            self.note_logits = tf.squeeze(layer_cut, -1)

        if args.architecture.startswith("deep_smooth"):
            residual = None
            ctx_end = 1
            dilations_start = 5
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end or i >= dilations_start else 1
                dil_rate = (1, 1) if i < dilations_start else (2**(
                    i - dilations_start), 1)
                layer = tf.layers.conv2d(layer,
                                         8 * args.capacity_multiplier,
                                         (conv_ctx, args.conv_range), (1, 1),
                                         "same",
                                         activation=None,
                                         dilation_rate=dil_rate)
                print(i, "kernel", (conv_ctx, args.conv_range), "dilation",
                      dil_rate)

                layer = activation(layer)

                if args.harmonic_stacking:
                    layer = common.harmonic_stacking(
                        self, layer, args.harmonic_stacking,
                        args.harmonic_stacking + 1)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx, 1), (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]
            self.note_logits = tf.squeeze(layer_cut, -1)

        if args.architecture.startswith("deep_lstm"):
            residual = None
            ctx_end = 1
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end else 1
                layer = tf.layers.conv2d(layer,
                                         8 * args.capacity_multiplier,
                                         (conv_ctx, args.conv_range), (1, 1),
                                         "same",
                                         activation=None)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = common.harmonic_stacking(
                        self, layer, args.harmonic_stacking,
                        args.harmonic_stacking + 1)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual
            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx, 1), (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

            # https://www.tensorflow.org/api_docs/python/tf/contrib/cudnn_rnn/CudnnLSTM
            # cell = tf.contrib.cudnn_rnn.CudnnLSTM(1, 128)
            # tf.nn.static_rnn(
            #     cell,
            #     inputs,
            #     initial_state=None,
            #     dtype=None,
            #     sequence_length=None,
            #     scope=None
            # )

            # lstm_sizes = [128, 128]
            # lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in lstm_sizes]
            # # Add dropout to the cell
            # keep_prob_ = 0.9
            # drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob_) for lstm in lstms]
            # # Stack up multiple LSTM layers, for deep learning
            # cell = tf.contrib.rnn.MultiRNNCell(drops)
            # self.note_logits = tf.squeeze(layer_cut, -1)

            print("!!!!!!!")
            print(layer_cut.shape)
            # layer_cut = tf.squeeze(layer_cut, 3)
            layer_cut = spectrogram[:, context_size:-context_size, :, 0]
            print(layer_cut.shape)
            cell = tf.nn.rnn_cell.BasicRNNCell(16)
            # seq_length = tf.fill(tf.shape(layer_cut)[:1], self.annotations_per_window)
            # print(seq_length)
            outputs, _ = tf.nn.dynamic_rnn(cell, layer_cut, dtype=tf.float32)
            # outputs = tf.Print(outputs, [outputs, layer_cut])
            print(outputs.shape)
            # outputs = layer_cut

            outputs = tf.layers.dense(outputs, self.bin_count, activation=None)

            self.note_logits = outputs

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram],
                                      axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, note),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, octave),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(
                    voicing_layer,
                    ((0, 0),
                     (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx),
                     (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                1,
                (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]),
                (1, 1),
                "valid",
                activation=None,
                use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    # multif0 loss ---------
    with tf.name_scope("losses"):

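        # Multi-f0 target: each annotated note is smeared into a Gaussian over the
        # frequency bins (std = args.annotation_smoothing), annotations beyond the
        # first one per frame are scaled by args.miss_weight, unvoiced frames are
        # masked out, and the loss is sigmoid cross-entropy (or focal loss for the
        # "deep_simple_focal" architectures).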
        annotations = self.annotations - args.min_note

        voicing_ref = tf.cast(tf.greater(annotations[:, :, 0], 0), tf.float32)
        loss_names = []
        losses = []
        if self.note_logits is not None:
            if args.annotation_smoothing > 0:
                self.note_probabilities = tf.nn.sigmoid(self.note_logits)
                annotations_per_frame = tf.shape(annotations)[-1]
                note_bins = tf.tile(tf.expand_dims(self.note_bins, 2),
                                    [1, 1, annotations_per_frame, 1])
                note_ref = tf.tile(
                    tf.reshape(annotations, [
                        -1, self.annotations_per_window, annotations_per_frame,
                        1
                    ]), [1, 1, 1, self.bin_count])

                ref_probabilities = tf.exp(-(note_ref - note_bins)**2 /
                                           (2 * args.annotation_smoothing**2))
                ref_probabilities = tf.concat([
                    ref_probabilities[:, :, :1, :],
                    ref_probabilities[:, :, 1:, :] * args.miss_weight
                ],
                                              axis=2)
                ref_probabilities = tf.reduce_sum(ref_probabilities, axis=2)

                # self.note_probabilities = ref_probabilities
                # print(ref_probabilities.eval(), ref_probabilities.shape)

                voicing_weights = tf.tile(tf.expand_dims(voicing_ref, -1),
                                          [1, 1, self.bin_count])

                if args.architecture.startswith("deep_simple_focal"):
                    note_loss = focal_loss(self.note_logits,
                                           ref_probabilities,
                                           weights=voicing_weights)
                else:
                    note_loss = tf.losses.sigmoid_cross_entropy(
                        ref_probabilities,
                        self.note_logits,
                        weights=voicing_weights)

            loss_names.append("note_loss")
            losses.append(note_loss)

        # Melody input, not compatible with multif0 input
        # annotations = self.annotations[:, :, 0] - args.min_note
        # voicing_ref = tf.cast(tf.greater(annotations, 0), tf.float32)
        # loss_names = []
        # losses = []
        # if self.note_logits is not None:
        #     if args.annotation_smoothing > 0:
        #         self.note_probabilities = tf.nn.sigmoid(self.note_logits)
        #         note_ref = tf.tile(tf.reshape(annotations, [-1, self.annotations_per_window, 1]), [1, 1, self.bin_count])
        #         ref_probabilities = tf.exp(-(note_ref-self.note_bins)**2/(2*args.annotation_smoothing**2))

        #         voicing_weights = tf.tile(tf.expand_dims(voicing_ref, -1), [1, 1, self.bin_count])

        #         # miss weights
        #         peak_ref = tf.cast(tf.abs(tf.tile(tf.reshape(annotations, [-1, self.annotations_per_window, 1]), [1, 1, self.bin_count]) - self.note_bins) < 0.5, tf.float32)
        #         miss_weights = tf.ones_like(voicing_weights)*args.miss_weight + peak_ref*(1-args.miss_weight)

        #         note_loss = tf.losses.sigmoid_cross_entropy(ref_probabilities, self.note_logits, weights=voicing_weights*miss_weights)
        #     else:
        #         self.note_probabilities = tf.nn.softmax(self.note_logits)
        #         ref_bins = tf.cast(tf.round(annotations * self.bins_per_semitone), tf.int32)
        #         note_loss = tf.losses.sparse_softmax_cross_entropy(ref_bins, self.note_logits, weights=voicing_ref)

        #     loss_names.append("note_loss")
        #     losses.append(note_loss)

        if args.l2_loss_weight > 0:
            reg_variables = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.reduce_sum(
                tf.constant(args.l2_loss_weight) * reg_variables)

            loss_names.append("l2_loss")
            losses.append(l2_loss)

        if self.voicing_logits is not None:
            voicing_loss = tf.losses.sigmoid_cross_entropy(
                voicing_ref, self.voicing_logits)

            loss_names.append("voicing_loss")
            losses.append(voicing_loss)

    if len(losses) > 1:
        for name, loss in zip(loss_names, losses):
            tf.summary.scalar('metrics/train/' + name, loss)

    self.loss = tf.math.add_n(losses)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example No. 4
def create_model(self, args):
    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
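        # Inline shift-and-pad harmonic stacking (what the other examples delegate
        # to a helper): each under/overtone is a frequency-shifted copy of the
        # spectrogram, padded back to bin_count and concatenated along the
        # channel axis.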
        spectrogram_windows = []
        print("stacking the spectrogram")
        for i in [1 / (x + 2) for x in range(args.undertone_stacking)] + list(
                range(1, args.overtone_stacking + 1)):
            f_ref = 440  # arbitrary reference frequency
            hz = f_ref * i
            interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(
                f_ref)
            int_bins = int(round(interval * self.bins_per_semitone))
            spec_layer = self.spectrogram[:, :,
                                          max(int_bins, 0):self.bin_count +
                                          int_bins, :]
            print(i, "offset", int_bins, "end", self.bin_count + int_bins,
                  "shape", spec_layer.shape)
            if int_bins < 0:
                spec_layer = tf.pad(spec_layer,
                                    ((0, 0), (0, 0), (-int_bins, 0), (0, 0)))

            spec_layer = tf.pad(spec_layer,
                                ((0, 0), (0, 0),
                                 (0, self.bin_count - spec_layer.shape[2]),
                                 (0, 0)))
            print("padded shape", spec_layer.shape)
            spectrogram_windows.append(spec_layer)
        spectrogram = tf.concat(spectrogram_windows, axis=-1)

    else:
        spectrogram = self.spectrogram[:, :, :360, :]

    # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))
    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 5), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)

        residual = layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 5), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 3), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 3), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 70), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = residual
        layer = tf.layers.batch_normalization(layer, training=self.is_training)

        layer = tf.layers.conv2d(layer,
                                 1, (args.last_conv_ctx * 2 + 1, 1), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer_cut = layer[:, context_size:-context_size, :, :]
        # layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=True)

        note_output = tf.squeeze(layer_cut, -1)
        print(note_output.shape)
        self.note_logits = note_output

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram],
                                      axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, note),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, octave),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(
                    voicing_layer,
                    ((0, 0),
                     (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx),
                     (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                1,
                (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]),
                (1, 1),
                "valid",
                activation=None,
                use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    if args.spectrogram == "cqt":
        spec_bin_count = 360
        spec_bins_per_semitone = 5
    if args.spectrogram == "YunNingHung_cqt":
        spec_bin_count = 88
        spec_bins_per_semitone = 1

    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        spectrogram = common.harmonic_stacking(
            self,
            self.spectrogram,
            args.spectrogram_undertone_stacking,
            args.spectrogram_overtone_stacking,
            bin_count=spec_bin_count,
            bins_per_semitone=spec_bins_per_semitone)
    else:
        spectrogram = self.spectrogram
        if args.spectrogram == "cqt":
            spectrogram = self.spectrogram[:, :, :spec_bin_count, :]

    args_context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram
        print("self.spectrogram shape", self.spectrogram.shape)
        print("spectrogram shape", spectrogram.shape)

        if args.architecture.startswith("baseline"):
            # layer = tf.layers.conv2d(layer, args.filters, (args.conv_ctx[0], args.conv_range), strides=(1, 5), padding="same", activation=None)
            #layer = activation(layer)
            #layer = tf.layers.average_pooling2d(layer, (5, 1), (5, 1))
            layer = tf.layers.flatten(layer)
            layer = tf.layers.dense(layer, 100, use_bias=(not args.batchnorm))
            if args.batchnorm:
                layer = tf.layers.batch_normalization(
                    layer, training=self.is_training)

            layer = activation(layer)
            layer = tf.layers.dense(
                layer, args.note_range * args.annotations_per_window)
            layer = tf.reshape(
                layer, (-1, args.annotations_per_window, args.note_range))
            self.note_logits = layer
            # layer_cut = layer[:, args_context_size:-args_context_size, :, :]
            # self.note_logits = tf.squeeze(layer_cut, -1)
            print("note_logits shape", self.note_logits.shape)

        if args.architecture.startswith("LY"):
            # batch_size, annotations_per_wind, time, freq
            def conv_block(self, layer, args, channels, kernel, time_padding):
                layer = tf.pad(layer, ((0, 0), (time_padding, time_padding),
                                       (0, 0), (0, 0)))
                layer = tf.layers.conv2d(layer,
                                         channels,
                                         kernel,
                                         padding="valid",
                                         activation=None,
                                         use_bias=False)
                if args.batchnorm:
                    layer = tf.layers.batch_normalization(
                        layer, training=self.is_training)
                layer = activation(layer)

                return layer

            print(layer.shape)
            layer = conv_block(self, layer, args, args.filters,
                               (7, spectrogram.shape[2]), 3)
            print(layer.shape)
            layer = tf.layers.max_pooling2d(layer, (3, 1), (3, 1))
            print(layer.shape)
            layer = conv_block(self, layer, args, args.filters, (7, 1), 3)
            print(layer.shape)
            layer = tf.layers.max_pooling2d(layer, (3, 1), (3, 1))
            print(layer.shape)
            layer = conv_block(self, layer, args, 16 * args.filters, (1, 1), 0)
            print(layer.shape)
            layer = conv_block(self, layer, args, 16 * args.filters, (1, 1), 0)
            print(layer.shape)
            layer = tf.layers.conv2d(layer,
                                     self.note_range, (1, 1),
                                     padding="valid",
                                     activation=None)
            print(layer.shape)
            # squeeze frequency dimension
            layer = tf.squeeze(layer, 2)

            self.note_logits = layer
            print("note_logits shape", self.note_logits.shape)

        if args.architecture.startswith("deep_hcnn"):
            assert len(args.conv_ctx) <= args.stacks
            # Prepare kernel sizes (time axis = audio context)
            args_ctx = np.abs(args.conv_ctx)
            args_dils = np.abs(args.dilations)
            ctxs = np.array([
                args_ctx[i] if i < len(args_ctx) else args_ctx[-1]
                for i in range(args.stacks)
            ])
            dils = np.array([
                args_dils[i] if i < len(args_dils) else args_dils[-1]
                for i in range(args.stacks)
            ])
            if args.conv_ctx[0] < 0:
                ctxs = np.array(list(reversed(ctxs)))
            if args.dilations[0] < 0:
                dils = np.array(list(reversed(dils)))
            print(ctxs)

            # Cut the unnecessary context
            needed_context_size = int(
                np.sum(np.ceil((ctxs - 1) / 2)) +
                np.ceil((args.last_conv_kernel[0] - 1) / 2))
            actual_context_size = args_context_size
            print("input context", args_context_size, "actual needed context",
                  needed_context_size)
            if args_context_size < needed_context_size:
                print(
                    "Warning: provided context is shorter than the needed context field of the network"
                )
            elif args_context_size > needed_context_size:
                if args.cut_context:
                    print("Cutting the unnecessary context {} --> ".format(
                        layer.shape),
                          end="")
                    diff = args_context_size - needed_context_size
                    layer = layer[:, diff:-diff, :, :]
                    actual_context_size -= diff
                    print(layer.shape, "context now:", actual_context_size)

            skip = None
            for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils):
                kernel = (conv_ctx, args.conv_range)

                if i > 0 and args.faster_hcnn:
                    print("add hconv2d {} filters, {} kernel".format(
                        args.filters, kernel))
                    layer = common.hconv2d(
                        layer,
                        args.filters,
                        kernel,
                        args.undertone_stacking,
                        args.overtone_stacking,
                        60,  # bins per octave
                        padding="same",
                        activation=None,
                        dilation_rate=(dil, 1),
                        use_bias=bool(args.use_bias))
                    print(layer.shape)
                else:
                    print("add conv2d {} filters, {} kernel".format(
                        args.filters, kernel))
                    layer = tf.layers.conv2d(layer,
                                             args.filters,
                                             kernel, (1, 1),
                                             padding="same",
                                             activation=None,
                                             dilation_rate=(dil, 1),
                                             use_bias=bool(args.use_bias))
                    print(layer.shape)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)
                layer = activation(layer)

                if (not args.faster_hcnn) and (args.undertone_stacking > 0
                                               or args.overtone_stacking > 1
                                               ) and i < args.stacking_until:
                    print("harmonic stacking {} --> ".format(layer.shape),
                          end="")
                    layer = common.harmonic_stacking(self,
                                                     layer,
                                                     args.undertone_stacking,
                                                     args.overtone_stacking,
                                                     bin_count=360,
                                                     bins_per_semitone=5)
                    print(layer.shape)

                if i < args.stacks - args.residual_end and i % args.residual_hop == 0:
                    if skip is None:
                        print(".- begin residual connection")
                    else:
                        if args.residual_op == "add":
                            print("|- adding residual connection")
                            layer += skip
                        if args.residual_op == "concat":
                            print("|- concatenating residual connection")
                            layer = tf.concat([skip, layer], -1)
                    skip = layer
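                # (E.g. with residual_hop == 1 and residual_end == 0, every stack after the first
                #  adds or concatenates the previous checkpoint and then becomes the new one;
                #  "add" needs matching channel counts, "concat" grows the channel axis instead.)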

            if args.last_pooling == "globalavg":
                layer = tf.layers.average_pooling2d(layer, (1, 360), (1, 360))
            if args.last_pooling == "avg":
                layer = tf.layers.average_pooling2d(layer, (1, 5), (1, 5))
            if args.last_pooling == "max":
                layer = tf.layers.max_pooling2d(layer, (1, 5), (1, 5))
            if args.last_pooling == "maxoct":
                layer = tf.layers.max_pooling2d(layer, (1, 60), (1, 60))
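            # Rough frequency-axis shapes after pooling, assuming the axis still holds 360 bins
            # (5 per semitone) here: "globalavg" -> 1 bin, "avg"/"max" -> 72 bins (1 per semitone),
            # "maxoct" -> 6 bins (1 per octave).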

            if args.faster_hcnn:
                print("add last hconv2d {} filters, {} kernel".format(
                    args.filters, kernel))
                layer = common.hconv2d(
                    layer,
                    args.note_range,
                    args.last_conv_kernel,
                    args.undertone_stacking,
                    args.overtone_stacking,
                    12,  # bins per octave
                    padding="valid",
                    activation=None,
                    use_bias=bool(args.use_bias))
                print(layer.shape)
            else:
                print("add last conv2d {} filters, {} kernel".format(
                    args.filters, kernel))
                layer = tf.layers.conv2d(layer,
                                         args.note_range,
                                         args.last_conv_kernel, (1, 1),
                                         padding="valid",
                                         activation=None,
                                         use_bias=bool(args.use_bias))
                print(layer.shape)

            if actual_context_size > 0:
                layer = layer[:,
                              actual_context_size:-actual_context_size, :, :]

            self.note_logits = tf.squeeze(layer, 2)
            print("note_logits shape", self.note_logits.shape)

    if args.class_weighting:
        weights = self.class_weights
    else:
        weights = None

    self.loss = common.loss_mir(self, args, weights=weights)
    self.est_notes = tf.constant(0)  # placeholder, we compute est_notes on cpu
    self.training = common.optimizer(self, args)
def create_model(self, args):
    context_size = int(self.context_width/self.spectrogram_hop_size)

    with tf.name_scope('model_pitch'):
        self.note_logits = None
        self.note_probabilities = self.spectrogram[:, context_size:-context_size, :360, 0]
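        # (This baseline skips a pitch network entirely: the cropped spectrogram bins are used
        #  directly as note probabilities and note_logits stays None, so presumably only the
        #  voicing model below contributes trainable outputs.)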
    

    with tf.name_scope('model_voicing'):
        # voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)

        if args.harmonic_stacking > 1:
            spectrogram_windows = []
            print("stacking the spectrogram")
            for i in range(args.harmonic_stacking):
                f_ref = 440 # arbitrary reference frequency
                hz = f_ref*(i+1)
                interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(f_ref)
                int_bins = int(round(interval*self.bins_per_semitone))
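                # Worked example, assuming self.bins_per_semitone == 5: for the 2nd harmonic (i == 1)
                # the interval is 12 semitones, so int_bins == 60 and the window starts one octave up;
                # the choice of f_ref does not matter because only the harmonic ratio enters the interval.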
                spec_layer = self.spectrogram[:, :, int_bins:self.bin_count+int_bins, :]
                print(i+1, "offset", int_bins, "end", self.bin_count+int_bins, "shape", spec_layer.shape)
                spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (0, self.bin_count-spec_layer.shape[2]), (0, 0)))
                print("padded shape", spec_layer.shape)
                spectrogram_windows.append(spec_layer)
            voicing_layer = tf.concat(spectrogram_windows, axis=-1)

        else:
            voicing_layer = self.spectrogram[:, :, :360, :]

        if args.first_pool_type == "avg":
            voicing_layer = tf.layers.average_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
        if args.first_pool_type == "max":
            voicing_layer = tf.layers.max_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
        
        print("after pooling", voicing_layer.shape)

        octave = int(int(voicing_layer.shape[2])/6)
        note = int(int(voicing_layer.shape[2])/6/12)
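        # Sketch of the intent, assuming the 360-bin axis spans 6 octaves at 5 bins per semitone:
        # `octave` is the current number of bins per octave (360/6 = 60 before pooling) and `note`
        # the number of bins per semitone (60/12 = 5); both are recomputed from the current width
        # so they remain consistent after the optional pooling above.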

        if args.activation is not None:
            activation = getattr(tf.nn, args.activation)

        if args.architecture == "full_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
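        # (The pad-then-"valid" pattern above keeps the frame count unchanged while giving the
        #  convolution args.conv_ctx frames of context on each side; the octave/note 1layer
        #  variants below reuse the same idea.)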
        if args.architecture == "octave_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        if args.architecture == "note_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "note_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "note_dilated":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "dilated_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        

        if args.architecture == "note_octave_fix":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "note_octave_octave_temporal":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*7+1, 3), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)



        if args.last_layer == "conv":
            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.last_conv_ctx, args.last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.last_conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", self.voicing_logits.shape)
        if args.last_layer == "dense":
            voicing_layer = tf.layers.flatten(voicing_layer)
            self.voicing_logits = tf.layers.dense(voicing_layer, args.annotations_per_window)
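            # ("dense" flattens the whole window and predicts one voicing logit per annotated frame,
            #  args.annotations_per_window in total, instead of the per-frame convolution of the
            #  "conv" branch above.)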

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        # for spectrograms where the min. frequency doesn't correspond to output min. note
        # spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
        # offset = args.min_note - spectrogram_min_note
        spectrogram = common.harmonic_stacking(
            self,
            self.spectrogram,
            args.spectrogram_undertone_stacking,
            args.spectrogram_overtone_stacking,
            bin_count=360,
            bins_per_semitone=5)

    else:
        # fall back to the raw spectrogram so `spectrogram` is defined when stacking is disabled
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    args_context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram
        print("self.spectrogram shape", self.spectrogram.shape)
        print("spectrogram shape", spectrogram.shape)

        if args.architecture.startswith("deep_hcnn"):
            assert len(args.conv_ctx) <= args.stacks
            # Prepare kernel sizes (time axis = audio context)
            args_ctx = np.abs(args.conv_ctx)
            args_dils = np.abs(args.dilations)
            ctxs = np.array([
                args_ctx[i] if i < len(args_ctx) else args_ctx[-1]
                for i in range(args.stacks)
            ])
            dils = np.array([
                args_dils[i] if i < len(args_dils) else args_dils[-1]
                for i in range(args.stacks)
            ])
            if args.conv_ctx[0] < 0:
                ctxs = np.array(list(reversed(ctxs)))
            if args.dilations[0] < 0:
                dils = np.array(list(reversed(dils)))
            print(ctxs)
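            # Worked example with hypothetical values: for args.stacks == 4, args.conv_ctx == [5, 3]
            # and args.dilations == [1], the lists broadcast to ctxs == [5, 3, 3, 3] and
            # dils == [1, 1, 1, 1]; a negative first entry only reverses the resulting order.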

            # Cut the unnecessary context
            needed_context_size = int(
                np.sum(np.ceil((ctxs - 1) / 2)) +
                np.ceil((args.last_conv_kernel[0] - 1) / 2))
            actual_context_size = args_context_size
            print("input context", args_context_size, "actual needed context",
                  needed_context_size)
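            # Continuing the hypothetical ctxs == [5, 3, 3, 3] with args.last_conv_kernel[0] == 1:
            # needed_context_size == ceil(4/2) + 3*ceil(2/2) + ceil(0/2) == 2 + 3 + 0 == 5 frames per side.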
            if args_context_size < needed_context_size:
                print(
                    "Warning: provided context is shorter than the needed context field of the network"
                )
            elif args_context_size > needed_context_size:
                if args.cut_context:
                    print("Cutting the unnecessary context {} --> ".format(
                        layer.shape),
                          end="")
                    diff = args_context_size - needed_context_size
                    layer = layer[:, diff:-diff, :, :]
                    actual_context_size -= diff
                    print(layer.shape, "context now:", actual_context_size)

            skip = None
            for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils):
                kernel = (conv_ctx, args.conv_range)

                if i > 0 and args.faster_hcnn:
                    print("add hconv2d {} filters, {} kernel".format(
                        args.filters, kernel))
                    layer = common.hconv2d(
                        layer,
                        args.filters,
                        kernel,
                        args.undertone_stacking,
                        args.overtone_stacking,
                        60,  # bins per octave
                        padding="same",
                        activation=None,
                        dilation_rate=(dil, 1),
                        use_bias=bool(args.use_bias))
                    print(layer.shape)
                else:
                    print("add conv2d {} filters, {} kernel".format(
                        args.filters, kernel))
                    layer = tf.layers.conv2d(layer,
                                             args.filters,
                                             kernel, (1, 1),
                                             padding="same",
                                             activation=None,
                                             dilation_rate=(dil, 1),
                                             use_bias=bool(args.use_bias))
                    print(layer.shape)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)
                layer = activation(layer)

                if (not args.faster_hcnn) and (args.undertone_stacking > 0
                                               or args.overtone_stacking > 1):
                    print("harmonic stacking {} --> ".format(layer.shape),
                          end="")
                    layer = common.harmonic_stacking(self,
                                                     layer,
                                                     args.undertone_stacking,
                                                     args.overtone_stacking,
                                                     bin_count=360,
                                                     bins_per_semitone=5)
                    print(layer.shape)

                if i < args.stacks - args.residual_end and i % args.residual_hop == 0:
                    if skip is None:
                        print(".- begin residual connection")
                    else:
                        if args.residual_op == "add":
                            print("|- adding residual connection")
                            layer += skip
                        if args.residual_op == "concat":
                            print("|- concatenating residual connection")
                            layer = tf.concat([skip, layer], -1)
                    skip = layer

            layer = tf.layers.average_pooling2d(layer, (1, 5), (1, 5))

            if args.faster_hcnn:
                print("add last hconv2d {} filters, {} kernel".format(
                    args.filters, kernel))
                layer = common.hconv2d(
                    layer,
                    1,
                    args.last_conv_kernel,
                    args.undertone_stacking,
                    args.overtone_stacking,
                    12,  # bins per octave (after the (1, 5) average pooling above)
                    padding="same",
                    activation=None,
                    use_bias=bool(args.use_bias))
                print(layer.shape)
            else:
                print("add last conv2d {} filters, {} kernel".format(
                    args.filters, kernel))
                layer = tf.layers.conv2d(layer,
                                         1,
                                         args.last_conv_kernel, (1, 1),
                                         padding="same",
                                         activation=None,
                                         use_bias=bool(args.use_bias))
                print(layer.shape)

            if actual_context_size > 0:
                layer = layer[:,
                              actual_context_size:-actual_context_size, :, :]

        self.note_logits = tf.squeeze(layer, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        raise NotImplementedError
    else:
        self.voicing_threshold = tf.Variable(0.5, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss_mf0(self, args)
    self.est_notes = tf.constant(0)  # placeholder, we compute est_notes on cpu
    self.training = common.optimizer(self, args)