Example #1
    def process_file(self, f0_nor, pho_target, singer_index, sess):

        stat_file = h5py.File(config.stat_dir+'stats.hdf5', mode='r')

        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
        stat_file.close()

        in_batches_f0, nchunks_in = utils.generate_overlapadd(np.expand_dims(f0_nor, -1))

        in_batches_pho, nchunks_in_pho = utils.generate_overlapadd(np.expand_dims(pho_target, -1))

        in_batches_pho = in_batches_pho.reshape([in_batches_pho.shape[0], config.batch_size, config.max_phr_len])


        out_batches_feats = []


        for in_batch_f0, in_batch_pho in zip(in_batches_f0, in_batches_pho):
            speaker = np.repeat(singer_index, config.batch_size)
            feed_dict = {self.f0_placeholder: in_batch_f0, self.phoneme_labels: in_batch_pho,
                         self.singer_labels: speaker, self.is_train: False}
            out_feats = sess.run(self.output, feed_dict=feed_dict)
            out_batches_feats.append(out_feats)

        out_batches_feats = np.array(out_batches_feats)

        out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in)

        # map the network output from [-1, 1] back to [0, 1], then undo min-max scaling
        out_batches_feats = out_batches_feats / 2 + 0.5

        out_batches_feats = out_batches_feats * (max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

        return out_batches_feats
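All of these examples pair utils.generate_overlapadd with utils.overlapadd. The project's utils.py is not shown here, but the reshapes imply windows of config.max_phr_len frames grouped into batches of config.batch_size. A minimal sketch of the assumed behavior, using half-overlapping windows and a triangular cross-fade (the real implementation may differ):

import numpy as np

def generate_overlapadd_sketch(x, max_phr_len=30, batch_size=30):
    # split a (time, feats) array into half-overlapping windows, batched
    hop = max_phr_len // 2
    nchunks = int(np.ceil(len(x) / float(hop)))
    padded = np.zeros((nchunks * hop + max_phr_len, x.shape[-1]), dtype=x.dtype)
    padded[:len(x)] = x
    wins = np.stack([padded[i * hop:i * hop + max_phr_len] for i in range(nchunks)])
    n_batches = int(np.ceil(nchunks / float(batch_size)))
    pad = n_batches * batch_size - nchunks
    wins = np.concatenate([wins, np.zeros((pad,) + wins.shape[1:], dtype=x.dtype)])
    return wins.reshape(n_batches, batch_size, max_phr_len, -1), nchunks

def overlapadd_sketch(batches, nchunks, max_phr_len=30):
    # reassemble the windows, cross-fading the 50% overlap with a triangular window
    hop = max_phr_len // 2
    wins = batches.reshape(-1, max_phr_len, batches.shape[-1])[:nchunks]
    tri = np.bartlett(max_phr_len)[:, None]
    out = np.zeros((nchunks * hop + max_phr_len, batches.shape[-1]))
    for i, w in enumerate(wins):
        out[i * hop:i * hop + max_phr_len] += w * tri
    return out

A round trip, overlapadd_sketch(*generate_overlapadd_sketch(x)), returns x up to a constant overlap gain and trailing pad frames, which is why the examples trim the reassembled output back to the input length.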
Example #2
	def process_file(self, mix_stft, sess):

		stat_file = h5py.File(config.stat_dir+'stats.hdf5', mode='r')

		max_voc = np.array(stat_file["voc_stft_maximus"])
		min_voc = np.array(stat_file["voc_stft_minimus"])

		max_feat = np.array(stat_file["feats_maximus"])
		min_feat = np.array(stat_file["feats_minimus"])
		stat_file.close()

		in_batches_stft, nchunks_in = utils.generate_overlapadd(mix_stft)

		in_batches_stft = in_batches_stft / max_voc

		out_batches_feats = []

		for in_batch_stft in in_batches_stft:
			feed_dict = {self.input_placeholder: in_batch_stft}
			harm, ap, f0, vuv = sess.run([self.harm, self.ap, self.f0, self.vuv], feed_dict=feed_dict)

			val_feats = np.concatenate((harm, ap, f0, vuv), axis=-1)
			out_batches_feats.append(val_feats)

		out_batches_feats = np.array(out_batches_feats)

		out_feats = utils.overlapadd(out_batches_feats, nchunks_in)

		# binarize the voiced/unvoiced flag, then undo the min-max scaling
		out_feats[:, -1] = np.round(out_feats[:, -1])

		out_feats = out_feats * (max_feat - min_feat) + min_feat

		return out_feats
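The closing steps of these methods invert the dataset's min-max scaling; Example #2 also rounds the last column back to a binary voiced/unvoiced flag. A toy check of the normalize/denormalize round trip, with made-up statistics:

import numpy as np

max_feat = np.array([2.0, 10.0])   # made-up per-feature maxima
min_feat = np.array([0.0, 2.0])    # made-up per-feature minima

x = np.array([1.5, 6.0])                           # one original feature frame
x_nor = (x - min_feat) / (max_feat - min_feat)     # -> [0.75, 0.5]
x_rec = x_nor * (max_feat - min_feat) + min_feat   # the denormalization used above
assert np.allclose(x_rec, x)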
Example #3
    def process_file(self, mel, speaker_index, speaker_index_2, sess):


        stat_file = h5py.File('./stats_yam.hdf5', mode='r')

        max_feat = stat_file["feats_maximus"][()]
        min_feat = stat_file["feats_minimus"][()]

        stat_file.close()

        # min-max normalize the input mel to [0, 1]
        mel = (mel - min_feat[:-2]) / (max_feat[:-2] - min_feat[:-2])

        in_batches_mel, nchunks_in = utils.generate_overlapadd(mel)

        out_batches_mel = []

        for in_batch_mel in in_batches_mel:
            speaker = np.repeat(speaker_index, config.batch_size)
            speaker_2 = np.repeat(speaker_index_2, config.batch_size)
            feed_dict = {self.input_placeholder: in_batch_mel, self.speaker_labels: speaker,
                         self.speaker_labels_1: speaker_2, self.is_train: False}
            out_mel = sess.run(self.output, feed_dict=feed_dict)  # avoid shadowing the input mel
            out_batches_mel.append(out_mel)
        out_batches_mel = np.array(out_batches_mel)

        out_batches_mel = utils.overlapadd(out_batches_mel, nchunks_in)

        out_batches_mel = out_batches_mel * (max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

        return out_batches_mel
Example #4
def val_generator(train_filename=config.h5py_file_val, in_mode=config.in_mode):
    hdf5_file = h5py.File(train_filename, "r")
    if in_mode == 'voc':
        inps = hdf5_file["voc_stft"]
        feat = "voc_stft"
    elif in_mode == 'mix':
        inps = hdf5_file["mix_stft"]
        feat = "mix_stft"
    else:
        raise ValueError("Unknown in_mode: {}".format(in_mode))
    targ = hdf5_file["feats"]
    num_files = inps.shape[0]
    for i in range(num_files):
        in_batch, nchunks_in = utils.generate_overlapadd(inps[i])

        in_batch = normalize(in_batch, feat)

        targ_batch, nchunks_targ = utils.generate_overlapadd(targ[i])

        targ_batch = normalize(targ_batch, "feats")
        yield in_batch, nchunks_in, targ_batch, nchunks_targ
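val_generator yields one windowed, normalized (input, target) pair per file. A hypothetical consumer loop (model_step stands in for a real network forward pass; utils and val_generator come from the surrounding project code):

import numpy as np

def model_step(batch):
    return batch   # placeholder for an actual model forward pass

for in_batch, nchunks_in, targ_batch, nchunks_targ in val_generator():
    outputs = np.array([model_step(b) for b in in_batch])
    out_feats = utils.overlapadd(outputs, nchunks_in)        # back to (time, feats)
    targ_feats = utils.overlapadd(targ_batch, nchunks_targ)
    n = min(len(out_feats), len(targ_feats))
    print(np.mean((out_feats[:n] - targ_feats[:n]) ** 2))    # e.g. a validation MSE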
Example #5
    def process_file(self, mel, sess):


        stat_file = h5py.File('./stats_yam.hdf5', mode='r')

        max_feat = stat_file["feats_maximus"][()]
        min_feat = stat_file["feats_minimus"][()]

        stat_file.close()

        # inputs are expected to already be min-max normalized; clamp to [0, 1]
        mel = np.clip(mel, 0.0, 1.0)


        in_batches_mel, nchunks_in = utils.generate_overlapadd(mel)

        out_batches_mel = []
        out_f0 = []
        out_vuv = []

        for in_batch_mel in in_batches_mel:
            # synthesis uses a fixed target singer here
            speaker_index = config.singers.index('Nikolas')
            speaker_2 = np.repeat(speaker_index, config.batch_size)
            if config.use_speaker:
                feed_dict = {self.stft_placeholder: in_batch_mel, self.speaker_labels_1: speaker_2, self.is_train: False}
            else:
                feed_dict = {self.stft_placeholder: in_batch_mel, self.is_train: False}
            mel, f0, vuv = sess.run([self.output_stft, self.f0, self.vuv], feed_dict=feed_dict)
            out_batches_mel.append(mel)
            out_f0.append(f0)
            out_vuv.append(vuv)

        out_batches_mel = np.array(out_batches_mel)

        out_batches_mel = utils.overlapadd(out_batches_mel, nchunks_in)
        out_f0 = utils.overlapadd(np.array(out_f0), nchunks_in)
        out_vuv = utils.overlapadd(np.array(out_vuv), nchunks_in)

        # a continuous f0 contour is denormalized back to its original range
        if config.f0_mode == "cont":
            out_f0 = out_f0 * (max_feat[-2] - min_feat[-2]) + min_feat[-2]

        out_batches_mel = out_batches_mel * (max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

        # binarize the voiced/unvoiced decision
        out_vuv = np.round(out_vuv)

        return out_batches_mel, out_f0, out_vuv
Example #6
    def read_input_file(self, file_name):
        """
        Function to read and process input file, given name and the synth_mode.
        Returns features for the file based on mode (0 for hdf5 file, 1 for wav file).
        Currently, only the HDF5 version is implemented.
        """
        # if file_name.endswith('.hdf5'):
        feat_file = h5py.File(config.feats_dir + file_name)

        mix_stft = feat_file['voc_stft'][()]

        part_stft = feat_file['part_stft'][()]

        feat_file.close()

        in_batches_mix_stft, nchunks_in = utils.generate_overlapadd(
            abs(mix_stft))

        return in_batches_mix_stft, np.angle(mix_stft), part_stft, nchunks_in
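Only the magnitude abs(mix_stft) is windowed for the network here; np.angle(mix_stft) is returned untouched so the complex STFT can be rebuilt later. A self-contained check of that split:

import numpy as np

frame = np.fft.rfft(np.random.rand(1024))            # a dummy complex STFT frame
mag, phase = np.abs(frame), np.angle(frame)          # the split used above
assert np.allclose(mag * np.exp(1j * phase), frame)  # recombines losslessly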
Example #7
    def read_input_wav_file(self, file_name):
        """
        Function to read and process input file, given name and the synth_mode.
        Returns features for the file based on mode (0 for hdf5 file, 1 for wav file).
        Currently, only the HDF5 version is implemented.
        """
        audio, fs = librosa.core.load(file_name, sr=config.fs)
        hcqt = sig_process.get_hcqt(audio / 4)

        hcqt = np.swapaxes(hcqt, 0, 1)

        in_batches_hcqt, nchunks_in = utils.generate_overlapadd(
            hcqt.reshape(-1, 6 * 360))
        in_batches_hcqt = in_batches_hcqt.reshape(in_batches_hcqt.shape[0],
                                                  config.batch_size,
                                                  config.max_phr_len, 6, 360)
        in_batches_hcqt = np.swapaxes(in_batches_hcqt, -1, -2)

        return in_batches_hcqt, nchunks_in, hcqt.shape[0]
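The reshape and swapaxes above unflatten each window back into per-frame HCQT tensors of 6 harmonics by 360 CQT bins, with the final swap putting the frequency bins before the harmonic channels. A shape check with dummy data (sizes are illustrative):

import numpy as np

n_batches, batch_size, max_phr_len = 2, 30, 30     # illustrative sizes
flat = np.zeros((n_batches, batch_size, max_phr_len, 6 * 360))

hcqt = flat.reshape(n_batches, batch_size, max_phr_len, 6, 360)
hcqt = np.swapaxes(hcqt, -1, -2)
print(hcqt.shape)   # (2, 30, 30, 360, 6): frequency bins first, harmonics last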
Example #8
    def read_input_file(self, file_name):
        """
        Function to read and process input file, given name and the synth_mode.
        Returns features for the file based on mode (0 for hdf5 file, 1 for wav file).
        Currently, only the HDF5 version is implemented.
        """
        # if file_name.endswith('.hdf5'):
        feat_file = h5py.File(config.feats_dir + file_name)
        atb = feat_file['atb'][()]

        atb = atb[:, 1:]

        hcqt = feat_file['voc_hcqt'][()]

        feat_file.close()

        in_batches_hcqt, nchunks_in = utils.generate_overlapadd(
            hcqt.reshape(-1, 6 * 360))
        in_batches_hcqt = in_batches_hcqt.reshape(in_batches_hcqt.shape[0],
                                                  config.batch_size,
                                                  config.max_phr_len, 6, 360)
        in_batches_hcqt = np.swapaxes(in_batches_hcqt, -1, -2)
        return in_batches_hcqt, atb, nchunks_in
Example #9
def eval_file():
    file_path = config.wav_dir

    # log_dir = './log_ikala_notrain/'
    log_dir = config.log_dir

    mode = 0

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')

    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    max_voc = np.array(stat_file["voc_stft_maximus"])
    min_voc = np.array(stat_file["voc_stft_minimus"])
    max_back = np.array(stat_file["back_stft_maximus"])
    min_back = np.array(stat_file["back_stft_minimus"])
    max_mix = max_voc + max_back

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len,
                                                  config.input_features),
                                           name='input_placeholder')

        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(log_dir)

        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            # saver.restore(sess, ckpt.model_checkpoint_path)
            # NOTE: a specific checkpoint is hard-coded here instead of the latest one
            saver.restore(sess, './log/model.ckpt-59')

        # import pdb;pdb.set_trace()

        files = [
            x for x in os.listdir(config.wav_dir)
            if x.endswith('.wav') and not x.startswith('.')
        ]
        diffs = []
        count = 0
        for file_name in files:

            count += 1

            mix_stft = utils.file_to_stft(os.path.join(file_path, file_name),
                                          mode=mode)

            targs = utils.input_to_feats(os.path.join(file_path, file_name),
                                         mode=mode)

            # f0_sac = utils.file_to_sac(os.path.join(file_path,file_name))
            # f0_sac = (f0_sac-min_feat[-2])/(max_feat[-2]-min_feat[-2])

            in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
            in_batches = in_batches / max_mix
            # in_batches = utils.normalize(in_batches, 'mix_stft', mode=config.norm_mode_in)
            val_outer = []

            first_pred = []

            cleaner = []

            gan_op = []

            for in_batch in in_batches:
                val_harm, val_ap, val_f0, val_vuv = sess.run(
                    [harm, ap, f0, vuv],
                    feed_dict={input_placeholder: in_batch})
                if config.use_gan:
                    # gen_op is assumed to be built in the use_gan branch of the
                    # graph (not shown in this snippet)
                    val_op = sess.run(gen_op,
                                      feed_dict={input_placeholder: in_batch})

                    gan_op.append(val_op)

                # first_pred.append(harm1)
                # cleaner.append(val_harm)
                val_outs = np.concatenate((val_harm, val_ap, val_f0, val_vuv),
                                          axis=-1)
                val_outer.append(val_outs)

            val_outer = np.array(val_outer)
            val_outer = utils.overlapadd(val_outer, nchunks_in)
            val_outer[:, -1] = np.round(val_outer[:, -1])
            val_outer = val_outer[:targs.shape[0], :]
            val_outer = np.clip(val_outer, 0.0, 1.0)

            #Test purposes only
            # first_pred = np.array(first_pred)
            # first_pred = utils.overlapadd(first_pred, nchunks_in)

            # cleaner = np.array(cleaner)
            # cleaner = utils.overlapadd(cleaner, nchunks_in)

            f0_output = val_outer[:, -2] * (
                max_feat[-2] - min_feat[-2]) + min_feat[-2]
            f0_output = f0_output * (1 - targs[:, -1])
            f0_output = utils.new_base_to_hertz(f0_output)
            f0_gt = targs[:, -2]
            f0_gt = f0_gt * (1 - targs[:, -1])
            f0_gt = utils.new_base_to_hertz(f0_gt)
            f0_outputs = []
            gt_outputs = []
            # each line is "<timestamp> <f0>"; the timestamp is the frame hop in
            # seconds (0.0058 s, presumably 256/44100) scaled into 100 ns units
            for i, f0_o in enumerate(f0_output):
                f0_outputs.append(
                    str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o))

            for i, f0_o in enumerate(f0_gt):
                gt_outputs.append(
                    str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o))

            utils.list_to_file(
                f0_outputs, './ikala_eval/net_out/' + file_name[:-4] + '.pv')
            utils.list_to_file(gt_outputs,
                               './ikala_eval/sac_gt/' + file_name[:-4] + '.pv')
            #     f0_difference = np.nan_to_num(abs(f0_gt-f0_output))
            #     f0_greater = np.where(f0_difference>config.f0_threshold)

            #     diff_per = f0_greater[0].shape[0]/len(f0_output)
            #     diffs.append(str(1-diff_per))
            utils.progress(count, len(files))
Example #10
def synth_file(file_name="015.hdf5",
               singer_index=0,
               file_path=config.wav_dir,
               show_plots=True):

    stat_file = h5py.File('./stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    with tf.Graph().as_default():

        output_placeholder = tf.placeholder(tf.float32,
                                            shape=(config.batch_size,
                                                   config.max_phr_len, 64),
                                            name='output_placeholder')

        f0_output_placeholder = tf.placeholder(tf.float32,
                                               shape=(config.batch_size,
                                                      config.max_phr_len, 1),
                                               name='f0_output_placeholder')

        f0_input_placeholder = tf.placeholder(tf.float32,
                                              shape=(config.batch_size,
                                                     config.max_phr_len),
                                              name='f0_input_placeholder')
        f0_onehot_labels = tf.one_hot(indices=tf.cast(f0_input_placeholder,
                                                      tf.int32),
                                      depth=len(config.notes))

        f0_context_placeholder = tf.placeholder(tf.float32,
                                                shape=(config.batch_size,
                                                       config.max_phr_len, 1),
                                                name='f0_context_placeholder')

        phone_context_placeholder = tf.placeholder(
            tf.float32,
            shape=(config.batch_size, config.max_phr_len, 1),
            name='phone_context_placeholder')

        rand_input_placeholder = tf.placeholder(tf.float32,
                                                shape=(config.batch_size,
                                                       config.max_phr_len, 64),
                                                name='rand_input_placeholder')

        prob = tf.placeholder_with_default(1.0, shape=())

        phoneme_labels = tf.placeholder(tf.int32,
                                        shape=(config.batch_size,
                                               config.max_phr_len),
                                        name='phoneme_placeholder')
        phone_onehot_labels = tf.one_hot(indices=tf.cast(
            phoneme_labels, tf.int32),
                                         depth=len(config.phonemas))

        with tf.variable_scope('Generator_feats') as scope:
            inputs = tf.concat([
                phone_onehot_labels, f0_onehot_labels,
                phone_context_placeholder, f0_context_placeholder
            ],
                               axis=-1)
            voc_output = modules.GAN_generator(inputs)

        with tf.variable_scope('Generator_f0') as scope:
            inputs = tf.concat([
                phone_onehot_labels, f0_onehot_labels,
                phone_context_placeholder, f0_context_placeholder,
                output_placeholder
            ],
                               axis=-1)
            # inputs = tf.concat([phone_onehot_labels, f0_onehot_labels, phone_context_placeholder, f0_context_placeholder, (voc_output/2)+0.5], axis = -1)
            f0_output = modules.GAN_generator_f0(inputs)

            scope.reuse_variables()

            inputs = tf.concat([
                phone_onehot_labels, f0_onehot_labels,
                phone_context_placeholder, f0_context_placeholder,
                (voc_output / 2) + 0.5
            ],
                               axis=-1)
            f0_output_2 = modules.GAN_generator_f0(inputs)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir)

        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        # saver.restore(sess, './log/model.ckpt-3999')

        # import pdb;pdb.set_trace()

        feat_file = h5py.File(config.feats_dir + file_name, "r")

        # speaker_file = h5py.File(config.voice_dir+speaker_file, "r")

        # feats = utils.input_to_feats('./54228_chorus.wav_ori_vocals.wav', mode = 1)

        feats = feat_file["world_feats"][()]

        feats = (feats - min_feat) / (max_feat - min_feat)

        phones = feat_file["phonemes"][()]

        notes = feat_file["notes"][()]

        phones = np.concatenate([phones, notes], axis=-1)

        # in_batches_f0, nchunks_in = utils.generate_overlapadd(f0_nor.reshape(-1,1))

        in_batches_pho, nchunks_in = utils.generate_overlapadd(phones)

        in_batches_feat, _ = utils.generate_overlapadd(feats)

        noters = np.expand_dims(
            np.array([config.notes[int(x)] for x in notes[:, 0]]), 1)

        out_batches_feats = []

        out_batches_f0 = []

        for conds, feat in zip(in_batches_pho, in_batches_feat):
            # import pdb;pdb.set_trace()
            f0 = conds[:, :, 2]
            phones = conds[:, :, 0]
            f0_context = conds[:, :, -1:]
            phones_context = conds[:, :, 1:2]

            feed_dict = {
                f0_input_placeholder: f0,
                phoneme_labels: phones,
                phone_context_placeholder: phones_context,
                f0_context_placeholder: f0_context,
                output_placeholder: feat[:, :, :-2]
            }

            output_feats_gan, output_f0 = sess.run([voc_output, f0_output_2],
                                                   feed_dict=feed_dict)

            out_batches_feats.append(output_feats_gan / 2 + 0.5)
            out_batches_f0.append(output_f0 / 2 + 0.5)

            # out_batches_voc_stft_phase.append(output_voc_stft_phase)

        out_batches_feats = np.array(out_batches_feats)
        out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in)

        out_batches_f0 = np.array(out_batches_f0)
        out_batches_f0 = utils.overlapadd(out_batches_f0, nchunks_in)

        feats = feats * (max_feat - min_feat) + min_feat

        out_batches_feats = out_batches_feats * (max_feat[:-2] -
                                                 min_feat[:-2]) + min_feat[:-2]

        out_batches_feats = out_batches_feats[:len(feats)]

        out_batches_f0 = out_batches_f0 * (max_feat[-2] -
                                           min_feat[-2]) + min_feat[-2]

        out_batches_f0 = out_batches_f0[:len(feats)]

        diff_1 = (out_batches_f0 - noters) * (1 - feats[:, -1:])

        diff_2 = (feats[:, -2:-1] - noters) * (1 - feats[:, -1:])

        print("Mean predicted note deviation {}".format(diff_1.mean()))
        print("Mean original note deviation {}".format(diff_2.mean()))

        print("STD predicted note deviation {}".format(diff_1.std()))
        print("STD original note deviation {}".format(diff_2.std()))

        plt.figure(1)
        plt.suptitle("F0 contour")
        plt.plot(out_batches_f0, label='Predicted F0')
        plt.plot(feats[:, -2], label="Ground Truth F0")
        plt.plot(noters, label="Input Midi Note")
        # plt.plot(phones[:,])
        plt.legend()

        # plt.figure(2)
        # ax1 = plt.subplot(211)

        # plt.imshow(feats[:,:60].T,aspect='auto',origin='lower')

        # ax1.set_title("Ground Truth Vocoder Features", fontsize=10)

        # ax2 = plt.subplot(212, sharex = ax1, sharey = ax1)

        # plt.imshow(out_batches_feats[:,:60].T,aspect='auto',origin='lower')

        # ax2.set_title("GAN Output Vocoder Features", fontsize=10)

        plt.show()

        # import pdb; pdb.set_trace()

        # out_batches_feats_gan= out_batches_feats_gan[:len(feats)]

        first_op = np.concatenate(
            [out_batches_feats, out_batches_f0, feats[:, -1:]], axis=-1)
        second_op = np.concatenate(
            [feats[:, 60:64], out_batches_f0, feats[:, -1:]], axis=-1)

        # pho_op = np.concatenate([out_batches_feats_1,feats[:,-2:]], axis = -1)

        # gan_op = np.concatenate([out_batches_feats_gan,feats[:,-2:]], axis = -1)

        # import pdb;pdb.set_trace()
        # gan_op = np.ascontiguousarray(gan_op)

        # pho_op = np.ascontiguousarray(pho_op)

        first_op = np.ascontiguousarray(first_op)
        second_op = np.ascontiguousarray(second_op)

        utils.feats_to_audio(first_op, file_name[:-4] + '_gan_op')
        print("Full output saved to {}".format(
            os.path.join(config.val_dir, file_name[:-4] + '_gan_op.wav')))
        # second_op (original envelopes with the predicted F0) is presumably intended here
        utils.feats_to_audio(second_op, file_name[:-4] + '_F0_op')
        print("Only F0 saved to {}".format(
            os.path.join(config.val_dir, file_name[:-4] + '_F0_op.wav')))
Example #11
def evalNetwork(file_name,
                load_name='model_e4000_b50_bs5_1709',
                plot=False,
                synth=False):
    autoencoder_audio = AutoEncoder().cuda()
    epoch = 50

    eps = 1e-30

    # autoencoder_audio.load_state_dict(torch.load(config.log_dir+load_name+'_'+str(epoch)+'.pt'))

    autoencoder_audio.load_state_dict(
        torch.load('./log/model_e8000_b50_bs5_3369.pt'))
    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    '''
    import pdb;pdb.set_trace()
    enc = autoencoder_audio.encoder
    weight = enc[0].weight.data.cpu().numpy()
    plt.imshow(weight[0,0,:,:])
    '''
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])

    max_feat_tars = max_feat[:8, :].reshape(8, 1, 513)
    min_feat_tars = min_feat[:8, :].reshape(8, 1, 513)

    max_feat_ins = max_feat[-2:, :].reshape(2, 1, 513)
    min_feat_ins = min_feat[-2:, :].reshape(2, 1, 513)

    audio, fs = stempeg.read_stems(os.path.join(config.wav_dir_test,
                                                file_name),
                                   stem_id=[0, 1, 2, 3, 4])

    mixture = audio[0]

    drums = audio[1]

    bass = audio[2]

    acc = audio[3]

    vocals = audio[4]

    mix_stft, mix_phase = utils.stft_stereo(mixture, phase=True)

    mix_stft = (mix_stft - min_feat_ins) / (max_feat_ins - min_feat_ins)

    drums_stft = utils.stft_stereo(drums)

    bass_stft = utils.stft_stereo(bass)

    acc_stft = utils.stft_stereo(acc)

    voc_stft = utils.stft_stereo(vocals)

    in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)

    out_batches = []

    for in_batch in in_batches:
        # import pdb;pdb.set_trace()
        in_batch = Variable(torch.FloatTensor(in_batch)).cuda()
        out_batch = autoencoder_audio(in_batch)
        out_batches.append(np.array(out_batch.data.cpu().numpy()))

    out_batches = np.array(out_batches)

    #out_batches[out_batches == 0] = 1e-6

    vocals = out_batches[:, :, :2, :, :]

    drums = out_batches[:, :, 2:4, :, :]

    bass = out_batches[:, :, 4:6, :, :]

    others = out_batches[:, :, 6:, :, :]

    total_sources = vocals + bass + drums + others

    # ratio masks: each source's share of the summed estimates (the commented
    # guard above would keep total_sources from being exactly zero)
    mask_vocals = vocals / total_sources

    mask_drums = drums / total_sources

    mask_bass = bass / total_sources

    mask_others = 1 - (mask_vocals + mask_drums + mask_bass)

    out_vocals = in_batches * mask_vocals

    out_drums = in_batches * mask_drums

    out_bass = in_batches * mask_bass

    out_others = in_batches * mask_others

    out_vocals = out_vocals * (
        max_feat_tars[:2, :, :] -
        min_feat_tars[:2, :, :]) + min_feat_tars[:2, :, :]

    out_drums = out_drums * (max_feat_tars[2:4, :, :] - min_feat_tars[
        2:4, :, :]) + min_feat_tars[2:4, :, :]

    out_bass = out_bass * (max_feat_tars[4:6, :, :] -
                           min_feat_tars[4:6, :, :]) + min_feat_tars[4:6, :, :]

    out_others = out_others * (max_feat_tars[6:, :, :] - min_feat_tars[
        6:, :, :]) + min_feat_tars[6:, :, :]

    out_drums = utils.overlapadd(out_drums, nchunks_in)

    out_bass = utils.overlapadd(out_bass, nchunks_in)

    out_others = utils.overlapadd(out_others, nchunks_in)

    out_vocals = utils.overlapadd(out_vocals, nchunks_in)

    if plot:
        plt.figure(1)
        plt.suptitle(file_name[:-9])
        ax1 = plt.subplot(411)
        plt.imshow(np.log(drums_stft[0].T), aspect='auto', origin='lower')
        ax1.set_title("Drums Left Channel Ground Truth", fontsize=10)
        ax2 = plt.subplot(412, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_drums[0].T), aspect='auto', origin='lower')
        ax2.set_title("Drums Left Channel Network Output", fontsize=10)
        ax3 = plt.subplot(413, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(drums_stft[1].T), aspect='auto', origin='lower')
        ax3.set_title("Drums Right Channel Ground Truth", fontsize=10)
        ax4 = plt.subplot(414, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_drums[1].T), aspect='auto', origin='lower')
        ax4.set_title("Drums Right Channel Network Output", fontsize=10)

        plt.figure(2)
        plt.suptitle(file_name[:-9])
        ax1 = plt.subplot(411)
        plt.imshow(np.log(voc_stft[0].T), aspect='auto', origin='lower')
        ax1.set_title("Vocals Left Channel Ground Truth", fontsize=10)
        ax2 = plt.subplot(412, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_vocals[0].T), aspect='auto', origin='lower')
        ax2.set_title("Vocals Left Channel Network Output", fontsize=10)
        ax3 = plt.subplot(413, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(voc_stft[1].T), aspect='auto', origin='lower')
        ax3.set_title("Vocals Right Channel Ground Truth", fontsize=10)
        ax4 = plt.subplot(414, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_vocals[1].T), aspect='auto', origin='lower')
        ax4.set_title("Vocals Right Channel Network Output", fontsize=10)

        plt.figure(3)
        plt.suptitle(file_name[:-9])
        ax1 = plt.subplot(411)
        plt.imshow(np.log(bass_stft[0].T), aspect='auto', origin='lower')
        ax1.set_title("Bass Left Channel Ground Truth", fontsize=10)
        ax2 = plt.subplot(412, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_bass[0].T), aspect='auto', origin='lower')
        ax2.set_title("Bass Left Channel Network Output", fontsize=10)
        ax3 = plt.subplot(413, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(bass_stft[1].T), aspect='auto', origin='lower')
        ax3.set_title("Bass Right Channel Ground Truth", fontsize=10)
        ax4 = plt.subplot(414, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_bass[1].T), aspect='auto', origin='lower')
        ax4.set_title("Bass Right Channel Network Output", fontsize=10)

        plt.figure(4)
        plt.suptitle(file_name[:-9])
        ax1 = plt.subplot(411)
        plt.imshow(np.log(acc_stft[0].T), aspect='auto', origin='lower')
        ax1.set_title("Others Left Channel Ground Truth", fontsize=10)
        ax2 = plt.subplot(412, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_others[0].T), aspect='auto', origin='lower')
        ax2.set_title("Others Left Channel Network Output", fontsize=10)
        ax3 = plt.subplot(413, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(acc_stft[1].T), aspect='auto', origin='lower')
        ax3.set_title("Others Right Channel Ground Truth", fontsize=10)
        ax4 = plt.subplot(414, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_others[1].T), aspect='auto', origin='lower')
        ax4.set_title("Others Right Channel Network Output", fontsize=10)

        plt.show()

    if synth:
        # import pdb;pdb.set_trace()
        utils.inverse_stft_write(out_drums[:, :mix_phase.shape[1], :],
                                 mix_phase,
                                 config.out_dir + file_name + "_drums.wav")
        utils.inverse_stft_write(out_bass[:, :mix_phase.shape[1], :],
                                 mix_phase,
                                 config.out_dir + file_name + "_bass.wav")
        utils.inverse_stft_write(out_vocals[:, :mix_phase.shape[1], :],
                                 mix_phase,
                                 config.out_dir + file_name + "_vocals.wav")
        utils.inverse_stft_write(out_others[:, :mix_phase.shape[1], :],
                                 mix_phase,
                                 config.out_dir + file_name + "_others.wav")
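This example (like Example #12 below) builds ratio masks: each estimated source magnitude is divided by the sum of all estimates, and the "others" mask is defined as the remainder. A self-contained check that such masks sum to one and redistribute the mixture exactly:

import numpy as np

est = np.random.rand(4, 8) + 1e-6        # 4 estimated source magnitudes, 8 bins
masks = est / est.sum(axis=0)            # ratio masks, as in the code above
assert np.allclose(masks.sum(axis=0), 1.0)

mix = np.random.rand(8)                  # mixture magnitude
sources = masks * mix                    # masked source estimates
assert np.allclose(sources.sum(axis=0), mix)   # they sum back to the mixture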
Example #12
def evalNetwork(file_name='Al James - Schoolboy Facination.stem.mp4',
                load_name_sep='model6',
                load_name_dn='dn_model_719',
                plot=True,
                synth=False):

    autoencoder_audio = AutoEncoder().cuda()
    denoiser = Encoder().cuda()
    epoch = 50
    autoencoder_audio.load_state_dict(
        torch.load(config.log_dir + load_name_sep + '.pt'))
    denoiser.load_state_dict(
        torch.load(config.dn_log_dir + load_name_dn + '.pt'))

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')

    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    max_feat_tars = max_feat[:8, :].reshape(8, 1, 513)
    min_feat_tars = min_feat[:8, :].reshape(8, 1, 513)
    max_feat_ins = max_feat[-2:, :].reshape(2, 1, 513)
    min_feat_ins = min_feat[-2:, :].reshape(2, 1, 513)

    audio, fs = stempeg.read_stems(os.path.join(config.wav_dir_test,
                                                file_name),
                                   stem_id=[0, 1, 2, 3, 4])

    mixture = audio[0]
    drums = audio[1]
    bass = audio[2]
    acc = audio[3]
    vocals = audio[4]

    mix_stft, mix_phase = utils.stft_stereo(mixture, phase=True)

    mix_stft = (mix_stft - min_feat_ins) / (max_feat_ins - min_feat_ins)

    drums_stft = utils.stft_stereo(drums)

    bass_stft = utils.stft_stereo(bass)

    acc_stft = utils.stft_stereo(acc)

    voc_stft = utils.stft_stereo(vocals)

    in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)

    out_batches = []

    for in_batch in in_batches:
        # import pdb;pdb.set_trace()
        in_batch = Variable(torch.FloatTensor(in_batch)).cuda()
        out_batch = autoencoder_audio(in_batch)
        out_batches.append(np.array(out_batch.data.cpu().numpy()))

    out_batches = np.array(out_batches)

    vocals = out_batches[:, :, :2, :, :]

    drums = out_batches[:, :, 2:4, :, :]

    bass = out_batches[:, :, 4:6, :, :]

    others = out_batches[:, :, 6:, :, :]

    total_sources = vocals + bass + drums + others

    mask_vocals = vocals / total_sources

    mask_drums = drums / total_sources

    mask_bass = bass / total_sources

    mask_others = 1 - (mask_vocals + mask_drums + mask_bass)

    out_vocals = in_batches * mask_vocals

    out_drums = in_batches * mask_drums

    out_bass = in_batches * mask_bass

    out_others = in_batches * mask_others

    out_vocals_2 = out_vocals * (
        max_feat_tars[:2, :, :] -
        min_feat_tars[:2, :, :]) + min_feat_tars[:2, :, :]
    out_drums = out_drums * (max_feat_tars[2:4, :, :] - min_feat_tars[
        2:4, :, :]) + min_feat_tars[2:4, :, :]

    out_bass = out_bass * (max_feat_tars[4:6, :, :] -
                           min_feat_tars[4:6, :, :]) + min_feat_tars[4:6, :, :]

    out_others = out_others * (max_feat_tars[6:, :, :] - min_feat_tars[
        6:, :, :]) + min_feat_tars[6:, :, :]

    out_batches_vocals = []
    #print (np.array(out_vocals_2).shape)
    for i in range(vocals.shape[0]):
        vocal_batch = Variable(
            torch.FloatTensor(out_vocals_2[i, :, :])).cuda()
        out_batch = denoiser(vocal_batch)
        out_batches_vocals.append(np.array(out_batch.data.cpu().numpy()))
    out_vocals_2 = utils.overlapadd(out_vocals_2, nchunks_in)
    out_vocals = utils.overlapadd(np.array(out_batches_vocals), nchunks_in)
    #out_vocals = out_vocals*(max_feat_tars[:2,:,:]-min_feat_tars[:2,:,:])+min_feat_tars[:2,:,:]
    print(out_vocals.shape)
    if plot:
        plt.figure(1)
        ax1 = plt.subplot(411)
        plt.imshow(np.log(out_vocals_2[0].T), aspect='auto', origin='lower')
        ax1.set_title("Vocals Left Channel Input", fontsize=10)
        ax2 = plt.subplot(412, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_vocals[0].T), aspect='auto', origin='lower')
        ax2.set_title("Vocals Left Channel Network Output", fontsize=10)
        ax3 = plt.subplot(413, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_vocals_2[1].T), aspect='auto', origin='lower')
        ax3.set_title("Vocals Right Channel Input", fontsize=10)
        ax4 = plt.subplot(414, sharex=ax1, sharey=ax1)
        plt.imshow(np.log(out_vocals[1].T), aspect='auto', origin='lower')
        ax4.set_title("Vocals Right Channel Network Output", fontsize=10)

        plt.show()
Example #13
File: models.py | Project: pc2752/sep_synth
    def process_file(self, voc_stft, voc_stft_singer, sess):

        stat_file = h5py.File(config.stat_dir+'stats.hdf5', mode='r')

        max_voc = np.array(stat_file["voc_stft_maximus"])
        min_voc = np.array(stat_file["voc_stft_minimus"])

        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
        stat_file.close()


        # trim both inputs to a common length
        if len(voc_stft) > len(voc_stft_singer):
            voc_stft = voc_stft[:len(voc_stft_singer)]
        else:
            voc_stft_singer = voc_stft_singer[:len(voc_stft)]

        in_batches_stft, nchunks_in = utils.generate_overlapadd(voc_stft)

        in_batches_stft_singer, nchunks_in_singer = utils.generate_overlapadd(voc_stft_singer)

        in_batches_stft = np.clip(in_batches_stft, 0.0, 1.0)

        in_batches_stft_singer = np.clip(in_batches_stft_singer, 0.0, 1.0)


        out_batches_feats = []
        out_batches_f0 = []

        out_batches_singer = []

        # for in_batch_stft_singer in in_batches_stft_singer:
        #     feed_dict = {self.input_placeholder_singer: in_batch_stft_singer, self.is_train: False}
        #     singer_est = sess.run(self.singer_probs, feed_dict=feed_dict)
        #     out_batches_singer.append(singer_est)

        # singer_emb = np.tile(np.mean(np.mean(np.array(out_batches_singer), axis = 0), axis = 0), [config.batch_size, 1])


        # singer_emb = np.tile(one_hotize(np.argmax(np.mean(np.mean(np.array(out_batches_singer), axis = 0), axis = 0), axis = -1), config.num_singers), [config.batch_size, 1])
        # pho_est = one_hotize(np.argmax(pho_est, axis = -1), config.num_phos)

        # two-pass inference: first estimate f0 and phoneme probabilities from
        # the input, then condition the synthesis network on those estimates
        for in_batch_stft, in_batch_stft_singer in zip(in_batches_stft, in_batches_stft_singer):
            feed_dict = {self.input_placeholder: in_batch_stft, self.is_train: False}
            f0_est, pho_est = sess.run([self.f0_probs, self.pho_probs], feed_dict=feed_dict)
            feed_dict = {self.input_placeholder: in_batch_stft,
                         self.input_placeholder_singer: in_batch_stft_singer,
                         self.f0_onehot_labels: f0_est,
                         self.phone_onehot_labels: pho_est,
                         self.is_train: False}
            out_feats = sess.run(self.output, feed_dict=feed_dict)
            out_batches_feats.append(out_feats)
            out_batches_f0.append(f0_est)

        out_batches_feats = np.array(out_batches_feats)

        out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in)

        # out_batches_wav = utils.overlapadd(out_batches_wav,nchunks_in)

        # out_batches_wav = utils.overlapadd(np.expand_dims(out_batches_wav, -1),nchunks_in, overlap = config.max_phr_len*2**7) 

        # out_batches_wav = out_batches_wav *2 -1

        out_batches_feats = out_batches_feats * (max_feat[:-1] - min_feat[:-1]) + min_feat[:-1]

        return out_batches_feats
Example #14
    def extract_part_from_file(self, file_name, part, sess):

        parts = ['_soprano_', '_alto_', '_bass_', '_tenor_']

        # NOTE: max_feat / min_feat are used below but not defined in this
        # snippet; they are assumed to be loaded from the stats file as in the
        # other examples
        stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
        stat_file.close()

        cqt = self.read_input_file(file_name)

        song_name = file_name.split('_')[0]

        voc_num = 9 - part
        voc_part = parts[part]
        voc_track = file_name[-voc_num]

        voc_feat_file = h5py.File(
            config.voc_feats_dir + song_name + voc_part + voc_track +
            '.wav.hdf5', 'r')

        voc_feats = voc_feat_file["voc_feats"][()]

        voc_feats[np.isnan(voc_feats)] = 0.0  # zero out NaN entries only

        atb = voc_feat_file['atb'][()]

        atb = atb[:, 1:]

        atb[:, 0:4] = 0

        atb = np.clip(atb, 0.0, 1.0)

        max_len = min(len(voc_feats), len(cqt))

        voc_feats = voc_feats[:max_len]

        cqt = cqt[:max_len]

        atb = atb[:max_len]

        # voc_feats = (voc_feats - min_feat) / (max_feat - min_feat)
        #
        # voc_feats = np.clip(voc_feats[:, :, :-2], 0.0, 0.1)

        # sig_process.feats_to_audio(voc_feats, 'booboo.wav')

        in_batches_cqt, nchunks_in = utils.generate_overlapadd(cqt)

        in_batches_atb, nchunks_in = utils.generate_overlapadd(atb)

        # import pdb;pdb.set_trace()
        out_batches_feats = []
        for in_batch_cqt, in_batch_atb in zip(in_batches_cqt, in_batches_atb):
            feed_dict = {
                self.input_placeholder: in_batch_cqt,
                self.f0_placeholder: in_batch_atb,
                self.is_train: False
            }
            out_feat = sess.run(self.output_logits, feed_dict=feed_dict)
            out_batches_feats.append(out_feat)

        out_batches_feats = np.array(out_batches_feats)

        out_feats = utils.overlapadd(
            out_batches_feats.reshape(out_batches_feats.shape[0],
                                      config.batch_size, config.max_phr_len,
                                      -1), nchunks_in)

        out_feats = out_feats * (max_feat - min_feat) + min_feat

        out_feats = out_feats[:max_len]

        out_feats = np.concatenate((out_feats, voc_feats[:, -2:]), axis=-1)

        plt.figure(1)
        plt.subplot(211)
        plt.imshow(voc_feats.T, origin='lower', aspect='auto')
        plt.subplot(212)
        plt.imshow(out_feats.T, origin='lower', aspect='auto')
        plt.show()

        sig_process.feats_to_audio(out_feats, 'extracted.wav')

        # import pdb; pdb.set_trace()
Example #15
def synth_file(file_name,
               file_path=config.wav_dir,
               show_plots=True,
               save_file=True):
    if file_name.startswith('ikala'):
        file_name = file_name[6:]
        file_path = config.wav_dir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('mir'):
        file_name = file_name[4:]
        file_path = config.wav_dir_mir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('med'):
        file_name = file_name[4:]
        file_path = config.wav_dir_med
        utils.write_ori_med(os.path.join(file_path, file_name), file_name)
        mode = 2
    else:
        mode = 1
        file_path = './'

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')

    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    max_voc = np.array(stat_file["voc_stft_maximus"])
    min_voc = np.array(stat_file["voc_stft_minimus"])
    max_back = np.array(stat_file["back_stft_maximus"])
    min_back = np.array(stat_file["back_stft_minimus"])
    max_mix = max_voc + max_back

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len,
                                                  config.input_features),
                                           name='input_placeholder')

        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

            # harmy = harm_1+harm

        if config.use_gan:
            with tf.variable_scope('Generator') as scope:
                gen_op = modules.GAN_generator(harm)
        # with tf.variable_scope('Discriminator') as scope:
        #     D_real = modules.GAN_discriminator(target_placeholder[:,:,:60],input_placeholder)
        #     scope.reuse_variables()
        #     D_fake = modules.GAN_discriminator(gen_op,input_placeholder)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir_m1)

        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        mix_stft = utils.file_to_stft(os.path.join(file_path, file_name),
                                      mode=mode)

        targs = utils.input_to_feats(os.path.join(file_path, file_name),
                                     mode=mode)

        # import pdb; pdb.set_trace()

        # f0_sac = utils.file_to_sac(os.path.join(file_path,file_name))
        # f0_sac = (f0_sac-min_feat[-2])/(max_feat[-2]-min_feat[-2])

        in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
        in_batches = in_batches / max_mix
        # in_batches = utils.normalize(in_batches, 'mix_stft', mode=config.norm_mode_in)
        val_outer = []

        first_pred = []

        cleaner = []

        gan_op = []

        for in_batch in in_batches:
            val_harm, val_ap, val_f0, val_vuv = sess.run(
                [harm, ap, f0, vuv], feed_dict={input_placeholder: in_batch})
            if config.use_gan:
                val_op = sess.run(gen_op,
                                  feed_dict={input_placeholder: in_batch})

                gan_op.append(val_op)

            # first_pred.append(harm1)
            # cleaner.append(val_harm)
            val_outs = np.concatenate((val_harm, val_ap, val_f0, val_vuv),
                                      axis=-1)
            val_outer.append(val_outs)

        val_outer = np.array(val_outer)
        val_outer = utils.overlapadd(val_outer, nchunks_in)
        val_outer[:, -1] = np.round(val_outer[:, -1])
        val_outer = val_outer[:targs.shape[0], :]
        val_outer = np.clip(val_outer, 0.0, 1.0)

        # import pdb; pdb.set_trace()

        #Test purposes only
        # first_pred = np.array(first_pred)
        # first_pred = utils.overlapadd(first_pred, nchunks_in)

        # cleaner = np.array(cleaner)
        # cleaner = utils.overlapadd(cleaner, nchunks_in)

        if config.use_gan:
            gan_op = np.array(gan_op)
            gan_op = utils.overlapadd(gan_op, nchunks_in)

        targs = (targs - min_feat) / (max_feat - min_feat)

        # first_pred = (first_pred-min_feat[:60])/(max_feat[:60]-min_feat[:60])
        # cleaner = (cleaner-min_feat[:60])/(max_feat[:60]-min_feat[:60])

        # ax1 = plt.subplot(311)
        # plt.imshow(targs[:,:60].T, origin='lower', aspect='auto')
        # # ax1.set_title("Harmonic Spectral Envelope", fontsize = 10)
        # ax2 = plt.subplot(312)
        # plt.imshow(targs[:,60:64].T, origin='lower', aspect='auto')
        # # ax2.set_title("Aperiodicity Envelope", fontsize = 10)
        # ax3 = plt.subplot(313)
        # plt.plot(targs[:,-2])
        # ax3.set_title("Fundamental Frequency Contour", fontsize = 10)
        if show_plots:

            # import pdb;pdb.set_trace()

            ins = val_outer[:, :60]
            outs = targs[:, :60]
            plt.figure(1)
            ax1 = plt.subplot(211)
            plt.imshow(ins.T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Harm ", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(outs.T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Harm ", fontsize=10)
            # ax1 = plt.subplot(413)
            # plt.imshow(first_pred.T, origin='lower', aspect='auto')
            # ax1.set_title("Initial Prediction ", fontsize = 10)
            # ax2 = plt.subplot(412)
            # plt.imshow(cleaner.T, origin='lower', aspect='auto')
            # ax2.set_title("Residual Added ", fontsize = 10)

            if config.use_gan:
                plt.figure(5)
                ax1 = plt.subplot(411)
                plt.imshow(ins.T, origin='lower', aspect='auto')
                ax1.set_title("Predicted Harm ", fontsize=10)
                ax2 = plt.subplot(414)
                plt.imshow(outs.T, origin='lower', aspect='auto')
                ax2.set_title("Ground Truth Harm ", fontsize=10)
                ax1 = plt.subplot(412)
                plt.imshow(gan_op.T, origin='lower', aspect='auto')
                ax1.set_title("GAN output ", fontsize=10)
                ax1 = plt.subplot(413)
                plt.imshow((gan_op[:ins.shape[0], :] + ins).T,
                           origin='lower',
                           aspect='auto')
                ax1.set_title("GAN output ", fontsize=10)

            plt.figure(2)
            ax1 = plt.subplot(211)
            plt.imshow(val_outer[:, 60:-2].T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Aperiodic Part", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(targs[:, 60:-2].T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Aperiodic Part", fontsize=10)

            plt.figure(3)

            f0_output = val_outer[:, -2] * (
                max_feat[-2] - min_feat[-2]) + min_feat[-2]
            f0_output = f0_output * (1 - targs[:, -1])
            f0_output[f0_output == 0] = np.nan
            plt.plot(f0_output, label="Predicted Value")
            f0_gt = targs[:, -2] * (
                max_feat[-2] - min_feat[-2]) + min_feat[-2]
            f0_gt = f0_gt * (1 - targs[:, -1])
            f0_gt[f0_gt == 0] = np.nan
            plt.plot(f0_gt, label="Ground Truth")
            f0_difference = np.nan_to_num(abs(f0_gt - f0_output))
            f0_greater = np.where(f0_difference > config.f0_threshold)
            diff_per = f0_greater[0].shape[0] / len(f0_output)
            plt.suptitle("Percentage correct = " +
                         '{:.3%}'.format(1 - diff_per))
            # import pdb;pdb.set_trace()

            # import pdb;pdb.set_trace()
            # uu = f0_sac[:,0]*(1-f0_sac[:,1])
            # uu[uu == 0] = np.nan
            # plt.plot(uu, label="Sac f0")
            plt.legend()
            plt.figure(4)
            ax1 = plt.subplot(211)
            plt.plot(val_outer[:, -1])
            ax1.set_title("Predicted Voiced/Unvoiced", fontsize=10)
            ax2 = plt.subplot(212)
            plt.plot(targs[:, -1])
            ax2.set_title("Ground Truth Voiced/Unvoiced", fontsize=10)
            plt.show()
        if save_file:

            val_outer = np.ascontiguousarray(val_outer *
                                             (max_feat - min_feat) + min_feat)
            targs = np.ascontiguousarray(targs * (max_feat - min_feat) +
                                         min_feat)

            # import pdb;pdb.set_trace()

            # val_outer = np.ascontiguousarray(utils.denormalize(val_outer,'feats', mode=config.norm_mode_out))
            try:
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_pred_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] +
                                            '_synth_pred_f0.wav'))
            except Exception:
                print("Couldn't synthesize with predicted f0")
            try:
                val_outer[:, -2:] = targs[:, -2:]
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_ori_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] +
                                            '_synth_ori_f0.wav'))
            except Exception:
                print("Couldn't synthesize with original f0")
Example #16
def synth_file(file_name="nus_MCUR_sing_10.hdf5",
               singer_index=0,
               file_path=config.wav_dir,
               show_plots=True,
               save_file="GBO"):

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len, 66),
                                           name='input_placeholder')
        tf.summary.histogram('inputs', input_placeholder)

        output_placeholder = tf.placeholder(tf.float32,
                                            shape=(config.batch_size,
                                                   config.max_phr_len, 64),
                                            name='output_placeholder')

        f0_input_placeholder = tf.placeholder(tf.float32,
                                              shape=(config.batch_size,
                                                     config.max_phr_len, 1),
                                              name='f0_input_placeholder')

        rand_input_placeholder = tf.placeholder(tf.float32,
                                                shape=(config.batch_size,
                                                       config.max_phr_len, 4),
                                                name='rand_input_placeholder')

        prob = tf.placeholder_with_default(1.0, shape=())

        phoneme_labels = tf.placeholder(tf.int32,
                                        shape=(config.batch_size,
                                               config.max_phr_len),
                                        name='phoneme_placeholder')
        phone_onehot_labels = tf.one_hot(indices=tf.cast(
            phoneme_labels, tf.int32),
                                         depth=42)

        phoneme_labels_2 = tf.placeholder(tf.float32,
                                          shape=(config.batch_size,
                                                 config.max_phr_len, 42),
                                          name='phoneme_placeholder_1')
        # phone_onehot_labels = tf.one_hot(indices=tf.cast(phoneme_labels, tf.int32), depth=42)

        singer_labels = tf.placeholder(tf.float32,
                                       shape=(config.batch_size),
                                       name='singer_placeholder')
        singer_onehot_labels = tf.one_hot(indices=tf.cast(
            singer_labels, tf.int32),
                                          depth=12)

        with tf.variable_scope('phone_Model') as scope:
            # regularizer = tf.contrib.layers.l2_regularizer(scale=0.1)
            pho_logits = modules.phone_network(input_placeholder)
            pho_classes = tf.argmax(pho_logits, axis=-1)
            pho_probs = tf.nn.softmax(pho_logits)

        with tf.variable_scope('Final_Model') as scope:
            voc_output = modules.final_net(singer_onehot_labels,
                                           f0_input_placeholder,
                                           phoneme_labels_2)
            voc_output_decoded = tf.nn.sigmoid(voc_output)
            scope.reuse_variables()
            voc_output_3 = modules.final_net(singer_onehot_labels,
                                             f0_input_placeholder, pho_probs)
            voc_output_3_decoded = tf.nn.sigmoid(voc_output_3)

            # scope.reuse_variables()

            # voc_output_gen = modules.final_net(singer_onehot_labels, f0_input_placeholder, pho_probs)
            # voc_output_decoded_gen = tf.nn.sigmoid(voc_output_gen)

        # with tf.variable_scope('singer_Model') as scope:
        #     singer_embedding, singer_logits = modules.singer_network(input_placeholder, prob)
        #     singer_classes = tf.argmax(singer_logits, axis=-1)
        #     singer_probs = tf.nn.softmax(singer_logits)

        with tf.variable_scope('Generator') as scope:
            voc_output_2 = modules.GAN_generator(singer_onehot_labels,
                                                 phoneme_labels_2,
                                                 f0_input_placeholder,
                                                 rand_input_placeholder)

        with tf.variable_scope('Discriminator') as scope:
            D_fake = modules.GAN_discriminator(voc_output_2,
                                               singer_onehot_labels,
                                               phone_onehot_labels,
                                               f0_input_placeholder)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir)

        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        # saver.restore(sess, './log/model.ckpt-3999')

        # import pdb;pdb.set_trace()

        voc_file = h5py.File(config.voice_dir + file_name, "r")

        # speaker_file = h5py.File(config.voice_dir+speaker_file, "r")

        feats = np.array(voc_file['feats'])
        # feats = utils.input_to_feats('./54228_chorus.wav_ori_vocals.wav', mode = 1)

        f0 = feats[:, -2]

        # fill unvoiced frames with the median voiced f0, then shift the pitch
        # down by 12 (an octave, assuming a semitone-scaled contour)
        med = np.median(f0[f0 > 0])

        f0[f0 == 0] = med

        f0 = f0 - 12

        feats[:, -2] = feats[:, -2] - 12

        f0_nor = (f0 - min_feat[-2]) / (max_feat[-2] - min_feat[-2])

        feats = (feats - min_feat) / (max_feat - min_feat)

        pho_target = np.array(voc_file["phonemes"])

        in_batches_f0, nchunks_in = utils.generate_overlapadd(
            f0_nor.reshape(-1, 1))

        in_batches_pho, nchunks_in_pho = utils.generate_overlapadd(
            pho_target.reshape(-1, 1))

        in_batches_feat, _ = utils.generate_overlapadd(feats)

        # import pdb;pdb.set_trace()

        out_batches_feats = []

        out_batches_feats_1 = []

        out_batches_feats_gan = []

        for in_batch_f0, in_batch_pho_target, in_batch_feat in zip(
                in_batches_f0, in_batches_pho, in_batches_feat):

            in_batch_f0 = in_batch_f0.reshape(
                [config.batch_size, config.max_phr_len, 1])

            in_batch_pho_target = in_batch_pho_target.reshape(
                [config.batch_size, config.max_phr_len])

            # in_batch_pho_target = sess.run(pho_probs, feed_dict = {input_placeholder: in_batch_feat})

            # The original hard-coded 30 here; config.batch_size is used for
            # the same purpose elsewhere in this file.
            output_feats, output_feats_1, output_feats_gan = sess.run(
                [voc_output_decoded, voc_output_3_decoded, voc_output_2],
                feed_dict={
                    input_placeholder: in_batch_feat,
                    f0_input_placeholder: in_batch_f0,
                    phoneme_labels_2: in_batch_pho_target,
                    singer_labels: np.ones(config.batch_size) * singer_index,
                    rand_input_placeholder: np.random.normal(
                        -1.0, 1.0,
                        size=[config.batch_size, config.max_phr_len, 4])
                })

            out_batches_feats.append(output_feats)

            out_batches_feats_1.append(output_feats_1)

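            # The GAN generator presumably has a tanh output, so map [-1, 1]
            # back to the [0, 1] range used by the other outputs.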
            out_batches_feats_gan.append(output_feats_gan / 2 + 0.5)

        out_batches_feats = np.array(out_batches_feats)
        out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in)

        out_batches_feats_1 = np.array(out_batches_feats_1)
        out_batches_feats_1 = utils.overlapadd(out_batches_feats_1, nchunks_in)

        out_batches_feats_gan = np.array(out_batches_feats_gan)
        out_batches_feats_gan = utils.overlapadd(out_batches_feats_gan,
                                                 nchunks_in)

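        # Undo the min-max normalisation; the network outputs exclude the last
        # two columns (F0 and voicing), hence the [:-2] slices.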
        feats = feats * (max_feat - min_feat) + min_feat

        out_batches_feats = out_batches_feats * (max_feat[:-2] -
                                                 min_feat[:-2]) + min_feat[:-2]

        out_batches_feats_1 = out_batches_feats_1 * (
            max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

        out_batches_feats_gan = out_batches_feats_gan * (
            max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

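        # Trim the overlap-add reconstructions back to the input length.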
        out_batches_feats = out_batches_feats[:len(feats)]

        out_batches_feats_1 = out_batches_feats_1[:len(feats)]

        out_batches_feats_gan = out_batches_feats_gan[:len(feats)]

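        # Re-attach the ground-truth F0 and voicing columns so each output can
        # be resynthesised with the WORLD vocoder.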
        first_op = np.concatenate([out_batches_feats, feats[:, -2:]], axis=-1)

        pho_op = np.concatenate([out_batches_feats_1, feats[:, -2:]], axis=-1)

        gan_op = np.concatenate([out_batches_feats_gan, feats[:, -2:]],
                                axis=-1)

        gan_op = np.ascontiguousarray(gan_op)

        pho_op = np.ascontiguousarray(pho_op)

        first_op = np.ascontiguousarray(first_op)

        if show_plots:

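            # First figure: ground truth vs. cross-entropy vs. GAN outputs for
            # the first 60 feature columns.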
            plt.figure(1)

            ax1 = plt.subplot(311)

            plt.imshow(feats[:, :60].T, aspect='auto', origin='lower')

            ax1.set_title("Ground Truth Vocoder Features", fontsize=10)

            ax2 = plt.subplot(312, sharex=ax1, sharey=ax1)

            plt.imshow(out_batches_feats[:, :60].T,
                       aspect='auto',
                       origin='lower')

            ax2.set_title("Cross Entropy Output Vocoder Features", fontsize=10)

            ax3 = plt.subplot(313, sharex=ax1, sharey=ax1)

            ax3.set_title("GAN Vocoder Output Features", fontsize=10)

            plt.imshow(out_batches_feats_gan[:, :60].T,
                       aspect='auto',
                       origin='lower')

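            # Second figure: ground-truth vs. predicted aperiodicity features
            # (assuming the 60 leading columns are the harmonic features).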
            plt.figure(2)

            plt.subplot(211)

            plt.imshow(feats[:, 60:-2].T, aspect='auto', origin='lower')

            plt.subplot(212)

            plt.imshow(out_batches_feats[:, -4:].T,
                       aspect='auto',
                       origin='lower')

            plt.show()

            save_file = input(
                "Which files to synthesise? G for GAN, B for binary cross "
                "entropy, O for original, or any combination. Default is "
                "none: ").upper() or "N"

        else:
            save_file = input(
                "Which files to synthesise? G for GAN, B for binary cross "
                "entropy, O for original, or any combination. Default is all "
                "(GBO): ").upper() or "GBO"

        if "G" in save_file:

            utils.feats_to_audio(gan_op[:, :], file_name[:-4] + 'gan_op.wav')

            print("GAN file saved to {}".format(
                os.path.join(config.val_dir, file_name[:-4] + 'gan_op.wav')))

        if "O" in save_file:

            utils.feats_to_audio(feats[:, :], file_name[:-4] + 'ori_op.wav')

            print("Original file, resynthesised via the WORLD vocoder, saved "
                  "to {}".format(
                      os.path.join(config.val_dir,
                                   file_name[:-4] + 'ori_op.wav')))

        if "B" in save_file:
            utils.feats_to_audio(first_op[:, :], file_name[:-4] + 'bce_op.wav')
            print("Binary cross entropy file saved to {}".format(
                os.path.join(config.val_dir, file_name[:-4] + 'bce_op.wav')))
Example #17
0
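# Assumes the imports used throughout this repository: os, random, h5py,
# numpy as np, torch, stempeg, mir_eval, Variable from torch.autograd, and the
# project-local config, utils and AutoEncoder.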
def evalNets(
        pcs_model='model_e8000_b50_bs5_3429',
        file_to_eval="None",
        path='/home/pc2752/share/JoanMaster/PytorchConvSep/data_h5py_test'):

    autoencoder_audio = AutoEncoder().cuda()
    autoencoder_audio.load_state_dict(
        torch.load(config.log_dir + pcs_model + '.pt'))
    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')

    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])

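    # Statistics rows 0-7 cover the target stems (4 sources x 2 channels);
    # the last 2 rows cover the stereo mixture input.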
    max_feat_tars = max_feat[:8, :].reshape(8, 1, 513)
    min_feat_tars = min_feat[:8, :].reshape(8, 1, 513)

    max_feat_ins = max_feat[-2:, :].reshape(2, 1, 513)
    min_feat_ins = min_feat[-2:, :].reshape(2, 1, 513)

    wav_files = [
        x for x in os.listdir(config.wav_dir_test)
        if x.endswith('.stem.mp4') and not x.startswith(".")
    ]

    random_files = [random.choice(wav_files) for _ in range(50)]  # sampled with replacement

    file_length = int(44100 * 6)  # a six-second excerpt at 44.1 kHz

    SDR_error = []

    SIR_error = []

    SAR_error = []

    ISR_error = []

    for file_name in random_files:

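        # MUSDB-style stems: 0 = mixture, 1 = drums, 2 = bass,
        # 3 = accompaniment, 4 = vocals.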
        audio, fs = stempeg.read_stems(os.path.join(config.wav_dir_test,
                                                    file_name),
                                       stem_id=[0, 1, 2, 3, 4])

        mixture = audio[0]

        drums = audio[1]

        bass = audio[2]

        acc = audio[3]

        vocals = audio[4]

        mix_stft, mix_phase = utils.stft_stereo(mixture, phase=True)

        mix_stft = (mix_stft - min_feat_ins) / (max_feat_ins - min_feat_ins)

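        # Reference STFTs for the individual stems (computed here but unused
        # below; the evaluation compares time-domain signals directly).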
        drums_stft = utils.stft_stereo(drums)

        bass_stft = utils.stft_stereo(bass)

        acc_stft = utils.stft_stereo(acc)

        voc_stft = utils.stft_stereo(vocals)

        in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)

        out_batches = []

        for in_batch in in_batches:
            in_batch = Variable(torch.FloatTensor(in_batch)).cuda()
            out_batch = autoencoder_audio(in_batch)
            out_batches.append(np.array(out_batch.data.cpu().numpy()))

        out_batches = np.array(out_batches)

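        # Slice the 8 output channels into stereo magnitude estimates per
        # source; note these reuse the stem variable names from above.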
        vocals = out_batches[:, :, :2, :, :]

        drums = out_batches[:, :, 2:4, :, :]

        bass = out_batches[:, :, 4:6, :, :]

        others = out_batches[:, :, 6:, :, :]

        total_sources = vocals + bass + drums + others

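        # Wiener-style soft masks: each source's share of the summed magnitude
        # estimates, applied to the normalised mixture spectrogram.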
        mask_vocals = vocals / total_sources

        mask_drums = drums / total_sources

        mask_bass = bass / total_sources

        mask_others = 1 - (mask_vocals + mask_drums + mask_bass)

        out_vocals = in_batches * mask_vocals

        out_drums = in_batches * mask_drums

        out_bass = in_batches * mask_bass

        out_others = in_batches * mask_others

        out_vocals = out_vocals * (
            max_feat_tars[:2, :, :] -
            min_feat_tars[:2, :, :]) + min_feat_tars[:2, :, :]

        out_drums = out_drums * (max_feat_tars[2:4, :, :] - min_feat_tars[
            2:4, :, :]) + min_feat_tars[2:4, :, :]

        out_bass = out_bass * (max_feat_tars[4:6, :, :] - min_feat_tars[
            4:6, :, :]) + min_feat_tars[4:6, :, :]

        out_others = out_others * (max_feat_tars[6:, :, :] - min_feat_tars[
            6:, :, :]) + min_feat_tars[6:, :, :]

        out_drums = utils.overlapadd(out_drums, nchunks_in)

        out_bass = utils.overlapadd(out_bass, nchunks_in)

        out_others = utils.overlapadd(out_others, nchunks_in)

        out_vocals = utils.overlapadd(out_vocals, nchunks_in)

        out_drums = utils.inverse_stft(out_drums[:, :mix_phase.shape[1], :],
                                       mix_phase)

        out_bass = utils.inverse_stft(out_bass[:, :mix_phase.shape[1], :],
                                      mix_phase)

        out_others = utils.inverse_stft(out_others[:, :mix_phase.shape[1], :],
                                        mix_phase)

        out_vocals = utils.inverse_stft(out_vocals[:, :mix_phase.shape[1], :],
                                        mix_phase)

        # The source order must match `targets` below; the original code
        # concatenated (drums, bass, others, vocals) against targets ordered
        # (vocals, drums, bass, others), misaligning every source pair.
        estimated = np.transpose(
            np.concatenate((out_vocals, out_drums, out_bass, out_others),
                           axis=1))

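        # Zero-pad the reference stems to match the resynthesised estimates,
        # which are assumed to be at least as long after overlap-add.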
        zero_pad_drums = np.zeros(
            [abs(audio[1].shape[0] - out_drums.shape[0]), 2])

        zero_pad_bass = np.zeros(
            [abs(audio[2].shape[0] - out_bass.shape[0]), 2])

        zero_pad_others = np.zeros(
            [abs(audio[3].shape[0] - out_others.shape[0]), 2])

        zero_pad_vocals = np.zeros(
            [abs(audio[4].shape[0] - out_vocals.shape[0]), 2])

        target_drums = np.append(audio[1], zero_pad_drums, 0)

        target_bass = np.append(audio[2], zero_pad_bass, 0)

        target_others = np.append(audio[3], zero_pad_others, 0)

        target_vocals = np.append(audio[4], zero_pad_vocals, 0)

        targets = np.transpose(
            np.concatenate(
                (target_vocals, target_drums, target_bass, target_others),
                axis=1))

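        # Evaluate BSS metrics on a random six-second excerpt; exact zeros are
        # replaced with a small epsilon to keep bss_eval_images finite.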
        index = np.random.randint(0, target_vocals.shape[0] - file_length)
        targets_no_zero = targets[:, index:index + file_length]
        targets_no_zero[targets_no_zero == 0] = 1e-8

        estimated_no_zero = estimated[:, index:index + file_length]
        estimated_no_zero[estimated_no_zero == 0] = 1e-8

        [SDR, ISR, SIR, SAR,
         _] = mir_eval.separation.bss_eval_images(targets_no_zero,
                                                  estimated_no_zero)

        SDR_error.append(SDR)
        SAR_error.append(SAR)
        SIR_error.append(SIR)
        ISR_error.append(ISR)

        for sdr in SDR:
            print(sdr)
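        # Persist the running error lists after each file unless this file's
        # SDR contains NaN.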
        if not np.isnan(SDR).any():
            np.save(config.err_dir + 'SDR_error', np.array(SDR_error))
            np.save(config.err_dir + 'SAR_error', np.array(SAR_error))
            np.save(config.err_dir + 'SIR_error', np.array(SIR_error))
            np.save(config.err_dir + 'ISR_error', np.array(ISR_error))