def _single_conversion(filename, model, one_hot_emo):
    '''
    THIS WON'T WORK RIGHT NOW, USE THE WORLD CONVERSION LOOP IN MAIN

    Call only from the __main__ section of this module. Generates the given
    sample converted into the target emotion.

    (str) filename - name of the .wav file to convert
    (StarGAN-emo-VC1) model - pretrained model that performs the conversion
    (torch.Tensor(long)) one_hot_emo - one-hot encoding of the target emotion
    '''
    wav, labels = pp.get_wav_and_labels(filename,
                                        config['data']['dataset_dir'])
    wav = np.array(wav, dtype=np.double)

    f0, ap, sp, coded_sp = preprocess_world.cal_mcep(wav)

    coded_sp = coded_sp.T

    # Add batch and channel dimensions for the generator's 2D-conv input.
    coded_sp_torch = torch.Tensor(coded_sp).unsqueeze(0).unsqueeze(0).to(
        device=device)

    fake = model.G(coded_sp_torch, one_hot_emo.unsqueeze(0))
    fake = fake.squeeze()

    print("Sampled size = ", fake.size())

    converted_sp = fake.cpu().detach().numpy()
    converted_sp = np.array(converted_sp, dtype=np.float64)

    sample_length = converted_sp.shape[0]
    if sample_length != ap.shape[0]:
        ap = np.ascontiguousarray(ap[0:sample_length, :], dtype=np.float64)
        f0 = np.ascontiguousarray(f0[0:sample_length], dtype=np.float64)

    # Trim edge frames to avoid boundary artifacts. All feature streams must
    # keep the same frame count or WORLD synthesis will fail.
    f0 = np.ascontiguousarray(f0[20:-20], dtype=np.float64)
    ap = np.ascontiguousarray(ap[20:-20, :], dtype=np.float64)
    converted_sp = np.ascontiguousarray(converted_sp[20:-20, :],
                                        dtype=np.float64)

    coded_sp = np.ascontiguousarray(coded_sp[20:-20, :], dtype=np.float64)

    target = int(torch.argmax(one_hot_emo).item())
    out_name = filename[:-4] + str(labels[1]) + "to" + str(target) + ".wav"

    audio_utils.save_world_wav([f0, ap, sp, converted_sp],
                               model.name + '_converted', out_name)
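
# Below is a minimal sketch (not the project's actual code) of the WORLD
# analysis step that preprocess_world.cal_mcep is expected to perform, using
# the pyworld package; fs, frame_period and the mel-cepstral dimension are
# assumptions.
import numpy as np
import pyworld

def cal_mcep_sketch(wav, fs=16000, dim=36, frame_period=5.0):
    wav = np.asarray(wav, dtype=np.float64)
    # Pitch contour (f0) plus the frame time axis.
    f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)
    # Smoothed spectral envelope and band aperiodicity.
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    ap = pyworld.d4c(wav, f0, timeaxis, fs)
    # Compress the envelope to `dim` mel-cepstral coefficients, shape (T, dim).
    coded_sp = pyworld.code_spectral_envelope(sp, fs, dim)
    return f0, ap, sp, coded_sp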

########################################
#              EXAMPLE 2               #
########################################

    def sample_world(self):
        '''
        Passes each performance sample through G for every target emotion and
        saves the result to
        'config(sample_dir)/model_name/filename_<src>to<trg>_i=<iter>.wav'.
        '''

        print("Saving world samples...")

        self.model.to_device(device=self.device)
        self.model.set_eval_mode()

        # Make a one-hot vector for each emotion category.
        num_emos = self.config['model']['num_classes']
        emo_labels = torch.arange(num_emos).long()
        emo_targets = F.one_hot(
            emo_labels, num_classes=num_emos).float().to(device=self.device)

        for tag, val in self.sample_set.get_set().items():
            # tag is the filename; val is [f0, ap, sp, coded_sp, labels]

            f0_real = np.copy(val[0])
            ap_real = np.copy(val[1])
            sp = np.copy(val[2])
            labels = val[4].clone()

            # Add batch and channel dimensions for the generator's 2D-conv input.
            coded_sp = val[3].clone().unsqueeze(0).unsqueeze(0).to(
                device=self.device)

            with torch.no_grad():
                for i in range(0, emo_targets.size(0)):

                    f0 = np.copy(f0_real)
                    ap = np.copy(ap_real)

                    fake = self.model.G(coded_sp, emo_targets[i].unsqueeze(0))

                    filename_wav = tag[0:-4] + "_" + \
                        str(int(labels[0].item())) + "to" + \
                        str(emo_labels[i].item()) + "_i=" + \
                        str(self.current_iter) + ".wav"

                    fake = fake.squeeze()
                    print("Sampled size = ", fake.size())
                    converted_sp = fake.cpu().numpy()
                    converted_sp = np.array(converted_sp, dtype=np.float64)

                    sample_length = converted_sp.shape[0]
                    if sample_length != ap.shape[0]:
                        ap = np.ascontiguousarray(ap[0:sample_length, :],
                                                  dtype=np.float64)
                        f0 = np.ascontiguousarray(f0[0:sample_length],
                                                  dtype=np.float64)

                    f0 = np.ascontiguousarray(f0[40:-40], dtype=np.float64)
                    ap = np.ascontiguousarray(ap[40:-40, :], dtype=np.float64)
                    converted_sp = np.ascontiguousarray(
                        converted_sp[40:-40, :], dtype=np.float64)

                    # print("ap shape = ", val[1].shape)
                    # print("f0 shape = ", val[0].shape)

                    audio_utils.save_world_wav([f0, ap, sp, converted_sp],
                                               self.model_name, filename_wav)
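
# A minimal sketch (assumptions only; not the project's actual audio_utils) of
# how save_world_wav can resynthesise a waveform from the
# [f0, ap, sp, converted_sp] feature list with pyworld; fs and frame_period
# are assumed values, and sp is unused here (the real helper may use it).
import os
import numpy as np
import pyworld
import soundfile as sf

def save_world_wav_sketch(feats, out_dir, filename, fs=16000, frame_period=5.0):
    f0, ap, sp, coded_sp = feats
    # Aperiodicity has fft_size // 2 + 1 bins per frame.
    fft_size = (ap.shape[1] - 1) * 2
    # Decode the low-dimensional mel-cepstral envelope back to a full
    # spectral envelope before synthesis.
    decoded_sp = pyworld.decode_spectral_envelope(
        np.ascontiguousarray(coded_sp, dtype=np.float64), fs, fft_size)
    wav = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period)
    os.makedirs(out_dir, exist_ok=True)
    sf.write(os.path.join(out_dir, filename), wav, fs)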

########################################
#              EXAMPLE 3               #
########################################

                sample_length = converted_sp.shape[0]
                if sample_length != ap.shape[0]:
                    ap = np.ascontiguousarray(ap[0:sample_length, :],
                                              dtype=np.float64)
                    f0 = np.ascontiguousarray(f0[0:sample_length],
                                              dtype=np.float64)

                # Trim edge frames; all streams keep the same frame count.
                f0 = np.ascontiguousarray(f0[20:-20], dtype=np.float64)
                ap = np.ascontiguousarray(ap[20:-20, :], dtype=np.float64)
                converted_sp = np.ascontiguousarray(converted_sp[20:-20, :],
                                                    dtype=np.float64)

                print(converted_sp.shape)
                it = str(args.iteration)[0:3]
                audio_utils.save_world_wav([f0, ap, sp, converted_sp],
                                           args.out_dir + "_" + it,
                                           filename_wav)

        if (file_num+1) % 20 == 0:
            print(file_num+1, " done.")
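
# The fragment above is excerpted from a whole-dataset conversion loop whose
# opening lines are missing. A minimal sketch of the enclosing structure,
# mirroring _single_conversion and sample_world above; `files`, `args.data_dir`
# and the source-label index are assumptions:
with torch.no_grad():
    for file_num, f in enumerate(files):
        wav, labels = pp.get_wav_and_labels(f, args.data_dir)
        f0, ap, sp, coded_sp = preprocess_world.cal_mcep(
            np.array(wav, dtype=np.double))
        coded_sp_torch = torch.Tensor(coded_sp.T).unsqueeze(0).unsqueeze(0).to(
            device=device)
        for i in range(emo_targets.size(0)):
            fake = model.G(coded_sp_torch, emo_targets[i].unsqueeze(0)).squeeze()
            converted_sp = np.array(fake.cpu().numpy(), dtype=np.float64)
            filename_wav = f[0:-4] + "_" + str(int(labels[1])) + "to" + \
                           str(i) + ".wav"
            # ... trimming and save_world_wav as in the fragment above ...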

    ########################################
    #         MEL CONVERSION LOOP          #
    ########################################
    ### NEVER IMPLEMENTED: MEL SPECTROGRAMS ENDED UP NOT BEING USED
    # Make .npy arrays
    # Make audio
    # Make spec plots

    # Save all to directory