Example #1
def run_predict(part_one=PART_ONE_RANGE, part_two=PART_TWO_RANGE):
    """Run the audio-only (AO) model over the mixed samples and write separated wavs."""
    # Load pretrained model
    loss_func = audio_loss(gamma=gamma_loss,
                           beta=beta_loss,
                           num_speaker=num_people)
    AO_model = load_model(MODEL_PATH,
                          custom_objects={
                              "tf": tf,
                              'loss_func': loss_func
                          })
    # Load audio data
    loaded_file = 0
    for i, j in zip(range(part_one[0], part_one[1]),
                    range(part_two[0], part_two[1])):
        try:
            audio_data = np.load(dir_path_mix + "mix-%05d-%05d.npy" % (i, j))
            loaded_file += 1
            print(audio_data.shape)
            # Check shape - first dim should be 298
            audio_data = audio_data[:298]
            if len(audio_data) < 298:
                a = np.zeros((298, 257, 2))
                a[:len(audio_data), :, :] = audio_data
                audio_data = a
            print(audio_data.shape)
            mix_expand = np.expand_dims(audio_data, axis=0)
            print(mix_expand.shape)

            print("===== Completed processing audio =====")
            # Predict data
            cRMs = AO_model.predict(mix_expand)
            cRMs = cRMs[0]

            print("===== Completed predicting cRMs =====")
            # Save output as wav
            for k in range(num_people):
                cRM = cRMs[:, :, :, k]
                assert cRM.shape == (298, 257, 2)
                F = utils.fast_icRM(audio_data, cRM)
                T = utils.fast_istft(F, power=False)
                filename = dir_path_pred + '%05d-%05d_pred_output%d.wav' % (
                    i, j, k)
                wavfile.write(filename, 16000, T)
                print("%05d-%05d_pred_output%d.wav created" % (i, j, k))

            print("===== Completed saving output ===== \n")

        except FileNotFoundError:
            print('mix-%05d-%05d.npy is not found' % (i, j))

    print('number of processed audio files: %d' % loaded_file)
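

# A minimal sketch of what utils.fast_icRM plausibly computes: applying a
# complex ratio mask (cRM) to the mixture STFT by complex multiplication,
# with real and imaginary parts stored in the last axis of the (298, 257, 2)
# arrays used above. This is an assumption based on the cRM literature, not
# the repo's verified implementation (which may also undo a mask compression).
def apply_crm_sketch(mix_stft, crm):
    # (real_m + i*imag_m) * (real_c + i*imag_c), kept in the (..., 2) layout
    real = mix_stft[..., 0] * crm[..., 0] - mix_stft[..., 1] * crm[..., 1]
    imag = mix_stft[..., 0] * crm[..., 1] + mix_stft[..., 1] * crm[..., 0]
    return np.stack([real, imag], axis=-1)
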
def run_predict(video_name=VIDEO_NAME):
    """Run the audio-visual (AV) model on one preprocessed video and write separated wavs."""
    # Load audio data
    audio_data = np.load('preprocessed-%s.npy' % video_name)
    print(audio_data.shape)
    # TODO: check shape - first dim should be 298
    audio_data = audio_data[:298]
    if len(audio_data) < 298:
        a = np.zeros((298, 257, 2))
        a[:len(audio_data), :, :] = audio_data
        audio_data = a
    print(audio_data.shape)
    mix_expand = np.expand_dims(audio_data, axis=0)
    print(mix_expand.shape)
    # Load visual data
    face_embs = np.zeros((1, 75, 1, 1792, num_people))
    print(face_embs.shape)
    for i in range(num_people):
        try:
            # face_embs[1,:,:,:,i] = np.load(dir_path_face_embs+"%s_face_emb.npy"%single_idxs[i])
            # Currently does not use the correct face input for both speakers
            # (uses the same images for both right now).
            face_embs[0, :, :, :, i] = np.load(
                dir_path_face_embs + "%s_face_emb_p%d.npy" % (video_name, i))
        except Exception as e:
            print('No face embedding for speaker', i, "\n", e)
    # TODO: use Google Vision AI to find the face embedding for each speaker

    # Load pretrained model
    loss_func = audio_loss(gamma=gamma_loss,
                           beta=beta_loss,
                           num_speaker=num_people)
    AV_model = load_model(MODEL_PATH,
                          custom_objects={
                              "tf": tf,
                              'loss_func': loss_func
                          })

    # Predict data
    cRMs = AV_model.predict([mix_expand, face_embs])
    cRMs = cRMs[0]

    # Save output as wav
    for j in range(num_people):
        cRM = cRMs[:, :, :, j]
        assert cRM.shape == (298, 257, 2)
        F = utils.fast_icRM(audio_data, cRM)
        T = utils.fast_istft(F, power=False)
        filename = dir_path_pred + 'pred_%s_output%d.wav' % (video_name, j)
        wavfile.write(filename, 16000, T)
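

# The trim-or-pad step above appears in both run_predict variants; a small
# helper (hypothetical, not part of the repo) could factor it out:
def fit_to_frames(spec, n_frames=298, bin_shape=(257, 2)):
    # Trim to n_frames along the time axis, then zero-pad if the clip is shorter
    spec = spec[:n_frames]
    if len(spec) < n_frames:
        padded = np.zeros((n_frames,) + bin_shape)
        padded[:len(spec)] = spec
        spec = padded
    return spec
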
SHAPE_CHECK = 0
MODEL_CHECK1 = 0
MODEL_CHECK2 = 1

data1, sr1 = librosa.load('../../data/audio/audio_train/trim_audio_train0.wav',
                          sr=16000)
data2, sr2 = librosa.load('../../data/audio/audio_train/trim_audio_train1.wav',
                          sr=16000)
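# Equal 0.5/0.5 weights keep the summed waveform within the original
# amplitude range before the mixture STFT is computed.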
mix = data1 * 0.5 + data2 * 0.5

if SHAPE_CHECK:
    # check shape
    print(data1.shape)
    D1 = avp.fast_stft(data1)
    print(D1.shape)
    T1 = avp.fast_istft(D1)
    print(T1.shape)

# check model
if MODEL_CHECK1:
    F1 = avp.fast_stft(data1)
    F1 = np.expand_dims(F1, axis=0)
    F2 = avp.fast_stft(data2)
    F2 = np.expand_dims(F2, axis=0)
    FM = avp.fast_stft(mix)
    FM = np.expand_dims(FM, axis=0)

    # Element-wise |F| / |FM| ratios over the real/imag channels; bins where
    # FM is zero give inf/NaN and are zeroed out below
    cRM1 = np.abs(F1) / np.abs(FM)
    cRM1[~np.isfinite(cRM1)] = 0
    cRM2 = np.abs(F2) / np.abs(FM)
    cRM2[~np.isfinite(cRM2)] = 0
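
# MODEL_CHECK1 uses element-wise ratios of the real/imag channels; the model
# in these examples predicts complex ratio masks in the same (298, 257, 2)
# layout. A hedged sketch of the ideal complex mask S/M (my assumption,
# with the complex division written out on the real/imag channels):
def ideal_crm_sketch(source_stft, mix_stft, eps=1e-8):
    # (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2)
    denom = mix_stft[..., 0] ** 2 + mix_stft[..., 1] ** 2 + eps
    real = (source_stft[..., 0] * mix_stft[..., 0] +
            source_stft[..., 1] * mix_stft[..., 1]) / denom
    imag = (source_stft[..., 1] * mix_stft[..., 0] -
            source_stft[..., 0] * mix_stft[..., 1]) / denom
    return np.stack([real, imag], axis=-1)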
Example #4
AV_model = load_model(model_path, custom_objects={"tf": tf})
if NUM_GPU > 1:
    parallel_model = ModelMGPU(AV_model, NUM_GPU)
    for line in testfiles:
        mix, single_idxs, face_embs = parse_X_data(line)
        mix_expand = np.expand_dims(mix, axis=0)
        cRMs = parallel_model.predict([mix_expand, face_embs])
        cRMs = cRMs[0]
        prefix = ""
        for idx in single_idxs:
            prefix += idx + "-"
        # Iterate over speakers (the last axis of cRMs), not over time frames
        for i in range(cRMs.shape[-1]):
            cRM = cRMs[:, :, :, i]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(mix, cRM)
            T = utils.fast_istft(F, power=False)
            filename = dir_path + prefix + str(single_idxs[i]) + '.wav'
            wavfile.write(filename, 16000, T)

if NUM_GPU <= 1:
    for line in testfiles:
        mix, single_idxs, face_embs = parse_X_data(line)
        mix_expand = np.expand_dims(mix, axis=0)
        cRMs = AV_model.predict([mix_expand, face_embs])
        cRMs = cRMs[0]
        prefix = ""
        for idx in single_idxs:
            prefix += idx + "-"
        for i in range(people_num):
            cRM = cRMs[:, :, :, i]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(mix, cRM)
            T = utils.fast_istft(F, power=False)
            filename = dir_path + prefix + str(single_idxs[i]) + '.wav'
            wavfile.write(filename, 16000, T)
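
# ModelMGPU is not defined in this snippet. A common community pattern it
# likely follows (an assumption, not the repo's verified code) wraps
# keras.utils.multi_gpu_model while routing save/load to the single-GPU
# template model so checkpoints stay portable:
from keras import Model
from keras.utils import multi_gpu_model

class ModelMGPUSketch(Model):
    def __init__(self, ser_model, gpus):
        pmodel = multi_gpu_model(ser_model, gpus)
        self.__dict__.update(pmodel.__dict__)
        self._smodel = ser_model

    def __getattribute__(self, attrname):
        # Delegate save/load methods to the template model so saved weights
        # are in single-GPU form
        if 'load' in attrname or 'save' in attrname:
            return getattr(self._smodel, attrname)
        return super().__getattribute__(attrname)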
Example #5
            print(np.unique(seperated_speech_spectogram))

            mag_plot = np.sqrt(seperated_speech_spectogram[:, :, 0] *
                               seperated_speech_spectogram[:, :, 0] +
                               seperated_speech_spectogram[:, :, 1] *
                               seperated_speech_spectogram[:, :, 1])
            plt.imsave('mag_' + str(n) + '_' + str(i) + '.png', mag_plot)

            single_plot = np.sqrt(single_audio_npy_list[i][:, :, 0] *
                                  single_audio_npy_list[i][:, :, 0] +
                                  single_audio_npy_list[i][:, :, 1] *
                                  single_audio_npy_list[i][:, :, 1])
            plt.imsave('mag_' + str(n) + '_' + str(i) + 'GT' + '.png',
                       single_plot)

            T = utils.fast_istft(seperated_speech_spectogram, power=True)

            print(np.count_nonzero(seperated_speech_spectogram),
                  seperated_speech_spectogram.size)
            # Diagnostic: count the nonzero values lying inside (-1, 1)
            mix_1 = np.where(seperated_speech_spectogram > -1,
                             seperated_speech_spectogram, 0)
            mix_1 = np.where(mix_1 < 1, mix_1, 0)
            print(np.count_nonzero(mix_1), mix_1.size)
            # Diagnostic: count the nonzero values lying inside (-0.5, 0.5)
            mix_1 = np.where(seperated_speech_spectogram > -0.5,
                             seperated_speech_spectogram, 0)
            mix_1 = np.where(mix_1 < 0.5, mix_1, 0)
            print(np.count_nonzero(mix_1), mix_1.size)

            #             mix_ = mix.cpu().detach().numpy().transpose(0,2,3,1).reshape(298,257,2)/2
            #             T = utils.fast_istft(mix_,power=True)
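
# The two magnitude-spectrogram computations above are just the L2 norm over
# the real/imag channel axis; an equivalent numpy one-liner:
def magnitude_sketch(spec_ri):
    # spec_ri has shape (T, F, 2); returns the (T, F) magnitude spectrogram
    return np.linalg.norm(spec_ri, axis=-1)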