import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model  # or plain `keras.models`, depending on the install
from scipy.io import wavfile

import utils  # repo-local helpers: fast_icRM, fast_istft

# PART_ONE_RANGE, PART_TWO_RANGE, MODEL_PATH, dir_path_mix, dir_path_pred,
# gamma_loss, beta_loss, num_people and audio_loss are expected to come from
# the surrounding module/config.


def run_predict(part_one=PART_ONE_RANGE, part_two=PART_TWO_RANGE):
    # Load the pretrained audio-only model; the custom loss must be passed in
    # custom_objects so Keras can deserialize the checkpoint.
    loss_func = audio_loss(gamma=gamma_loss, beta=beta_loss, num_speaker=num_people)
    AO_model = load_model(MODEL_PATH,
                          custom_objects={"tf": tf, 'loss_func': loss_func})

    # Load audio data
    loaded_file = 0
    for i, j in zip(range(part_one[0], part_one[1]),
                    range(part_two[0], part_two[1])):
        try:
            audio_data = np.load(dir_path_mix + "mix-%05d-%05d.npy" % (i, j))
            loaded_file += 1
            print(audio_data.shape)

            # Check shape: first dim should be 298; trim if longer, zero-pad if shorter.
            audio_data = audio_data[:298]
            if len(audio_data) < 298:
                a = np.zeros((298, 257, 2))
                a[:len(audio_data), :, :] = audio_data
                audio_data = a
            print(audio_data.shape)

            mix_expand = np.expand_dims(audio_data, axis=0)
            print(mix_expand.shape)
            print("===== Completed processing audio =====")

            # Predict the complex ratio masks (cRMs), one per speaker.
            cRMs = AO_model.predict(mix_expand)
            cRMs = cRMs[0]
            print("===== Completed predicting cRMs =====")

            # Apply each mask to the mixture and save the result as wav.
            for k in range(num_people):
                cRM = cRMs[:, :, :, k]
                assert cRM.shape == (298, 257, 2)
                F = utils.fast_icRM(audio_data, cRM)
                T = utils.fast_istft(F, power=False)
                filename = dir_path_pred + '%05d-%05d_pred_output%d.wav' % (i, j, k)
                wavfile.write(filename, 16000, T)
                print("%05d-%05d_pred_output%d.wav created" % (i, j, k))
            print("===== Completed saving output =====\n")
        except FileNotFoundError:
            print('mix-%05d-%05d.npy is not found' % (i, j))
    print('num of processed audio : %d' % loaded_file)
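# `audio_loss` is defined elsewhere in this repo; load_model() only needs the
# returned closure registered under 'loss_func' to deserialize the checkpoint.
# A minimal, hypothetical stand-in (the real loss presumably uses gamma/beta
# and num_speaker to weight the per-speaker mask errors differently):
def audio_loss_sketch(gamma=0.1, beta=0.2, num_speaker=2):
    # gamma/beta defaults here are placeholders, not the repo's actual values
    def loss_func(y_true, y_pred):
        # plain mean squared error over the predicted cRMs of all speakers
        return tf.reduce_mean(tf.square(y_true - y_pred))
    return loss_func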
def run_predict(video_name=VIDEO_NAME):
    # Load audio data
    audio_data = np.load('preprocessed-%s.npy' % video_name)
    print(audio_data.shape)

    # Check shape: first dim should be 298; trim if longer, zero-pad if shorter.
    audio_data = audio_data[:298]
    if len(audio_data) < 298:
        a = np.zeros((298, 257, 2))
        a[:len(audio_data), :, :] = audio_data
        audio_data = a
    print(audio_data.shape)

    mix_expand = np.expand_dims(audio_data, axis=0)
    print(mix_expand.shape)

    # Load visual data: one 1792-dim face embedding per frame (75 frames) per speaker.
    face_embs = np.zeros((1, 75, 1, 1792, num_people))
    print(face_embs.shape)
    for i in range(num_people):
        try:
            # face_embs[1, :, :, :, i] = np.load(dir_path_face_embs + "%s_face_emb.npy" % single_idxs[i])
            # Currently does not use the correct face input for both speakers
            # (uses the same images for both right now).
            face_embs[0, :, :, :, i] = np.load(
                dir_path_face_embs + "%s_face_emb_p%d.npy" % (video_name, i))
        except Exception as e:
            print('No face embedding for speaker', i, "\n", e)
            # TODO: use Google Vision AI to find the face embedding for each speaker

    # Load pretrained model
    loss_func = audio_loss(gamma=gamma_loss, beta=beta_loss, num_speaker=num_people)
    AV_model = load_model(MODEL_PATH,
                          custom_objects={"tf": tf, 'loss_func': loss_func})

    # Predict data
    cRMs = AV_model.predict([mix_expand, face_embs])
    cRMs = cRMs[0]

    # Save output as wav
    for j in range(num_people):
        cRM = cRMs[:, :, :, j]
        assert cRM.shape == (298, 257, 2)
        F = utils.fast_icRM(audio_data, cRM)
        T = utils.fast_istft(F, power=False)
        filename = dir_path_pred + 'pred_%s_output%d.wav' % (video_name, j)
        wavfile.write(filename, 16000, T)
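# utils.fast_icRM / utils.fast_istft live in this repo's utils module and are not
# shown here. A minimal sketch of the core idea, assuming the (298, 257, 2) arrays
# store real/imaginary STFT parts in the last axis: applying a complex ratio mask
# M to the mixture spectrogram Y is an element-wise complex multiplication S = M*Y.
# (The repo's fast_icRM may additionally decompress the network's mask output first.)
import numpy as np

def apply_crm_sketch(Y, M):
    # complex multiply with real parts in [..., 0] and imaginary parts in [..., 1]
    S = np.zeros_like(Y)
    S[:, :, 0] = M[:, :, 0] * Y[:, :, 0] - M[:, :, 1] * Y[:, :, 1]
    S[:, :, 1] = M[:, :, 0] * Y[:, :, 1] + M[:, :, 1] * Y[:, :, 0]
    return S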
import librosa
import numpy as np

# `avp` is the repo's preprocessing module providing fast_stft / fast_istft.

SHAPE_CHECK = 0
MODEL_CHECK1 = 0
MODEL_CHECK2 = 1

data1, sr1 = librosa.load('../../data/audio/audio_train/trim_audio_train0.wav', sr=16000)
data2, sr2 = librosa.load('../../data/audio/audio_train/trim_audio_train1.wav', sr=16000)
mix = data1 * 0.5 + data2 * 0.5

# Check shape: STFT then inverse STFT should round-trip the signal length.
if SHAPE_CHECK:
    print(data1.shape)
    D1 = avp.fast_stft(data1)
    print(D1.shape)
    T1 = avp.fast_istft(D1)
    print(T1.shape)

# Check model: build magnitude ratio masks for the two sources.
if MODEL_CHECK1:
    F1 = avp.fast_stft(data1)
    F1 = np.expand_dims(F1, axis=0)
    F2 = avp.fast_stft(data2)
    F2 = np.expand_dims(F2, axis=0)
    FM = avp.fast_stft(mix)
    FM = np.expand_dims(FM, axis=0)
    # Zero out NaN/inf entries produced by dividing where the mixture is silent.
    cRM1 = np.abs(F1) / np.abs(FM)
    cRM1[~np.isfinite(cRM1)] = 0
    cRM2 = np.abs(F2) / np.abs(FM)
    cRM2[~np.isfinite(cRM2)] = 0
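# The MODEL_CHECK1 block above only builds magnitude ratio masks (|S| / |Y|).
# A true complex ratio mask also carries phase, via complex division M = S / Y.
# Sketch under the same real/imag channel layout as above:
def complex_ratio_mask_sketch(S, Y, eps=1e-8):
    # (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c^2+d^2)
    denom = Y[..., 0] ** 2 + Y[..., 1] ** 2 + eps
    M = np.zeros_like(S)
    M[..., 0] = (S[..., 0] * Y[..., 0] + S[..., 1] * Y[..., 1]) / denom  # real part
    M[..., 1] = (S[..., 1] * Y[..., 0] - S[..., 0] * Y[..., 1]) / denom  # imag part
    return M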
AV_model = load_model(model_path, custom_objects={"tf": tf})

if NUM_GPU > 1:
    parallel_model = ModelMGPU(AV_model, NUM_GPU)
    for line in testfiles:
        mix, single_idxs, face_embs = parse_X_data(line)
        mix_expand = np.expand_dims(mix, axis=0)
        cRMs = parallel_model.predict([mix_expand, face_embs])
        cRMs = cRMs[0]
        prefix = ""
        for idx in single_idxs:
            prefix += idx + "-"
        # Iterate over the people_num speaker masks in the last axis of cRMs,
        # shape (298, 257, 2, people_num); len(cRMs) would wrongly give the
        # 298 time frames.
        for i in range(people_num):
            cRM = cRMs[:, :, :, i]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(mix, cRM)
            T = utils.fast_istft(F, power=False)
            filename = dir_path + prefix + str(single_idxs[i]) + '.wav'
            wavfile.write(filename, 16000, T)

if NUM_GPU <= 1:
    for line in testfiles:
        mix, single_idxs, face_embs = parse_X_data(line)
        mix_expand = np.expand_dims(mix, axis=0)
        cRMs = AV_model.predict([mix_expand, face_embs])
        cRMs = cRMs[0]
        prefix = ""
        for idx in single_idxs:
            prefix += idx + "-"
        for i in range(people_num):
            cRM = cRMs[:, :, :, i]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(mix, cRM)
            T = utils.fast_istft(F, power=False)
            filename = dir_path + prefix + str(single_idxs[i]) + '.wav'
            wavfile.write(filename, 16000, T)
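# The two NUM_GPU branches above duplicate the whole inference loop. An
# equivalent single loop (same behavior, using the same parse_X_data,
# people_num, dir_path, etc. defined in this script):
model = ModelMGPU(AV_model, NUM_GPU) if NUM_GPU > 1 else AV_model
for line in testfiles:
    mix, single_idxs, face_embs = parse_X_data(line)
    cRMs = model.predict([np.expand_dims(mix, axis=0), face_embs])[0]
    prefix = "-".join(single_idxs) + "-"
    for i in range(people_num):
        F = utils.fast_icRM(mix, cRMs[:, :, :, i])
        T = utils.fast_istft(F, power=False)
        wavfile.write(dir_path + prefix + str(single_idxs[i]) + '.wav', 16000, T)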
print(np.unique(seperated_speech_spectogram))

# Magnitude spectrograms (sqrt of real^2 + imag^2) for the separated output
# and the ground truth, saved as images for visual comparison.
mag_plot = np.sqrt(seperated_speech_spectogram[:, :, 0] * seperated_speech_spectogram[:, :, 0] +
                   seperated_speech_spectogram[:, :, 1] * seperated_speech_spectogram[:, :, 1])
plt.imsave('mag_' + str(n) + '_' + str(i) + '.png', mag_plot)
single_plot = np.sqrt(single_audio_npy_list[i][:, :, 0] * single_audio_npy_list[i][:, :, 0] +
                      single_audio_npy_list[i][:, :, 1] * single_audio_npy_list[i][:, :, 1])
plt.imsave('mag_' + str(n) + '_' + str(i) + 'GT' + '.png', single_plot)

T = utils.fast_istft(seperated_speech_spectogram, power=True)
print(np.count_nonzero(seperated_speech_spectogram), seperated_speech_spectogram.size)

# Count how many spectrogram values fall inside (-1, 1) ...
mix_1 = np.where(seperated_speech_spectogram > -1, seperated_speech_spectogram, 0)
mix_1 = np.where(mix_1 < 1, mix_1, 0)
print(np.count_nonzero(mix_1), mix_1.size)
# ... and inside (-0.5, 0.5).
mix_1 = np.where(seperated_speech_spectogram > -0.5, seperated_speech_spectogram, 0)
mix_1 = np.where(mix_1 < 0.5, mix_1, 0)
print(np.count_nonzero(mix_1), mix_1.size)
# mix_ = mix.cpu().detach().numpy().transpose(0,2,3,1).reshape(298,257,2)/2
# T = utils.fast_istft(mix_,power=True)
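# The paired np.where calls above count values inside (-1, 1) and (-0.5, 0.5).
# A boolean mask does the same in one expression (it additionally counts exact
# zeros that fall inside the range, which np.count_nonzero otherwise skips):
in_range = np.count_nonzero(
    (seperated_speech_spectogram > -1) & (seperated_speech_spectogram < 1))
print(in_range, seperated_speech_spectogram.size)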