def main(_):
    """Run audio style transfer end-to-end over each sample directory.

    For every name in the sample list, reads ``style.mp3`` and
    ``content.mp3`` from ``sample/<name>/``, converts both to spectrograms,
    stylizes the content with the style, then writes raw and post-processed
    spectrogram images plus reconstructed audio back into the same folder.

    The single ignored argument is presumably leftover argv from an
    app-runner harness (this was copied from inference-hack.py) — confirm.
    """
    base_dir = "sample"
    names = ["rolling_in_the_deep"]  # swap in other sample folders to test

    post_processor = PostProcessor()
    post_processor.load_weights("weights.h5")

    for name in names:
        console.h1("Processing %s" % name)
        timer = "total processing for " + name
        console.time(timer)

        folder = f"{base_dir}/{name}"
        style_path = f"{folder}/style.mp3"
        content_path = f"{folder}/content.mp3"

        # Style and content audio -> (amplitude, phase) spectrograms.
        style_audio, _rate = conversion.file_to_audio(style_path)
        style_img, style_phase = conversion.audio_to_spectrogram(
            style_audio, fft_window_size=1536)
        content_audio, _rate = conversion.file_to_audio(content_path)
        content_img, content_phase = conversion.audio_to_spectrogram(
            content_audio, fft_window_size=1536)

        stylized_img_raw, stylized_img = stylize(content_img, style_img,
                                                 content_phase, style_phase,
                                                 content_path, style_path,
                                                 post_processor)

        # Raw (pre-post-processing) spectrogram image and audio.
        stylized_audio_raw = conversion.amplitude_to_audio(
            stylized_img_raw, fft_window_size=1536, phase_iterations=15,
            phase=content_phase)
        conversion.image_to_file(stylized_img_raw, f"{folder}/stylized_raw.png")
        conversion.audio_to_file(stylized_audio_raw, f"{folder}/stylized_raw.mp3")

        # Post-processed spectrogram image and audio.
        stylized_audio = conversion.amplitude_to_audio(
            stylized_img, fft_window_size=1536, phase_iterations=15,
            phase=content_phase)
        conversion.image_to_file(stylized_img, f"{folder}/stylized.png")
        stylized_audio_path = f"{folder}/stylized.mp3"
        conversion.audio_to_file(stylized_audio, stylized_audio_path)

        console.timeEnd(timer)
        console.info("Finished processing %s; saved to %s" %
                     (name, stylized_audio_path))
#!/usr/bin/env python
"""One-off driver: run the trained post-processor over a previously
stylized spectrogram for the rolling_in_the_deep sample and save the result.
"""
import conversion
import console
import numpy as np
from post_processor import PostProcessor

post_processor = PostProcessor()
post_processor.load_weights("weights.h5")

stylized = conversion.file_to_image(
    "sample/rolling_in_the_deep/stylized.png")
content_harmonics = conversion.file_to_image(
    "sample/rolling_in_the_deep/content.mp3.harmonics.png")
# BUG FIX: the sibilants input previously re-loaded the *harmonics* image
# (copy-paste error), so predict_unstacked never saw real sibilant data.
# The pipeline elsewhere writes sibilants as "<content>.sibilants.jpg" —
# TODO(review): confirm that extension matches the file actually produced.
content_sibilants = conversion.file_to_image(
    "sample/rolling_in_the_deep/content.mp3.sibilants.jpg")

stylized = post_processor.predict_unstacked(
    amplitude=stylized,
    harmonics=content_harmonics,
    sibilants=content_sibilants)
conversion.image_to_file(stylized, "/Users/ollin/Desktop/boop.png")
#!/usr/bin/env python
"""Sawify a spectrogram image: replicate each strong low bin up its
harmonic series (amplitude scaled by 1/harmonic) and save "<input>.saw.png".
"""
import console
import conversion
import numpy as np
import sys
import cv2

# hacky, will replace with argparse later
img_file_path = sys.argv[1]
console.log("sawifying", img_file_path)
spectrogram = conversion.file_to_image(img_file_path)

# non-maximum suppression since im lazy and don't wanna interpolate
output = np.zeros(spectrogram.shape)
num_rows = spectrogram.shape[0]

# slow, will replace with numpy later
for row in range(64):
    console.progressBar((row + 1) / 64)
    for col in range(spectrogram.shape[1]):
        value = spectrogram[row][col]
        if value > 0.1:
            k = 1
            # Paint the k-th harmonic band [k*row, k*(row+1)) while it
            # still fits inside the image, fading by 1/k.
            while k * (row + 1) < num_rows:
                lo = k * row
                hi = k * (row + 1)
                output[lo:hi, col] = value / k
                k += 1

conversion.image_to_file(output, img_file_path + ".saw.png")
def denoise_from_file(self, file_path):
    """Load a noisy spectrogram from *file_path* (a .npy array), denoise it
    with this model, and write the result beside the input as a PNG."""
    spectrogram = np.load(file_path)
    cleaned = self.predict(spectrogram)
    out_path = file_path + ".denoised.png"
    conversion.image_to_file(cleaned, out_path)
# NOTE(review): this chunk starts mid-literal — the `test_files = [` opening
# (and any earlier entries) is above this view.
"../data/aligned/one_last_time/one_last_time_cover_aligned_30s.mp3",
    "../data/aligned/one_last_time/one_last_time_original_30s.mp3"
]
#test_files = ["sample/rolling_in_the_deep/style.mp3"]

# For each test clip: load audio, build a spectrogram, extract the
# fundamental, synthesize harmonics from it, and (always, via the `if True`
# debug switch) pitch-normalize; each stage is timed and its image is
# dumped next to the input file.
for f in test_files:
    console.time("preprocessing")
    console.log("starting", f)
    # Decode audio and convert to (amplitude, phase) spectrograms.
    audio, sample_rate = conversion.file_to_audio(f)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    console.timeEnd("preprocessing")

    console.time("extracting fundamental")
    fundamental_mask = sst.extract_fundamental(amplitude)
    console.timeEnd("extracting fundamental")
    conversion.image_to_file(fundamental_mask, f + ".fundamental.png")

    console.time("fundamental to harmonics")
    # Per-timestep fundamental frequency/amplitude, then synthetic harmonics.
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    harmonics = sst.fundamental_to_harmonics(fundamental_freqs,
                                             fundamental_amps, amplitude)
    console.timeEnd("fundamental to harmonics")
    conversion.image_to_file(harmonics, f + ".harmonics.png")

    # pitch normalization haha
    if True:  # always-on debug branch; kept as written
        pitch_normalized_amp, pitch_normalized_phase = sst.normalize_pitch(
            amplitude, phase, fundamental_freqs, fundamental_amps)
        conversion.image_to_file(pitch_normalized_amp,
                                 f + ".pitch_normalized.png")
def denoise_from_file(self, file_path):
    """Load a channel-stacked array from *file_path* (a .npy file whose last
    axis holds at least three planes), run the unstacked predictor on its
    first three channels, and save the result beside the input as a PNG."""
    stacked = np.load(file_path)
    channels = [stacked[:, :, i] for i in range(3)]
    denoised = self.predict_unstacked(*channels)
    conversion.image_to_file(denoised, file_path + ".denoised.png")
# NOTE(review): chunk starts mid-call — the opening
# `... = conversion.audio_to_spectrogram(` is above this view.
style_audio, fft_window_size=1536)
console.timeEnd("preprocessing")

# Output buffer with the same shape as the content spectrogram.
stylized_amplitude = np.zeros(content_amplitude.shape)
# Shape unpack implies a 3-axis (freq, time, channels) array.
num_freq, num_timesteps, _ = content_amplitude.shape
# Only process the overlap where both clips have frames.
num_timesteps = min(num_timesteps, style_amplitude.shape[1])

# Preprocessing - compute fundamentals and harmonics
console.time("super resolution")
content_fundamental_mask = sst.extract_fundamental(content_amplitude)
content_fundamental_freqs, content_fundamental_amps = sst.extract_fundamental_freqs_amps(
    content_fundamental_mask, content_amplitude)
content_sibilants = sst.get_sibilants(content_amplitude,
                                      content_fundamental_amps)
# Debug dump of the sibilant map next to the content file.
conversion.image_to_file(content_sibilants, test_content_file + ".sibilants.jpg")
console.log("finished sibilants")
content_harmonics = sst.fundamental_to_harmonics(content_fundamental_freqs,
                                                 content_fundamental_amps,
                                                 content_amplitude)
# `dilation` is presumably a morphological dilation (e.g. skimage) imported
# above this view — TODO(review): confirm.
content_harmonics = dilation(content_harmonics)
# Rescale both feature maps so their peaks match the content's peak amplitude.
content_sibilants *= content_amplitude.max() / content_sibilants.max()
console.stats(content_sibilants, "content sibilants")
content_harmonics *= content_amplitude.max() / content_harmonics.max()
console.stats(content_harmonics, "content harmonics")
console.timeEnd("super resolution")

console.time("frequency weighting")
# ELEMENT 1: Frequency weighting
# NOTE(review): the loop body continues past this chunk.
for t in range(num_timesteps):