def get_feature_array(file_path):
    # Run the network on the input file via dvc_call, which writes its
    # features to npy_file_path; load and return the resulting array.
    npy_file_path = "/tmp/features.npy"
    dvc_call(file_path, npy_file_path)
    network_output = np.load(npy_file_path)
    console.stats(network_output, "network output")
    return network_output
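# A minimal temp-safe variant (assumption: dvc_call accepts any writable
# output path). Using a unique file instead of the fixed /tmp/features.npy
# keeps two concurrent runs from clobbering each other's features.
import os
import tempfile

def get_feature_array_tmpsafe(file_path):
    fd, npy_file_path = tempfile.mkstemp(suffix=".npy")
    os.close(fd)  # mkstemp opens the file; we only need the reserved path
    try:
        dvc_call(file_path, npy_file_path)
        network_output = np.load(npy_file_path)
        console.stats(network_output, "network output")
        return network_output
    finally:
        os.remove(npy_file_path)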
def main(_):
    initialize_globals()
    # scrape data from the raw-acapella folder
    RAW_DATA_DIR = "../data/studio_acapellas"
    PROCESSED_DATA_DIR = "../data/processed"
    # for each mp3, generate the data arrays using sst methods and save them
    for file_name in os.listdir(RAW_DATA_DIR):
        file_path = os.path.join(RAW_DATA_DIR, file_name)
        if file_path.endswith(".mp3"):
            # swap the extension; splitext keeps an "mp3" substring elsewhere
            # in the name from being mangled
            processed_file_name = os.path.splitext(file_name)[0] + ".npy"
            # todo: rewrite all this using pathlib
            processed_file_path_x = PROCESSED_DATA_DIR + "/x/" + processed_file_name
            processed_file_path_y = PROCESSED_DATA_DIR + "/y/" + processed_file_name
            processed_file_path_style = PROCESSED_DATA_DIR + "/style/" + processed_file_name
            console.h1("Processing", file_path)
            x_arr, y_arr, style_arr = generate_data_arrs(file_path)
            console.stats(x_arr, "x_arr")
            console.stats(y_arr, "y_arr")
            console.stats(style_arr, "style_arr")
            # for debugging, also save normalized previews as images
            io.imsave(processed_file_path_x + ".jpg", x_arr / x_arr.max())
            io.imsave(processed_file_path_y + ".jpg", y_arr / y_arr.max())
            np.save(processed_file_path_x, x_arr)
            np.save(processed_file_path_y, y_arr)
            np.save(processed_file_path_style, style_arr)
        else:
            console.info("Skipping", file_path)
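# Sketch of the pathlib rewrite the todo above asks for. Assumes the same
# directory layout and helpers (generate_data_arrs, console, np); glob("*.mp3")
# replaces the endswith check, and with_suffix anchors the extension swap.
# Debug jpg previews are omitted for brevity.
from pathlib import Path

def process_acapellas(raw_dir="../data/studio_acapellas",
                      processed_dir="../data/processed"):
    raw_dir, processed_dir = Path(raw_dir), Path(processed_dir)
    for mp3_path in sorted(raw_dir.glob("*.mp3")):
        console.h1("Processing", str(mp3_path))
        x_arr, y_arr, style_arr = generate_data_arrs(str(mp3_path))
        npy_name = mp3_path.with_suffix(".npy").name
        for subdir, arr in (("x", x_arr), ("y", y_arr), ("style", style_arr)):
            np.save(processed_dir / subdir / npy_name, arr)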
console.time("fundamental to harmonics") fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps( fundamental_mask, amplitude) harmonics = sst.fundamental_to_harmonics(fundamental_freqs, fundamental_amps, amplitude) console.timeEnd("fundamental to harmonics") conversion.image_to_file(harmonics, f + ".harmonics.png") # pitch normalization haha if True: pitch_normalized_amp, pitch_normalized_phase = sst.normalize_pitch( amplitude, phase, fundamental_freqs, fundamental_amps) conversion.image_to_file(pitch_normalized_amp, f + ".pitch_normalized.png") console.stats(pitch_normalized_amp, "pitch_normalized_amp") pitch_normalized_audio = conversion.amplitude_to_audio( pitch_normalized_amp, fft_window_size=1536, phase_iterations=1, phase=pitch_normalized_phase, ) conversion.audio_to_file(pitch_normalized_audio, f + ".pitch_normalized.mp3") fundamental_audio = conversion.amplitude_to_audio(fundamental_mask, fft_window_size=1536, phase_iterations=1, phase=phase) conversion.audio_to_file(fundamental_audio, f + ".fundamental.mp3")
#!/usr/bin/env python
import console
import conversion
import sst
import matplotlib.pyplot as plt
import skimage.io as io
import numpy as np
import ipdb
from sst import extract_fundamental

test_files = ["sample/rolling_in_the_deep/reference_stylized.mp3"]

for f in test_files:
    console.log("starting", f)
    audio, sample_rate = conversion.file_to_audio(f)
    amplitude, phase = conversion.audio_to_spectrogram(audio, fft_window_size=1536)
    console.stats(phase)
    # map the first phase channel from [-2, 2] onto [0, 1] and save it
    # through the plasma colormap for inspection
    io.imsave("phase.png",
              plt.get_cmap("plasma")((np.clip(phase[:, :, 0], -2, 2) + 2) / 4))
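# The phase image above is just a clip-and-rescale into [0, 1]; factored out
# as a reusable helper (cmap and range defaults taken from the call above):
def colorize(arr, cmap="plasma", lo=-2.0, hi=2.0):
    normed = (np.clip(arr, lo, hi) - lo) / (hi - lo)  # map [lo, hi] -> [0, 1]
    return plt.get_cmap(cmap)(normed)  # RGBA float array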
console.time("super resolution") content_fundamental_mask = sst.extract_fundamental(content_amplitude) content_fundamental_freqs, content_fundamental_amps = sst.extract_fundamental_freqs_amps( content_fundamental_mask, content_amplitude) content_sibilants = sst.get_sibilants(content_amplitude, content_fundamental_amps) conversion.image_to_file(content_sibilants, test_content_file + ".sibilants.jpg") console.log("finished sibilants") content_harmonics = sst.fundamental_to_harmonics(content_fundamental_freqs, content_fundamental_amps, content_amplitude) content_harmonics = dilation(content_harmonics) content_sibilants *= content_amplitude.max() / content_sibilants.max() console.stats(content_sibilants, "content sibilants") content_harmonics *= content_amplitude.max() / content_harmonics.max() console.stats(content_harmonics, "content harmonics") console.timeEnd("super resolution") console.time("frequency weighting") # ELEMENT 1: Frequency weighting for t in range(num_timesteps): content_slice = np.maximum( content_amplitude[:, t], np.maximum(content_harmonics[:, t], content_sibilants[:, t])) style_slice = style_amplitude[:, t, :] content_env = sst.spectral_envelope(content_slice) style_env = sst.spectral_envelope(style_slice) weights = np.clip(style_env / (0.001 + content_env), 0, 5) stylized_amplitude[:, t, :] = content_slice * weights[:, np.newaxis]