Example #1
0
def get_feature_array(file_path):
    """Run the external DVC feature-extraction step on *file_path*.

    The call writes a .npy array to a fixed temp location; that array is
    loaded back, logged via console.stats, and returned.
    """
    tmp_npy_path = "/tmp/features.npy"
    dvc_call(file_path, tmp_npy_path)

    features = np.load(tmp_npy_path)
    console.stats(features, "network output")
    return features
def main(_):
    """Preprocess raw acapella mp3s into .npy arrays for training.

    Walks RAW_DATA_DIR, converts each .mp3 into (x, y, style) arrays via
    generate_data_arrs, saves the arrays under PROCESSED_DATA_DIR, and
    writes peak-normalized JPEG previews of x and y for debugging.

    NOTE(review): everything from console.time("fundamental to harmonics")
    onward refers to names (fundamental_mask, amplitude, phase, f) that are
    never defined in this function or in the visible part of this file —
    that tail looks pasted in from another script and would raise NameError
    if reached. Confirm against the original source.
    """
    initialize_globals()

    # scrape data from folder
    RAW_DATA_DIR = "../data/studio_acapellas"
    PROCESSED_DATA_DIR = "../data/processed"
    # for each one, generate the data using sst methods, and save the data
    for file_name in os.listdir(RAW_DATA_DIR):
        file_path = os.path.join(RAW_DATA_DIR, file_name)
        if file_path.endswith(".mp3"):
            # NOTE(review): this replaces *every* "mp3" substring, not just
            # the extension (e.g. "my_mp3_take.mp3" -> "my_npy_take.npy").
            processed_file_name = file_name.replace("mp3", "npy")  # haha
            # todo: rewrite all this using pathlib
            processed_file_path_x = PROCESSED_DATA_DIR + "/x/" + processed_file_name
            processed_file_path_y = PROCESSED_DATA_DIR + "/y/" + processed_file_name
            console.h1("Processing", file_path)
            processed_file_path_style = PROCESSED_DATA_DIR + "/style/" + processed_file_name
            x_arr, y_arr, style_arr = generate_data_arrs(file_path)
            # for debugging just save as images
            console.stats(x_arr, "x_arr")
            console.stats(y_arr, "y_arr")
            console.stats(style_arr, "style_arr")
            #ipdb.set_trace()
            # Divide by the max so the JPEG previews use the full value range.
            io.imsave(processed_file_path_x + ".jpg", x_arr / x_arr.max())
            io.imsave(processed_file_path_y + ".jpg", y_arr / y_arr.max())
            # Paths already end in ".npy" thanks to the replace above.
            np.save(processed_file_path_x, x_arr)
            np.save(processed_file_path_y, y_arr)
            np.save(processed_file_path_style, style_arr)
        else:
            console.info("Skipping", file_path)
    console.time("fundamental to harmonics")
    # NOTE(review): fundamental_mask and amplitude are undefined at this point.
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    harmonics = sst.fundamental_to_harmonics(fundamental_freqs,
                                             fundamental_amps, amplitude)
    console.timeEnd("fundamental to harmonics")
    conversion.image_to_file(harmonics, f + ".harmonics.png")

    # pitch normalization haha
    # NOTE(review): `if True:` is a leftover debug toggle for this section.
    if True:
        pitch_normalized_amp, pitch_normalized_phase = sst.normalize_pitch(
            amplitude, phase, fundamental_freqs, fundamental_amps)
        conversion.image_to_file(pitch_normalized_amp,
                                 f + ".pitch_normalized.png")
        console.stats(pitch_normalized_amp, "pitch_normalized_amp")
        pitch_normalized_audio = conversion.amplitude_to_audio(
            pitch_normalized_amp,
            fft_window_size=1536,
            phase_iterations=1,
            phase=pitch_normalized_phase,
        )
        conversion.audio_to_file(pitch_normalized_audio,
                                 f + ".pitch_normalized.mp3")

    # Render just the fundamental (mask) back to audio for inspection.
    fundamental_audio = conversion.amplitude_to_audio(fundamental_mask,
                                                      fft_window_size=1536,
                                                      phase_iterations=1,
                                                      phase=phase)
    conversion.audio_to_file(fundamental_audio, f + ".fundamental.mp3")
#!/usr/bin/env python
import console
import conversion
import sst
import matplotlib.pyplot as plt
import skimage.io as io
import numpy as np
import ipdb

from sst import extract_fundamental

# Smoke-test input(s) for the spectrogram / phase-visualization pipeline.
test_files = ["sample/rolling_in_the_deep/reference_stylized.mp3"]

for f in test_files:
    console.log("starting", f)
    audio, sample_rate = conversion.file_to_audio(f)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    console.stats(phase)
    # Clamp the first phase channel to [-2, 2], rescale into [0, 1], then
    # colorize with the "plasma" colormap and dump a debug image.
    clipped = np.clip(phase[:, :, 0], -2, 2)
    normalized = (clipped + 2) / 4
    io.imsave("phase.png", plt.get_cmap("plasma")(normalized))
Example #5
0
console.time("super resolution")
# Rebuild harmonics and sibilants for the content track from its fundamental.
content_fundamental_mask = sst.extract_fundamental(content_amplitude)
content_fundamental_freqs, content_fundamental_amps = sst.extract_fundamental_freqs_amps(
    content_fundamental_mask, content_amplitude)
content_sibilants = sst.get_sibilants(content_amplitude,
                                      content_fundamental_amps)
conversion.image_to_file(content_sibilants,
                         test_content_file + ".sibilants.jpg")
console.log("finished sibilants")
content_harmonics = dilation(
    sst.fundamental_to_harmonics(content_fundamental_freqs,
                                 content_fundamental_amps,
                                 content_amplitude))

# Rescale both reconstructions so their peaks match the content peak.
content_peak = content_amplitude.max()
content_sibilants *= content_peak / content_sibilants.max()
console.stats(content_sibilants, "content sibilants")
content_harmonics *= content_peak / content_harmonics.max()
console.stats(content_harmonics, "content harmonics")
console.timeEnd("super resolution")

console.time("frequency weighting")
# ELEMENT 1: Frequency weighting
for t in range(num_timesteps):
    # Per-bin max of the raw content, its harmonics, and its sibilants.
    content_slice = np.maximum.reduce([
        content_amplitude[:, t],
        content_harmonics[:, t],
        content_sibilants[:, t],
    ])
    style_slice = style_amplitude[:, t, :]
    content_env = sst.spectral_envelope(content_slice)
    style_env = sst.spectral_envelope(style_slice)
    # Push each frequency toward the style envelope, capped at a 5x boost.
    weights = np.clip(style_env / (0.001 + content_env), 0, 5)
    stylized_amplitude[:, t, :] = content_slice * weights[:, np.newaxis]