Example no. 1
def main(_):
    # REVIEW josephz: This paradigm was copied from inference-hack.py
    # initialize_globals()

    sample_dir = "sample"
    # sample_names = ["new_test"]
    sample_names = ["rolling_in_the_deep"]
    post_processor = PostProcessor()
    post_processor.load_weights("weights.h5")
    # sample_names = ["perfect_features"]
    # sample_names = ["rolling_in_the_one_more_time"]
    for sample_name in sample_names:
        console.h1("Processing %s" % sample_name)
        console.time("total processing for " + sample_name)
        sample_path = sample_dir + "/" + sample_name

        style_path = sample_path + "/style.mp3"
        content_path = sample_path + "/content.mp3"
        stylized_img_path = sample_path + "/stylized.png"
        stylized_img_raw_path = sample_path + "/stylized_raw.png"
        stylized_audio_path = sample_path + "/stylized.mp3"
        stylized_audio_raw_path = sample_path + "/stylized_raw.mp3"

        # Read style audio to spectrograms.
        style_audio, style_sample_rate = conversion.file_to_audio(style_path)
        style_img, style_phase = conversion.audio_to_spectrogram(
            style_audio, fft_window_size=1536)

        # Read content audio to spectrograms.
        content_audio, content_sample_rate = conversion.file_to_audio(
            content_path)
        content_img, content_phase = conversion.audio_to_spectrogram(
            content_audio, fft_window_size=1536)
        stylized_img_raw, stylized_img = stylize(content_img, style_img,
                                                 content_phase, style_phase,
                                                 content_path, style_path,
                                                 post_processor)

        # Save raw stylized spectrogram and audio.
        stylized_audio_raw = conversion.amplitude_to_audio(
            stylized_img_raw,
            fft_window_size=1536,
            phase_iterations=15,
            phase=content_phase)
        conversion.image_to_file(stylized_img_raw, stylized_img_raw_path)
        conversion.audio_to_file(stylized_audio_raw, stylized_audio_raw_path)

        # Save post-processed stylized spectrogram and audio.
        stylized_audio = conversion.amplitude_to_audio(stylized_img,
                                                       fft_window_size=1536,
                                                       phase_iterations=15,
                                                       phase=content_phase)
        # np.save("stylized_img.npy", stylized_img)
        # np.save("content_phase.npy", content_phase)
        conversion.image_to_file(stylized_img, stylized_img_path)
        conversion.audio_to_file(stylized_audio, stylized_audio_path)

        console.timeEnd("total processing for " + sample_name)
        console.info("Finished processing %s; saved to %s" %
                     (sample_name, stylized_audio_path))
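
The main(_) signature, with a single ignored argument, matches the entry-point convention of TensorFlow 1.x-style flag runners. A minimal dispatch sketch, assuming tf.app.run is available (absl.app.run would be the equivalent in newer code), might look like this:

if __name__ == "__main__":
    # Assumption: main(_) is launched through a flags runner that parses argv
    # and then calls main with whatever is left over.
    import tensorflow as tf
    tf.app.run(main)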
Example no. 2
def generate_data_arrs(file_path, slice_size_t=1536):
    audio, sr = conversion.file_to_audio(file_path)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    amplitude = amplitude[:, :2 * slice_size_t]

    # clipping only the first part to minimize "easy" repeated audio
    content = amplitude[:, :slice_size_t]
    style = amplitude[:, slice_size_t:2 * slice_size_t]
    freq_range = [np.min(content, 1), np.max(content, 1)]  # per-bin amplitude range (currently unused)

    console.log("Content shape", content.shape)
    console.log("Style shape", style.shape)
    # it's a lot of work to compute the x...
    fundamental_mask = sst.extract_fundamental(amplitude)
    #console.stats(fundamental_mask, "fundamental_mask")
    #console.stats(amplitude, "amplitude")
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    content_fundamental_freqs = fundamental_freqs[:slice_size_t]
    content_fundamental_amps = fundamental_amps[:slice_size_t]
    style_fundamental_freqs = fundamental_freqs[slice_size_t:2 * slice_size_t]
    # features are computed directly and then sliced
    features = sst.get_feature_array(file_path) / 5
    features = sst.resize(features, (2048, amplitude.shape[1]))
    content_features = features[:, :slice_size_t]
    style_features = features[:, slice_size_t:2 * slice_size_t]
    stylized = sst.audio_patch_match(content,
                                     style,
                                     content_fundamental_freqs,
                                     style_fundamental_freqs,
                                     content_features,
                                     style_features,
                                     iterations=10)
    # Harmonic recovery
    content_harmonics = sst.fundamental_to_harmonics(content_fundamental_freqs,
                                                     content_fundamental_amps,
                                                     content)
    content_harmonics = sst.grey_dilation(content_harmonics, size=3)
    content_harmonics *= content.max() / content_harmonics.max()
    # Sibilant recovery
    content_sibilants = sst.get_sibilants(content, content_fundamental_amps)
    content_sibilants *= content.max() / content_sibilants.max()

    # Collapse stereo to mono and stack the three cues (patch-matched result,
    # recovered harmonics, recovered sibilants) as input channels.
    x_arr = np.dstack([
        np.mean(stylized, axis=2),
        np.mean(content_harmonics, axis=2),
        np.mean(content_sibilants, axis=2)
    ])
    y_arr = np.mean(content, axis=2)
    style_arr = np.mean(style, axis=2)
    return x_arr, y_arr, style_arr
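
A sketch of how generate_data_arrs might be driven to build a training set; the glob pattern and the stacking into batch arrays below are illustrative assumptions, not part of the original code:

import glob
import numpy as np

# Hypothetical driver: one (x, y) pair per file, stacked into batch arrays.
x_list, y_list = [], []
for path in glob.glob("sample/*/content.mp3"):
    x_arr, y_arr, style_arr = generate_data_arrs(path)
    x_list.append(x_arr)
    y_list.append(y_arr)
x_batch = np.stack(x_list)                   # (n, freq, time, 3)
y_batch = np.stack(y_list)[..., np.newaxis]  # (n, freq, time, 1)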
Example no. 3
    def isolate_vocals(self, path, fft_window_size, phase_iterations=10):
        console.log("Attempting to isolate vocals from", path)
        start_time = time.time()
        audio, sample_rate = conversion.load_audio(path)
        spectrogram, phase = conversion.audio_to_spectrogram(
            audio, fft_window_size=fft_window_size, sr=sample_rate)
        # spectrogram, phase = conversion.isolate_vocal_simple(audio, fft_window_size=fft_window_size, sr=sample_rate)
        console.log("Retrieved spectrogram; processing...")

        expanded_spectrogram = conversion.expand_to_grid(
            spectrogram, self.peak_downscale_factor)
        expanded_spectrogram_with_batch_channels = expanded_spectrogram[
            np.newaxis, :, :, np.newaxis]
        predicted_spectrogram_with_batch_channels = self.model.predict(
            expanded_spectrogram_with_batch_channels)
        predicted_spectrogram = predicted_spectrogram_with_batch_channels[
            0, :, :, 0]  # drop the batch and channel axes
        new_spectrogram = predicted_spectrogram[:spectrogram.shape[0],
                                                :spectrogram.shape[1]]
        console.log("Processed spectrogram; reconverting to audio")

        new_audio = conversion.spectrogram_to_audio(
            new_spectrogram,
            fft_window_size=fft_window_size,
            phase_iterations=phase_iterations)
        path_parts = os.path.split(path)
        filename_parts = os.path.splitext(path_parts[1])
        output_filename_base = os.path.join(path_parts[0],
                                            filename_parts[0] + "_acapella")
        console.log("Converted to audio; writing to", output_filename_base)

        conversion.save_audio(new_audio, output_filename_base + ".wav",
                              sample_rate)
        conversion.save_spectrogram(new_spectrogram,
                                    output_filename_base + ".png")
        conversion.save_spectrogram(
            spectrogram,
            os.path.join(path_parts[0], filename_parts[0]) + ".png")
        # console.log("Vocal isolation complete 👌")
        print('execution time: {}'.format(time.time() - start_time))
        return new_audio
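
A usage sketch for isolate_vocals; the class name, constructor, and weight loading below are assumptions for illustration, since the enclosing class is not shown in this excerpt:

# Hypothetical usage; the real class and weight-loading API may differ.
bot = AcapellaBot()                    # assumed to build self.model
bot.model.load_weights("weights.h5")   # assumed weights file
vocals = bot.isolate_vocals("song.mp3", fft_window_size=1536)
# Side effects: writes song_acapella.wav, song_acapella.png, and song.png
# next to the input file, then returns the isolated vocal audio.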
Example no. 4
    def load(self, as_h5=False):
        h5_path = os.path.join(self.inPath, "data.h5")
        if os.path.isfile(h5_path):
            h5f = h5py.File(h5_path, "r")
            self.x = h5f["x"][:]
            self.y = h5f["y"][:]
        else:
            acapellas = {}
            instrumentals = {}
            # Hash bins for each Camelot key so we can merge them later.
            # In the future this should be a generator with yields, to use less memory.
            for i in range(NUMBER_OF_KEYS):
                key = i + 1
                acapellas[key] = []
                instrumentals[key] = []
            for dir_path, dir_names, file_names in os.walk(self.inPath):
                for file_name in filter(
                        lambda f: (f.endswith(".mp3") or f.endswith(".wav"))
                        and not f.startswith("."), file_names):
                    key = key_of_file(file_name)
                    if key:
                        target_path_map = acapellas if file_is_acapella(
                            file_name) else instrumentals
                        tag = "[Acapella]" if file_is_acapella(
                            file_name) else "[Instrumental]"
                        audio, sample_rate = conversion.load_audio(
                            os.path.join(self.inPath, file_name))
                        spectrogram, phase = conversion.audio_to_spectrogram(
                            audio, self.fft_window_size, sr=sample_rate)
                        target_path_map[key].append(spectrogram)
                        console.info(tag, "Created spectrogram for", file_name,
                                     "in key", key, "with shape",
                                     spectrogram.shape)
            # Merge acapellas and instrumentals into mashups, key by key.
            for k in range(NUMBER_OF_KEYS):
                acapellas_in_key = acapellas[k + 1]
                instrumentals_in_key = instrumentals[k + 1]
                count = 0
                for acapella in acapellas_in_key:
                    for instrumental in instrumentals_in_key:
                        # Zero-pad the shorter spectrogram so both have the same shape.
                        if instrumental.shape[1] < acapella.shape[1]:
                            new_instrumental = np.zeros(acapella.shape)
                            new_instrumental[:instrumental.shape[0],
                                             :instrumental.shape[1]] = instrumental
                            instrumental = new_instrumental
                        elif acapella.shape[1] < instrumental.shape[1]:
                            new_acapella = np.zeros(instrumental.shape)
                            new_acapella[:acapella.shape[0],
                                         :acapella.shape[1]] = acapella
                            acapella = new_acapella
                        # Simulate a limiter/low mixing (loses info, but that's the point).
                        # I've tested this against making the same mashups in Logic and it's pretty close.
                        mashup = np.maximum(acapella, instrumental)
                        # Chop into slices so everything's the same size in a batch.
                        dim = SLICE_SIZE
                        mashup_slices = chop(mashup, dim)
                        acapella_slices = chop(acapella, dim)
                        count += 1
                        self.x.extend(mashup_slices)
                        self.y.extend(acapella_slices)
                console.info("Created", count, "mashups for key", k + 1, "with",
                             len(self.x), "total slices so far")
            # Add a "channels" channel to please the network.
            self.x = np.array(self.x)[:, :, :, np.newaxis]
            self.y = np.array(self.y)[:, :, :, np.newaxis]
            # Save to file if asked.
            if as_h5:
                h5f = h5py.File(h5_path, "w")
                h5f.create_dataset("x", data=self.x)
                h5f.create_dataset("y", data=self.y)
                h5f.close()
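
A sketch of how this loader might be used; the class name and the attribute initialization are assumptions, but the attribute names mirror those referenced inside load():

# Hypothetical driver for the loader above.
data = Data()                     # assumed constructor
data.inPath = "../data/mashups"   # folder of key-tagged .mp3/.wav files
data.fft_window_size = 1536
data.x, data.y = [], []           # load() extends these before converting to arrays
data.load(as_h5=True)             # builds mashup/acapella slices and caches them to data.h5
console.info("x shape:", data.x.shape, "y shape:", data.y.shape)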
Example no. 5
import console
import conversion
import numpy as np
import sst
import ipdb

test_files = [
    "../data/aligned/one_last_time/one_last_time_cover_aligned_30s.mp3",
    "../data/aligned/one_last_time/one_last_time_original_30s.mp3"
]
#test_files = ["sample/rolling_in_the_deep/style.mp3"]

for f in test_files:
    console.time("preprocessing")
    console.log("starting", f)
    audio, sample_rate = conversion.file_to_audio(f)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    console.timeEnd("preprocessing")
    console.time("extracting fundamental")
    fundamental_mask = sst.extract_fundamental(amplitude)
    console.timeEnd("extracting fundamental")
    conversion.image_to_file(fundamental_mask, f + ".fundamental.png")

    console.time("fundamental to harmonics")
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    harmonics = sst.fundamental_to_harmonics(fundamental_freqs,
                                             fundamental_amps, amplitude)
    console.timeEnd("fundamental to harmonics")
    conversion.image_to_file(harmonics, f + ".harmonics.png")

    # pitch normalization haha
Example no. 6
import console
import conversion
import numpy as np
import sst
import ipdb
from skimage.morphology import dilation

# a test of what we could get if we perfectly matched each element of style

test_content_file = "sample/rolling_in_the_deep/content.mp3"
test_style_file = "sample/rolling_in_the_deep/reference_stylized.mp3"

# Load them both as spectrograms
console.time("preprocessing")
content_audio, content_sample_rate = conversion.file_to_audio(
    test_content_file)
content_amplitude, content_phase = conversion.audio_to_spectrogram(
    content_audio, fft_window_size=1536)
style_audio, style_sample_rate = conversion.file_to_audio(test_style_file)
style_amplitude, style_phase = conversion.audio_to_spectrogram(
    style_audio, fft_window_size=1536)
console.timeEnd("preprocessing")

stylized_amplitude = np.zeros(content_amplitude.shape)

num_freq, num_timesteps, _ = content_amplitude.shape
num_timesteps = min(num_timesteps, style_amplitude.shape[1])

# Preprocessing - compute fundamentals and harmonics
console.time("super resolution")
content_fundamental_mask = sst.extract_fundamental(content_amplitude)
content_fundamental_freqs, content_fundamental_amps = sst.extract_fundamental_freqs_amps(
    content_fundamental_mask, content_amplitude)