Example #1
def main(_):
    # REVIEW josephz: This paradigm was copied from inference-hack.py
    # initialize_globals()

    sample_dir = "sample"
    # Alternative test samples:
    # sample_names = ["new_test"]
    # sample_names = ["perfect_features"]
    # sample_names = ["rolling_in_the_one_more_time"]
    sample_names = ["rolling_in_the_deep"]
    post_processor = PostProcessor()
    post_processor.load_weights("weights.h5")
    for sample_name in sample_names:
        console.h1("Processing %s" % sample_name)
        console.time("total processing for " + sample_name)
        sample_path = sample_dir + "/" + sample_name

        style_path = sample_path + "/style.mp3"
        content_path = sample_path + "/content.mp3"
        stylized_img_path = sample_path + "/stylized.png"
        stylized_img_raw_path = sample_path + "/stylized_raw.png"
        stylized_audio_path = sample_path + "/stylized.mp3"
        stylized_audio_raw_path = sample_path + "/stylized_raw.mp3"

        # Read style audio to spectrograms.
        style_audio, style_sample_rate = conversion.file_to_audio(style_path)
        style_img, style_phase = conversion.audio_to_spectrogram(
            style_audio, fft_window_size=1536)

        # Read content audio to spectrograms.
        content_audio, content_sample_rate = conversion.file_to_audio(
            content_path)
        content_img, content_phase = conversion.audio_to_spectrogram(
            content_audio, fft_window_size=1536)
        stylized_img_raw, stylized_img = stylize(content_img, style_img,
                                                 content_phase, style_phase,
                                                 content_path, style_path,
                                                 post_processor)

        # Save raw stylized spectrogram and audio.
        stylized_audio_raw = conversion.amplitude_to_audio(
            stylized_img_raw,
            fft_window_size=1536,
            phase_iterations=15,
            phase=content_phase)
        conversion.image_to_file(stylized_img_raw, stylized_img_raw_path)
        conversion.audio_to_file(stylized_audio_raw, stylized_audio_raw_path)

        # Save post-processed stylized spectrogram and audio.
        stylized_audio = conversion.amplitude_to_audio(stylized_img,
                                                       fft_window_size=1536,
                                                       phase_iterations=15,
                                                       phase=content_phase)
        # np.save("stylized_img.npy", stylized_img)
        # np.save("content_phase.npy", content_phase)
        conversion.image_to_file(stylized_img, stylized_img_path)
        conversion.audio_to_file(stylized_audio, stylized_audio_path)

        console.timeEnd("total processing for " + sample_name)
        console.info("Finished processing %s; saved to %s" %
                     (sample_name, stylized_audio_path))
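
# The main(_) signature and the commented-out initialize_globals() suggest a
# tf.app.run / absl.app.run style entry point; the actual runner is not part
# of this excerpt. A minimal sketch, assuming absl.app (an assumption, since
# the original runner copied from inference-hack.py is not shown):
if __name__ == "__main__":
    from absl import app
    app.run(main)  # absl passes argv to main, matching the unused `_` arg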
Example #2
def extract_fundamental(amplitude):
    fundamental = np.zeros(amplitude.shape)
    # TODO: vectorize this into a single numpy operation instead of the
    # per-timestep Python loop (a vectorized sketch follows this function).
    f_band_min = -4
    f_band_max = 8
    f_band_len = f_band_max - f_band_min
    # Triangular window of coefficients over a small band around the
    # fundamental bin: 1.0 at the peak, falling off linearly on both sides.
    f_band_coeffs = (1 - np.concatenate(
        (np.array(range(f_band_min, 0)) / f_band_min,
         np.array(range(f_band_max)) / f_band_max)))[:, np.newaxis]
    # NOTE: peak_finder is defined but never used below.
    peak_finder = np.array([-0.5, -0.5, 2, -0.5, -0.5])[:, np.newaxis].T
    console.time("big loop")
    # Fundamental candidate per timestep: the strongest of the lowest 50 bins.
    freqs = np.argmax(np.mean(amplitude[:50], axis=2), axis=0)
    # console.stats(freqs)
    for t in range(amplitude.shape[1]):
        f = freqs[t]
        # If the bin an octave down also carries energy, the argmax probably
        # landed on the 2nd harmonic; step down to the true fundamental.
        if np.mean(amplitude[f // 2, t]) > 0.4 * np.mean(amplitude[f, t]):
            f = f // 2
            freqs[t] = f
        if f > 5:
            # Copy the triangular band around the fundamental into the mask.
            f_min = f + f_band_min
            f_max = f + f_band_max
            fundamental[f_min:f_max,
                        t] = f_band_coeffs * amplitude[f_min:f_max, t]

    console.timeEnd("big loop")
    console.time("remove dots")
    mask = (grey_dilation(grey_erosion(fundamental,
                                       structure=np.ones((3, 5, 1))),
                          structure=np.ones((6, 12, 1))) > 0.1)
    console.timeEnd("remove dots")
    fundamental *= mask
    return fundamental
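
# The TODO above asks for the loop to become one big numpy operation. A
# minimal vectorized sketch of just the loop (the morphological masking step
# is unchanged); untested against the original, so treat it as a starting
# point rather than a drop-in replacement:
def extract_fundamental_vectorized(amplitude, f_band_min=-4, f_band_max=8):
    num_freqs, num_timesteps, _ = amplitude.shape
    mean_amp = np.mean(amplitude, axis=2)           # (freq, time)
    freqs = np.argmax(mean_amp[:50], axis=0)        # strongest low bin per t
    t_idx = np.arange(num_timesteps)
    # Octave correction: if the bin an octave down carries enough energy,
    # the argmax probably landed on the 2nd harmonic.
    is_harmonic = mean_amp[freqs // 2, t_idx] > 0.4 * mean_amp[freqs, t_idx]
    freqs = np.where(is_harmonic, freqs // 2, freqs)
    # Triangular coefficients over the band around the fundamental.
    band = 1 - np.concatenate((np.arange(f_band_min, 0) / f_band_min,
                               np.arange(f_band_max) / f_band_max))
    offsets = np.arange(f_band_min, f_band_max)
    rows = freqs[np.newaxis, :] + offsets[:, np.newaxis]   # (band, time)
    cols = np.broadcast_to(t_idx, rows.shape)
    band_vals = band[:, np.newaxis, np.newaxis] * amplitude[rows, cols]
    fundamental = np.zeros_like(amplitude)
    valid = freqs > 5
    fundamental[rows[:, valid], cols[:, valid]] = band_vals[:, valid]
    return fundamental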
Example #3
    for txt_path in progress_bar:
        paper_id = str(txt_path.name).replace(ext, "")
        txt_size_bytes = txt_path.stat().st_size
        if min_size_bytes < txt_size_bytes < max_size_bytes:  # filter out the theses
            txt_paths.append(str(txt_path))
            pids.append(paper_id)
        else:
            progress_bar.set_description("skipped %s with %d bytes" %
                                         (paper_id, txt_size_bytes))
    print("in total read in %d text files out of %d possible." %
          (len(txt_paths), len(txt_paths_all)))
    return txt_paths, pids


print("getting valid papers")
console.time("get valid papers")
txt_paths, pids = get_valid_papers()
console.time_end("get valid papers")

# Compute TF-IDF vectors with scikit-learn.
v = TfidfVectorizer(
    input="content",
    encoding="utf-8",
    decode_error="replace",
    strip_accents="ascii",  # DO NOT USE "unicode"; it is very slow
    lowercase=True,
    analyzer="word",
    stop_words="english",
    token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b",
    ngram_range=(1, 2),
    max_features=max_features,
)
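
# Because the vectorizer is built with input="content", callers read each
# file themselves and pass raw strings. A minimal usage sketch (any batching
# or caching in the original script is not shown in this excerpt):
def read_text(path):
    with open(path, encoding="utf-8", errors="replace") as f:
        return f.read()

X = v.fit_transform(read_text(p) for p in txt_paths)
# X is a sparse (num_docs, max_features) TF-IDF matrix.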
Example #4
def stylize(content, style, content_phase, style_phase, content_path,
            style_path, post_processor):
    stylized = content
    # Pitch fundamental extraction
    console.time("extracting fundamentals")
    content_fundamental_mask = extract_fundamental(content)
    style_fundamental_mask = extract_fundamental(style)
    console.timeEnd("extracting fundamentals")
    console.time("fundamental freqs and amps")
    content_fundamental_freqs, content_fundamental_amps = extract_fundamental_freqs_amps(
        content_fundamental_mask, content)
    style_fundamental_freqs, style_fundamental_amps = extract_fundamental_freqs_amps(
        style_fundamental_mask, style)
    console.timeEnd("fundamental freqs and amps")

    if True:  # Debug toggle: also write pitch-normalized audio to disk.
        console.time("pitch normalization")
        content_normalized, content_normalized_phase = normalize_pitch(
            content,
            content_phase,
            content_fundamental_freqs,
            content_fundamental_amps,
            base_pitch=32)
        style_normalized, style_normalized_phase = normalize_pitch(
            style,
            style_phase,
            style_fundamental_freqs,
            style_fundamental_amps,
            base_pitch=32)
        content_normalized_path = content_path + ".normalized.mp3"
        content_normalized_audio = conversion.amplitude_to_audio(
            content_normalized,
            fft_window_size=1536,
            phase_iterations=1,
            phase=content_normalized_phase)
        conversion.audio_to_file(content_normalized_audio,
                                 content_normalized_path)

        style_normalized_path = style_path + ".normalized.mp3"
        style_normalized_audio = conversion.amplitude_to_audio(
            style_normalized,
            fft_window_size=1536,
            phase_iterations=1,
            phase=style_normalized_phase)
        conversion.audio_to_file(style_normalized_audio, style_normalized_path)

        console.timeEnd("pitch normalization")

    # Featurization
    use_spectral_features = False
    if use_spectral_features:
        # Spectral features computed directly from the spectrograms.
        content_features = compute_features(content)
        style_features = compute_features(style)
    else:
        # Neural features extracted from the audio files.
        content_features = get_feature_array(content_path)
        content_features /= content_features.max()
        # console.stats(content_features, "content features")
        # conversion.image_to_file(content_features[:,:,np.newaxis], "content_features.png")
        # console.debug(content.shape, "content.shape")
        content_features = resize(
            content_features, (content_features.shape[0], content.shape[1]))
        style_features = get_feature_array(style_path)
        style_features /= style_features.max()
        # console.stats(style_features, "style features")
        # console.debug(style.shape, "style.shape")
        # conversion.image_to_file(style_features[:,:,np.newaxis], "style_features.png")
        style_features = resize(style_features,
                                (style_features.shape[0], style.shape[1]))

    # Harmonic recovery
    content_harmonics = fundamental_to_harmonics(content_fundamental_freqs,
                                                 content_fundamental_amps,
                                                 content)
    content_harmonics = grey_dilation(content_harmonics, size=3)
    content_harmonics *= content.max() / content_harmonics.max()
    # Sibilant recovery
    content_sibilants = get_sibilants(content, content_fundamental_amps)
    content_sibilants *= content.max() / content_sibilants.max()

    # Patchmatch
    console.time("patch match")
    use_patch_rescale = False  # Toggle between patch rescale and patch match.
    if use_patch_rescale:
        stylized = audio_patch_rescale(
            content,
            style,
            content_fundamental_freqs,
            style_fundamental_freqs,
            content_features,
            style_features,
            content_harmonics,
            content_sibilants,
        )
    else:
        stylized = audio_patch_match(content,
                                     style,
                                     content_fundamental_freqs,
                                     style_fundamental_freqs,
                                     content_features,
                                     style_features,
                                     iterations=96)
    console.timeEnd("patch match")
    console.log("normal stylized has shape", stylized.shape)
    # ipdb.set_trace()
    stylized_post_processed = post_processor.predict_unstacked(
        amplitude=np.mean(stylized, axis=2),
        harmonics=np.mean(content_harmonics, axis=2),
        sibilants=np.mean(content_sibilants, axis=2))
    stylized_post_processed = np.dstack([
        stylized_post_processed, stylized_post_processed
    ])  # TODO: actually run the network on both channels instead of doing this
    stylized_post_processed = global_eq_match(stylized_post_processed, style)
    return stylized, stylized_post_processed
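
# audio_patch_match itself is not defined anywhere in these excerpts. Purely
# as an illustration of the idea, here is a brute-force nearest-neighbour
# match over time frames; real PatchMatch uses randomized search with
# iterative propagation rather than this exhaustive scan, and the actual
# function above also takes fundamental frequencies and an iteration count
# that this sketch ignores:
def brute_force_frame_match(content_features, style_features, style):
    # Features are assumed to be (feature_dim, time); style is
    # (freq, time, channels), as elsewhere in this file.
    diff = (content_features[:, :, np.newaxis] -
            style_features[:, np.newaxis, :])      # (dim, Tc, Ts)
    dists = np.sum(diff ** 2, axis=0)              # (Tc, Ts)
    nearest = np.argmin(dists, axis=1)             # best style frame per t
    return style[:, nearest]                       # (freq, Tc, channels)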
Example #5
#!/usr/bin/env python
import console
import conversion
import numpy as np
import sst
import ipdb

test_files = [
    "../data/aligned/one_last_time/one_last_time_cover_aligned_30s.mp3",
    "../data/aligned/one_last_time/one_last_time_original_30s.mp3"
]
# test_files = ["sample/rolling_in_the_deep/style.mp3"]

for f in test_files:
    console.time("preprocessing")
    console.log("starting", f)
    audio, sample_rate = conversion.file_to_audio(f)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    console.timeEnd("preprocessing")
    console.time("extracting fundamental")
    fundamental_mask = sst.extract_fundamental(amplitude)
    console.timeEnd("extracting fundamental")
    conversion.image_to_file(fundamental_mask, f + ".fundamental.png")

    console.time("fundamental to harmonics")
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    harmonics = sst.fundamental_to_harmonics(fundamental_freqs,
                                             fundamental_amps, amplitude)
    console.timeEnd("fundamental to harmonics")
Example #6
        np.random.shuffle(self.pairs)
        console.log("Loaded", len(self.pairs), "pairs")
        console.log("Shape of first pair [", self.pairs[0][1], "] is",
                    self.pairs[0][0].shape, self.pairs[0][2].shape)

    def on_epoch_end(self):
        np.random.shuffle(self.pairs)

    def __len__(self):
        # Fixed number of batches per epoch, independent of len(self.pairs).
        return 64

    def __getitem__(self, index):
        # Wrap the requested index onto the number of available batches.
        max_index = int(np.floor(len(self.pairs) / self.batch_size))
        index %= max_index
        x = []
        y = []
        style = []
        for b in range(self.batch_size):
            x_i, file_name, y_i, f_min, f_max = self.pairs[index *
                                                           self.batch_size + b]
            x.append(x_i)
            y.append(y_i)
            style.append(self.style_inputs[file_name][:, f_min:f_max])
        return [np.array(x), np.array(style)], np.array(y)


if __name__ == "__main__":
    console.time("loading all data")
    d = DataGenerator()
    console.timeEnd("loading all data")
Example #7
#!/usr/bin/env python
import console
import conversion
import numpy as np
import sst
import ipdb
from skimage.morphology import dilation

# A test of the upper bound: what we could get if we perfectly matched each
# element of the style.

test_content_file = "sample/rolling_in_the_deep/content.mp3"
test_style_file = "sample/rolling_in_the_deep/reference_stylized.mp3"

# Load them both as spectrograms
console.time("preprocessing")
content_audio, content_sample_rate = conversion.file_to_audio(
    test_content_file)
content_amplitude, content_phase = conversion.audio_to_spectrogram(
    content_audio, fft_window_size=1536)
style_audio, style_sample_rate = conversion.file_to_audio(test_style_file)
style_amplitude, style_phase = conversion.audio_to_spectrogram(
    style_audio, fft_window_size=1536)
console.timeEnd("preprocessing")

stylized_amplitude = np.zeros(content_amplitude.shape)

num_freq, num_timesteps, _ = content_amplitude.shape
num_timesteps = min(num_timesteps, style_amplitude.shape[1])

# Preprocessing - compute fundamentals and harmonics
console.time("super resolution")