def main(_):
    # REVIEW josephz: This paradigm was copied from inference-hack.py
    # initialize_globals()
    sample_dir = "sample"
    # sample_names = ["new_test"]
    sample_names = ["rolling_in_the_deep"]
    post_processor = PostProcessor()
    post_processor.load_weights("weights.h5")
    # sample_names = ["perfect_features"]
    # sample_names = ["rolling_in_the_one_more_time"]
    for sample_name in sample_names:
        console.h1("Processing %s" % sample_name)
        console.time("total processing for " + sample_name)
        sample_path = sample_dir + "/" + sample_name
        style_path = sample_path + "/style.mp3"
        content_path = sample_path + "/content.mp3"
        stylized_img_path = sample_path + "/stylized.png"
        stylized_img_raw_path = sample_path + "/stylized_raw.png"
        stylized_audio_path = sample_path + "/stylized.mp3"
        stylized_audio_raw_path = sample_path + "/stylized_raw.mp3"

        # Read style audio to spectrograms.
        style_audio, style_sample_rate = conversion.file_to_audio(style_path)
        style_img, style_phase = conversion.audio_to_spectrogram(
            style_audio, fft_window_size=1536)

        # Read content audio to spectrograms.
        content_audio, content_sample_rate = conversion.file_to_audio(
            content_path)
        content_img, content_phase = conversion.audio_to_spectrogram(
            content_audio, fft_window_size=1536)

        stylized_img_raw, stylized_img = stylize(content_img, style_img,
                                                 content_phase, style_phase,
                                                 content_path, style_path,
                                                 post_processor)

        # Save raw stylized spectrogram and audio.
        stylized_audio_raw = conversion.amplitude_to_audio(
            stylized_img_raw,
            fft_window_size=1536,
            phase_iterations=15,
            phase=content_phase)
        conversion.image_to_file(stylized_img_raw, stylized_img_raw_path)
        conversion.audio_to_file(stylized_audio_raw, stylized_audio_raw_path)

        # Save post-processed stylized spectrogram and audio.
        stylized_audio = conversion.amplitude_to_audio(
            stylized_img,
            fft_window_size=1536,
            phase_iterations=15,
            phase=content_phase)
        # np.save("stylized_img.npy", stylized_img)
        # np.save("content_phase.npy", content_phase)
        conversion.image_to_file(stylized_img, stylized_img_path)
        conversion.audio_to_file(stylized_audio, stylized_audio_path)

        console.timeEnd("total processing for " + sample_name)
        console.info("Finished processing %s; saved to %s" %
                     (sample_name, stylized_audio_path))
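

# For reference: a minimal sketch of the spectrogram round-trip that the
# conversion helpers above wrap, written against plain librosa. This is an
# assumption (n_fft standing in for fft_window_size, griffinlim standing in
# for the phase_iterations reconstruction), not the actual conversion
# implementation.
def _spectrogram_round_trip_sketch(path, n_fft=1536, n_iter=15):
    import librosa
    import numpy as np
    import soundfile as sf

    audio, sr = librosa.load(path, sr=None)
    spectrum = librosa.stft(audio, n_fft=n_fft)
    # The amplitude is what gets treated as an image; phase is discarded.
    amplitude = np.abs(spectrum)
    # Rebuild audio from amplitude alone, estimating phase iteratively.
    reconstructed = librosa.griffinlim(amplitude, n_iter=n_iter, n_fft=n_fft)
    sf.write(path + ".roundtrip.wav", reconstructed, sr)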
def generate_data_arrs(file_path, slice_size_t=1536):
    audio, sr = conversion.file_to_audio(file_path)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    # Clip to the first two slices to minimize "easy" repeated audio.
    amplitude = amplitude[:, :2 * slice_size_t]
    content = amplitude[:, :slice_size_t]
    style = amplitude[:, slice_size_t:2 * slice_size_t]
    freq_range = [np.min(content, 1), np.max(content, 1)]
    console.log("Content shape", content.shape)
    console.log("Style shape", style.shape)

    # It's a lot of work to compute the x...
    fundamental_mask = sst.extract_fundamental(amplitude)
    # console.stats(fundamental_mask, "fundamental_mask")
    # console.stats(amplitude, "amplitude")
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    content_fundamental_freqs = fundamental_freqs[:slice_size_t]
    content_fundamental_amps = fundamental_amps[:slice_size_t]
    style_fundamental_freqs = fundamental_freqs[slice_size_t:2 * slice_size_t]

    # Features are computed directly and then sliced.
    features = sst.get_feature_array(file_path) / 5
    features = sst.resize(features, (2048, amplitude.shape[1]))
    content_features = features[:, :slice_size_t]
    style_features = features[:, slice_size_t:2 * slice_size_t]

    stylized = sst.audio_patch_match(content, style,
                                     content_fundamental_freqs,
                                     style_fundamental_freqs,
                                     content_features, style_features,
                                     iterations=10)

    # Harmonic recovery.
    content_harmonics = sst.fundamental_to_harmonics(
        content_fundamental_freqs, content_fundamental_amps, content)
    content_harmonics = sst.grey_dilation(content_harmonics, size=3)
    content_harmonics *= content.max() / content_harmonics.max()

    # Sibilant recovery.
    content_sibilants = sst.get_sibilants(content, content_fundamental_amps)
    content_sibilants *= content.max() / content_sibilants.max()

    x_arr = np.dstack([
        np.mean(stylized, axis=2),
        np.mean(content_harmonics, axis=2),
        np.mean(content_sibilants, axis=2)
    ])
    y_arr = np.mean(content, axis=2)
    style_arr = np.mean(style, axis=2)
    return x_arr, y_arr, style_arr
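

# Hypothetical usage of generate_data_arrs (path and shapes assumed, not
# taken from the repo): the first slice of the clip acts as content, the
# second as style, and the returned arrays are channel-averaged.
#
# x_arr, y_arr, style_arr = generate_data_arrs(
#     "sample/rolling_in_the_deep/content.mp3", slice_size_t=1536)
# x_arr.shape      # (n_freq, slice_size_t, 3): stylized / harmonics / sibilants
# y_arr.shape      # (n_freq, slice_size_t): target content amplitude
# style_arr.shape  # (n_freq, slice_size_t)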
def isolate_vocals(self, path, fft_window_size, phase_iterations=10):
    console.log("Attempting to isolate vocals from", path)
    start_time = time.time()
    audio, sample_rate = conversion.load_audio(path)
    spectrogram, phase = conversion.audio_to_spectrogram(
        audio, fft_window_size=fft_window_size, sr=sample_rate)
    # spectrogram, phase = conversion.isolate_vocal_simple(audio, fft_window_size=fft_window_size, sr=sample_rate)
    console.log("Retrieved spectrogram; processing...")
    # Pad up to a multiple of the network's downscale factor, run the model,
    # then crop the prediction back to the original spectrogram size.
    expanded_spectrogram = conversion.expand_to_grid(
        spectrogram, self.peak_downscale_factor)
    expanded_spectrogram_with_batch_channels = expanded_spectrogram[
        np.newaxis, :, :, np.newaxis]
    predicted_spectrogram_with_batch_channels = self.model.predict(
        expanded_spectrogram_with_batch_channels)
    predicted_spectrogram = predicted_spectrogram_with_batch_channels[
        0, :, :, 0]
    new_spectrogram = predicted_spectrogram[:spectrogram.shape[0],
                                            :spectrogram.shape[1]]
    console.log("Processed spectrogram; reconverting to audio")
    new_audio = conversion.spectrogram_to_audio(
        new_spectrogram,
        fft_window_size=fft_window_size,
        phase_iterations=phase_iterations)
    path_parts = os.path.split(path)
    filename_parts = os.path.splitext(path_parts[1])
    output_filename_base = os.path.join(path_parts[0],
                                        filename_parts[0] + "_acapella")
    console.log("Converted to audio; writing to", output_filename_base)
    conversion.save_audio(new_audio, output_filename_base + ".wav",
                          sample_rate)
    conversion.save_spectrogram(new_spectrogram,
                                output_filename_base + ".png")
    conversion.save_spectrogram(
        spectrogram, os.path.join(path_parts[0], filename_parts[0]) + ".png")
    # console.log("Vocal isolation complete 👌")
    console.log("Execution time: {}".format(time.time() - start_time))
    return new_audio
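

# A guess at what conversion.expand_to_grid does, inferred from the crop
# back to spectrogram.shape above: zero-pad each spatial dimension up to the
# next multiple of the downscale factor so the network's pooling and
# upsampling stages divide cleanly. Sketch only; the name is hypothetical.
def _expand_to_grid_sketch(spectrogram, grid_size):
    import numpy as np
    padded_h = -(-spectrogram.shape[0] // grid_size) * grid_size  # ceil division
    padded_w = -(-spectrogram.shape[1] // grid_size) * grid_size
    expanded = np.zeros((padded_h, padded_w), dtype=spectrogram.dtype)
    expanded[:spectrogram.shape[0], :spectrogram.shape[1]] = spectrogram
    return expanded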
def load(self, as_h5=False):
    h5_path = os.path.join(self.inPath, "data.h5")
    if os.path.isfile(h5_path):
        h5f = h5py.File(h5_path, "r")
        self.x = h5f["x"][:]
        self.y = h5f["y"][:]
        h5f.close()
    else:
        acapellas = {}
        instrumentals = {}
        # Hash bins for each Camelot key so we can merge.
        # In the future, this should be a generator w/ yields in order to eat less memory.
        for i in range(NUMBER_OF_KEYS):
            key = i + 1
            acapellas[key] = []
            instrumentals[key] = []
        for dir_path, dir_names, file_names in os.walk(self.inPath):
            for file_name in filter(
                    lambda f: (f.endswith(".mp3") or f.endswith(".wav"))
                    and not f.startswith("."), file_names):
                key = key_of_file(file_name)
                if key:
                    is_acapella = file_is_acapella(file_name)
                    target_path_map = acapellas if is_acapella else instrumentals
                    tag = "[Acapella]" if is_acapella else "[Instrumental]"
                    audio, sample_rate = conversion.load_audio(
                        os.path.join(self.inPath, file_name))
                    spectrogram, phase = conversion.audio_to_spectrogram(
                        audio, self.fft_window_size, sr=sample_rate)
                    target_path_map[key].append(spectrogram)
                    console.info(tag, "Created spectrogram for", file_name,
                                 "in key", key, "with shape",
                                 spectrogram.shape)
        # Merge mashups.
        for k in range(NUMBER_OF_KEYS):
            key = k + 1
            acapellas_in_key = acapellas[key]
            instrumentals_in_key = instrumentals[key]
            count = 0
            for acapella in acapellas_in_key:
                for instrumental in instrumentals_in_key:
                    # Pad the shorter track with zeros so the shapes match.
                    if instrumental.shape[1] < acapella.shape[1]:
                        new_instrumental = np.zeros(acapella.shape)
                        new_instrumental[:instrumental.shape[0],
                                         :instrumental.shape[1]] = instrumental
                        instrumental = new_instrumental
                    elif acapella.shape[1] < instrumental.shape[1]:
                        new_acapella = np.zeros(instrumental.shape)
                        new_acapella[:acapella.shape[0],
                                     :acapella.shape[1]] = acapella
                        acapella = new_acapella
                    # Simulate a limiter/low mixing (loses info, but that's the point).
                    # I've tested this against making the same mashups in Logic and it's pretty close.
                    mashup = np.maximum(acapella, instrumental)
                    # Chop into slices so everything's the same size in a batch.
                    dim = SLICE_SIZE
                    mashup_slices = chop(mashup, dim)
                    acapella_slices = chop(acapella, dim)
                    count += 1
                    self.x.extend(mashup_slices)
                    self.y.extend(acapella_slices)
            console.info("Created", count, "mashups for key", key, "with",
                         len(self.x), "total slices so far")
        # Add a "channels" channel to please the network.
        self.x = np.array(self.x)[:, :, :, np.newaxis]
        self.y = np.array(self.y)[:, :, :, np.newaxis]
        # Save to file if asked.
        if as_h5:
            h5f = h5py.File(h5_path, "w")
            h5f.create_dataset("x", data=self.x)
            h5f.create_dataset("y", data=self.y)
            h5f.close()
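

# The chop helper isn't shown in this excerpt; a plausible sketch, under the
# assumption that it tiles a 2D spectrogram into non-overlapping
# SLICE_SIZE x SLICE_SIZE squares and drops the ragged right/bottom edges so
# every batch item has the same shape. Hypothetical, not the repo's version.
def _chop_sketch(matrix, scale):
    slices = []
    for t in range(matrix.shape[1] // scale):
        for f in range(matrix.shape[0] // scale):
            slices.append(matrix[f * scale:(f + 1) * scale,
                                 t * scale:(t + 1) * scale])
    return slices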
import console
import conversion
import numpy as np
import sst
import ipdb

test_files = [
    "../data/aligned/one_last_time/one_last_time_cover_aligned_30s.mp3",
    "../data/aligned/one_last_time/one_last_time_original_30s.mp3"
]
# test_files = ["sample/rolling_in_the_deep/style.mp3"]
for f in test_files:
    console.time("preprocessing")
    console.log("starting", f)
    audio, sample_rate = conversion.file_to_audio(f)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    console.timeEnd("preprocessing")

    console.time("extracting fundamental")
    fundamental_mask = sst.extract_fundamental(amplitude)
    console.timeEnd("extracting fundamental")
    conversion.image_to_file(fundamental_mask, f + ".fundamental.png")

    console.time("fundamental to harmonics")
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    harmonics = sst.fundamental_to_harmonics(fundamental_freqs,
                                             fundamental_amps, amplitude)
    console.timeEnd("fundamental to harmonics")
    conversion.image_to_file(harmonics, f + ".harmonics.png")
    # pitch normalization haha
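

# For intuition: harmonics sit at integer multiples of the fundamental, so a
# crude stand-in for sst.fundamental_to_harmonics might look like the sketch
# below. Hypothetical code, assuming fundamental_freqs is indexed by time
# frame and expressed in spectrogram bins; not the actual sst implementation.
def _harmonics_sketch(fundamental_freqs, fundamental_amps, n_bins, n_harmonics=8):
    import numpy as np
    out = np.zeros((n_bins, len(fundamental_freqs)))
    for t, (f0_bin, amp) in enumerate(zip(fundamental_freqs, fundamental_amps)):
        for k in range(1, n_harmonics + 1):
            harmonic_bin = int(round(k * f0_bin))
            if 0 < harmonic_bin < n_bins:
                out[harmonic_bin, t] = amp / k  # roll off the higher harmonics
    return out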
import console
import conversion
import numpy as np
import sst
import ipdb
from skimage.morphology import dilation

# A test of what we could get if we perfectly matched each element of style.
test_content_file = "sample/rolling_in_the_deep/content.mp3"
test_style_file = "sample/rolling_in_the_deep/reference_stylized.mp3"

# Load them both as spectrograms.
console.time("preprocessing")
content_audio, content_sample_rate = conversion.file_to_audio(
    test_content_file)
content_amplitude, content_phase = conversion.audio_to_spectrogram(
    content_audio, fft_window_size=1536)
style_audio, style_sample_rate = conversion.file_to_audio(test_style_file)
style_amplitude, style_phase = conversion.audio_to_spectrogram(
    style_audio, fft_window_size=1536)
console.timeEnd("preprocessing")

stylized_amplitude = np.zeros(content_amplitude.shape)
num_freq, num_timesteps, _ = content_amplitude.shape
num_timesteps = min(num_timesteps, style_amplitude.shape[1])

# Preprocessing - compute fundamentals and harmonics.
console.time("super resolution")
content_fundamental_mask = sst.extract_fundamental(content_amplitude)
content_fundamental_freqs, content_fundamental_amps = sst.extract_fundamental_freqs_amps(
    content_fundamental_mask, content_amplitude)
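
# A minimal sketch of the "perfect match" idea this script tests: replace
# each content frame with its nearest style frame in amplitude space.
# Brute-force and hypothetical; the real comparison pipeline continues past
# this excerpt.
#
# for t in range(num_timesteps):
#     frame = content_amplitude[:, t]  # (num_freq, channels)
#     distances = np.linalg.norm(
#         style_amplitude[:, :num_timesteps] - frame[:, np.newaxis, :],
#         axis=(0, 2))  # Frobenius distance to every style frame
#     stylized_amplitude[:, t] = style_amplitude[:, np.argmin(distances)]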