def mean_squared_error(self, processed_file=None, vocal_file=None):
    """Report mean squared error for vocal isolation.

    With no ``processed_file``, run the model over every validation and
    test track, printing a per-track spectrogram MSE and the overall
    mean. Otherwise compare ``processed_file`` against ``vocal_file``
    sample-by-sample in the waveform domain.
    """
    normalizer = Normalizer()
    normalize = normalizer.get(both=False)

    if processed_file is None:
        vocal_isolation = VocalIsolation(config)
        vocal_isolation.loadWeights(config.weights)

        data = Data()
        errors = []
        for track in data.validation_tracks + data.test_tracks:
            mashup = data.prepare_spectrogram(data.mashup[track])
            vocal = data.prepare_spectrogram(data.vocal[track])

            # Apply the same norm to both so the comparison is fair.
            mashup, norm = normalize(mashup)
            vocal, _ = normalize(vocal, norm)

            info = vocal_isolation.process_spectrogram(
                mashup, config.get_channels())
            predicted = info[1]

            error = ((predicted - vocal) ** 2).mean()
            errors.append(error)
            print(track, error)
        print(np.mean(errors))
    else:
        vocal_audio, _ = conversion.load_audio_file(vocal_file)
        processed_audio, _ = conversion.load_audio_file(processed_file)

        # make sure audios have the same length
        vocal_audio = vocal_audio[:processed_audio.shape[0]]
        processed_audio = processed_audio[:vocal_audio.shape[0]]

        wave_mse = ((vocal_audio - processed_audio) ** 2).mean()
        print("\n")
        self._write("Wave mean squared error: %s" % wave_mse)
def stoi(self, filepath, clean_filepath=None):
    """Compute the STOI intelligibility score via Octave.

    ``filepath`` is the path to a mashup (or, when ``clean_filepath``
    is given, to already-processed audio to compare against the clean
    reference).

    Needs octave and octave-signal installed; use "pip install oct2py"
    for the Python-Octave bridge. STOI assumes:
      * a sampling rate of 10kHz, resamples otherwise
      * window length of 384ms
      * 15 third octave bands over full frequency range
      * overlapping segments with hanning window
      * removes silent frames
    """
    import librosa
    from oct2py import octave

    if clean_filepath is None:
        # No clean file given: run the model on the mashup and derive
        # the clean path from the mashup's filename.
        vocal_isolation = VocalIsolation(config)
        vocal_isolation.loadWeights(config.weights)
        audio, _ = conversion.load_audio_file(filepath)
        spectrogram = conversion.audio_file_to_spectrogram(
            audio, fftWindowSize=config.fft,
            learn_phase=self.config.learn_phase)

        normalizer = Normalizer()
        normalize = normalizer.get(both=False)
        denormalize = normalizer.get_reverse()

        # Normalize before inference. BUG FIX: a typo previously bound
        # the normalized result to a misspelled name ("spectogram"),
        # so the model received the raw, unnormalized spectrogram while
        # its output was still de-normalized with `norm`.
        spectrogram, norm = normalize(spectrogram)

        info = vocal_isolation.process_spectrogram(
            spectrogram, config.get_channels())
        spectrogram, new_spectrogram = info

        # de-normalize the model output back to the input scale
        new_spectrogram = denormalize(new_spectrogram, norm)

        processed = conversion.spectrogram_to_audio_file(
            new_spectrogram, config.fft, config.phase_iterations)

        clean_filepath = filepath.replace("_all.wav", "_vocal.wav")
        clean, sampling_rate = librosa.load(clean_filepath)
    else:
        # A clean file is given: compare it with the processed audio.
        processed, sampling_rate = librosa.load(filepath)
        clean, sampling_rate = librosa.load(clean_filepath)

    # Make sure the original and processed audio have the same length
    clean = clean[:processed.shape[0]]

    octave.eval("pkg load signal")
    d = octave.stoi(clean, processed, sampling_rate)
    self._write("stoi: %f" % d)
def process_spectrogram(self, spectrogram, channels=1):
    """Run the model over a spectrogram, one slice at a time.

    The spectrogram is chopped into inference-sized slices; each slice
    is normalized, padded to the network grid, predicted, cropped and
    de-normalized, then the pieces are re-joined along the time axis.

    Returns a tuple of (input spectrogram, predicted spectrogram).
    """
    chopper = Chopper()
    chopper.name = "infer"
    chopper.params = "{'scale': %d}" % self.config.inference_slice
    chop = chopper.get(both=False)

    normalizer = Normalizer()
    normalize = normalizer.get(both=False)
    denormalize = normalizer.get_reverse()

    result = np.zeros((spectrogram.shape[0], 0, channels))
    for piece in chop(spectrogram):
        # Normalize each slice independently before inference.
        piece, norm = normalize(piece)

        # Pad up to the network's grid size, then add a batch axis.
        padded = conversion.expand_to_grid(
            piece, self.peakDownscaleFactor, channels)
        batched = padded[np.newaxis, :, :]

        prediction = self.model.predict(batched)

        # Drop the batch axis and crop the grid padding back off.
        cropped = prediction[0, :piece.shape[0], :piece.shape[1], :]

        # Undo the normalization applied above.
        cropped = denormalize(cropped, norm)
        result = np.concatenate((result, cropped), axis=1)

    console.log("Processed spectrogram")
    return spectrogram, result
def volume(self, filepath):
    """Measure how vocal volume scaling affects isolation error.

    Builds mixes where the vocal track is scaled by a log-symmetric
    range of factors, runs the model on each, records the MSE per
    factor in ``volume.hdf5`` and plots the curve (with an unscaled
    baseline) to ``volume.png`` under ``self.analysisPath``.
    """
    normalizer = Normalizer()
    normalize = normalizer.get(both=False)
    denormalize = normalizer.get_reverse()

    vocal_file = filepath.replace("_all.wav", "_vocal.wav")
    instrumental_file = filepath.replace("_all.wav", "_instrumental.wav")

    vocal_isolation = VocalIsolation(config)
    vocal_isolation.loadWeights(config.weights)

    instrumental_audio, _ = conversion.load_audio_file(instrumental_file)
    vocal_audio, _ = conversion.load_audio_file(vocal_file)

    instrumental = conversion.audio_file_to_spectrogram(
        instrumental_audio, fftWindowSize=config.fft,
        learn_phase=self.config.learn_phase)
    vocal = conversion.audio_file_to_spectrogram(
        vocal_audio, fftWindowSize=config.fft,
        learn_phase=self.config.learn_phase)

    # Ensure the output directory exists once, up front.
    if not os.path.exists(self.analysisPath):
        os.mkdir(self.analysisPath)

    h5f_path = os.path.join(self.analysisPath, "volume.hdf5")
    h5file = h5py.File(h5f_path, "w")

    # Scale factors 1/ratio .. 1 .. ratio, symmetric around 1 on a
    # log scale; every factor is strictly positive.
    ratio = 100
    x = [i/ratio for i in range(1, ratio)] + \
        [1] + \
        [ratio/i for i in range(ratio-1, 0, -1)]
    h5file.create_dataset(name="x", data=x)

    print("Unscaled original mix")
    mashup, norm = normalize(instrumental + vocal)
    info = vocal_isolation.process_spectrogram(mashup,
                                               config.get_channels())
    new_spectrogram = denormalize(info[1], norm)
    mse = ((new_spectrogram - vocal)**2).mean()

    # The baseline is a flat line: the error of the unscaled mix.
    y = [mse for _ in x]
    plt.loglog(x, y, label="baseline")
    h5file.create_dataset(name="baseline", data=y)

    original_ratio = np.max(vocal)/np.max(instrumental)
    print("Original ratio: %s" % original_ratio)
    # Rescale the vocal so its peak matches the instrumental's.
    vocal /= original_ratio

    print("Change vocal volume")
    y = []
    for i in x:
        mashup, norm = normalize(instrumental + i * vocal)
        info = vocal_isolation.process_spectrogram(mashup,
                                                   config.get_channels())
        new_spectrogram = denormalize(info[1], norm)
        # Undo the input scaling before comparing with the reference.
        # All factors in x are strictly positive, so dividing is safe
        # (the previous "if i != 0" guard was dead code).
        new_spectrogram = new_spectrogram / i
        mse = ((new_spectrogram - vocal)**2).mean()
        y.append(mse)
        print(mse)

    plt.loglog(x, y, label="scaled")
    plt.xlabel("vocal/instrumental")
    plt.ylabel("mean squared error")
    plt.legend()
    h5file.create_dataset(name="scale", data=y)
    h5file.close()

    # The directory was already created above; the original re-checked
    # os.path.exists(self.analysisPath) here redundantly.
    plt.savefig(os.path.join(self.analysisPath, "volume.png"))