def music_processing(music_pth, ret_value):
    """Extract the vocal stem from a music file using Spleeter (2 stems).

    :param music_pth: path to the input audio file
    :param ret_value: dict-like shared container (e.g. a multiprocessing
        manager dict) that receives the vocals under the 'vocals' key
    :return: numpy array holding the separated vocals waveform
    """
    separator = Separator(params_descriptor='spleeter:2stems')
    audio_adapter = AudioAdapter.get(
        'spleeter.audio.ffmpeg.FFMPEGProcessAudioAdapter')
    # BUG FIX: the original referenced the undefined name `music_path`
    # (NameError at runtime); the parameter is spelled `music_pth`.
    waveform, _ = audio_adapter.load(music_pth,
                                     dtype=np.float32,
                                     sample_rate=22050)
    sources = separator.separate(waveform=waveform,
                                 audio_descriptor=music_pth)
    vocals = sources['vocals']
    ret_value['vocals'] = vocals
    return vocals
def separate_one_audio_on_accompaniment_and_vocals_by_spleeter(path_to_audio, sample_rate, output_directory):
    """Split one audio file into accompaniment and vocals with Spleeter.

    Writes `<stem>_accompaniment.wav` and `<stem>_vocals.wav`, using
    `output_directory` as a raw string prefix for the output paths.
    """
    loader = get_default_audio_adapter()
    engine = Separator('spleeter:2stems')
    # Strip leading directory components, whether '/' or '\\' separated.
    base_name = path_to_audio.split('/')[-1].split('\\')[-1]
    stem = '.'.join(base_name.split('.')[:-1])
    audio_data, _ = loader.load(path_to_audio, sample_rate=sample_rate)
    # Perform the separation:
    stems = engine.separate(audio_data)
    for part in ('accompaniment', 'vocals'):
        wavfile.write(output_directory + stem + '_' + part + '.wav',
                      sample_rate, stems[part])
    # Drop the large intermediates before forcing a collection pass.
    del loader, engine, audio_data, stems
    gc.collect()
def test_separate(configuration, instruments):
    """Test separation from raw data."""
    loader = get_default_audio_adapter()
    source, _ = loader.load(TEST_AUDIO_DESCRIPTOR)
    prediction = Separator(configuration).separate(source)
    # One output stem per expected instrument.
    assert len(prediction) == len(instruments)
    assert all(name in prediction for name in instruments)
    for name in instruments:
        stem = prediction[name]
        # A stem must differ from the input mix ...
        assert not (source == stem).all()
        # ... and from every other stem.
        for other in instruments:
            if name != other:
                assert not (stem == prediction[other]).all()
def do_svs_spleeter(y, sr):
    """Singing-voice separation via Spleeter's pretrained 2-stems model.

    :param y: mono waveform (1-D numpy array)
    :param sr: sampling rate of `y`
    :return: tuple (mono vocals waveform at 44100 Hz, 44100)
    """
    from spleeter.separator import Separator
    import warnings
    separator = Separator('spleeter:2stems')
    warnings.filterwarnings('ignore')
    # The pretrained models expect 44.1 kHz input.
    if sr != 44100:
        y = librosa.core.resample(y=y, orig_sr=sr, target_sr=44100)
    waveform = np.expand_dims(y, axis=1)
    prediction = separator.separate(waveform)
    ret = librosa.core.to_mono(prediction["vocals"].T)
    # Consistency/robustness fix: clamp to the valid audio range, as the
    # sibling do_svs_spleeter implementation does, so downstream PCM
    # writers do not wrap on out-of-range samples.
    ret = np.clip(ret, -1.0, 1.0)
    return ret, 44100
def initialize_components(self):
    """Run Spleeter on the stored mix and return (components, names).

    Each component is mixed down to mono and resampled back to
    ``self.target_sr``.
    """
    SPLEETER_SR = 44100  # rate the pretrained Spleeter models operate at
    engine = Separator(self.model_name, multiprocess=False)
    mix = librosa.resample(self._original_mix, self.target_sr, SPLEETER_SR)
    stems = engine.separate(np.expand_dims(mix, axis=1))
    names = list(stems.keys())
    components = []
    for key in stems:
        mono = np.mean(stems[key], axis=1)
        components.append(librosa.resample(mono, SPLEETER_SR, self.target_sr))
    return components, names
def test_separate(configuration, instruments, backend):
    """Test separation from raw data."""
    loader = get_default_audio_adapter()
    source, _ = loader.load(TEST_AUDIO_DESCRIPTOR)
    separator = Separator(configuration, stft_backend=backend)
    prediction = separator.separate(source, TEST_AUDIO_DESCRIPTOR)
    # One output stem per expected instrument.
    assert len(prediction) == len(instruments)
    assert all(name in prediction for name in instruments)
    for name in instruments:
        stem = prediction[name]
        # Same shape as the mix, but different content ...
        assert source.shape == stem.shape
        assert not np.allclose(source, stem)
        # ... and different from every other stem.
        for other in instruments:
            if name != other:
                assert not np.allclose(stem, prediction[other])
def do_svs_spleeter(y, sr):
    """Extract the vocal stem from `y` with Spleeter (2 stems).

    Returns the clipped mono vocals at 44100 Hz together with that rate.
    """
    from spleeter.separator import Separator
    import warnings
    separator = Separator('spleeter:2stems')
    warnings.filterwarnings('ignore')
    # The pretrained model expects 44.1 kHz audio.
    audio = y if sr == 44100 else librosa.core.resample(
        y=y, orig_sr=sr, target_sr=44100)
    stems = separator.separate(np.expand_dims(audio, axis=1))
    vocals = librosa.core.to_mono(stems["vocals"].T)
    vocals = np.clip(vocals, -1.0, 1.0)
    # Release the TF session/model before returning.
    del separator
    return vocals, 44100
def test_separate(test_file, configuration, backend):
    """Test separation from raw data."""
    instruments = MODEL_TO_INST[configuration]
    loader = get_default_audio_adapter()
    source, _ = loader.load(test_file)
    separator = Separator(configuration,
                          stft_backend=backend,
                          multiprocess=False)
    prediction = separator.separate(source, test_file)
    # One output stem per expected instrument.
    assert len(prediction) == len(instruments)
    assert all(name in prediction for name in instruments)
    for name in instruments:
        stem = prediction[name]
        # Same length as the mix (channel count may differ) ...
        assert source.shape[:-1] == stem.shape[:-1]
        assert not np.allclose(source, stem)
        # ... and different from every other stem.
        for other in instruments:
            if name != other:
                assert not np.allclose(stem, prediction[other])
class SpleeterSeparator:
    """Performs source separation using Spleeter API."""

    def __init__(self, config=None):
        """Default constructor.

        :param config: Separator config (dict with keys audio_bitrate,
            audio_format, sample_rate, spleeter_stem), defaults to None
        """
        if config is None:
            self.audio_bitrate = '256k'
            self.audio_format = 'mp3'
            self.sample_rate = 44100
            self.spleeter_stem = 'config/4stems-16kHz.json'
        else:
            self.audio_bitrate = config['audio_bitrate']
            self.audio_format = config['audio_format']
            self.sample_rate = config['sample_rate']
            self.spleeter_stem = config['spleeter_stem']
        # Use librosa backend as it is less memory intensive
        self.separator = Separator(self.spleeter_stem,
                                   stft_backend='librosa',
                                   multiprocess=False)
        self.audio_adapter = get_default_audio_adapter()

    def separate(self, parts, input_path, output_path):
        """Performs source separation by averaging the parts to be kept.

        :param parts: Mapping of part name ('vocals', 'drums', 'bass',
            'other') to a truthy "keep" flag
        :param input_path: Path to source file
        :param output_path: Path to output file
        :raises e: FFMPEG error
        :raises ValueError: if no part was requested
        """
        waveform, _ = self.audio_adapter.load(input_path,
                                              sample_rate=self.sample_rate)
        prediction = self.separator.separate(waveform)
        out = np.zeros_like(prediction['vocals'])
        part_count = 0
        # Add up parts that were requested. Robustness fix: stems missing
        # from `parts` are skipped instead of raising KeyError.
        for key in prediction:
            if parts.get(key):
                out += prediction[key]
                part_count += 1
        # Robustness fix: the original divided unconditionally and raised
        # an opaque ZeroDivisionError when nothing was selected.
        if part_count == 0:
            raise ValueError('at least one part must be selected')
        out /= part_count
        self.audio_adapter.save(output_path, out,
                                self.separator._sample_rate,
                                self.audio_format, self.audio_bitrate)
def source_seperate_ogg(ogg_list: list):
    """Run 4-stems Spleeter separation over a list of ogg paths.

    Output paths mirror the input paths with 'Unprocessed' replaced by
    'source_separated'. NOTE(review): the trailing `break` (kept from the
    original) means only the first file is processed — presumably a
    debugging leftover; confirm before removing.
    """
    separator = Separator('spleeter:4stems')
    audio_loader = AudioAdapter.default()
    sample_rate = 22050
    range_ = 32767  # int16 full-scale factor
    for ogg in ogg_list:
        waveform, _ = audio_loader.load(ogg, sample_rate=sample_rate)
        prediction = separator.separate(waveform)
        # Rescale the 'other' stem to int16 range.
        prediction['other'] = prediction['other'] * range_
        save_path = Path(
            str(ogg).replace('Unprocessed', 'source_separated', 1))
        # Robustness fix: os.mkdir failed when intermediate directories
        # were missing; makedirs creates the whole chain and tolerates an
        # already-existing directory.
        os.makedirs(save_path.parent, exist_ok=True)
        print(prediction)
        break
def execute(args):
    """Extract the vocal track from ``args.audio_file`` with Spleeter and
    write it to ``<base_path>/vocals.wav``.

    :param args: namespace with ``audio_file`` and optional ``parent_dir``
    :return: True on success; False / (False, None) on failure.
        NOTE(review): the failure paths return different arities — kept
        as-is for backward compatibility; confirm what callers unpack
        before unifying.
    """
    try:
        logger.info('音声認識処理開始: {0}', args.audio_file,
                    decoration=MLogger.DECORATION_BOX)

        if not os.path.exists(args.audio_file):
            # BUG FIX: the error message previously interpolated
            # args.video_file, which is not the path that was checked.
            logger.error("指定された音声ファイルパスが存在しません。\n{0}",
                         args.audio_file, decoration=MLogger.DECORATION_BOX)
            return False, None

        # Parent path (defaults to the audio's own directory; Colab passes
        # an explicit parent_dir because it creates files locally).
        base_path = str(pathlib.Path(args.audio_file).parent) \
            if not args.parent_dir else args.parent_dir

        audio_adapter = get_default_audio_adapter()
        sample_rate = 44100
        waveform, _ = audio_adapter.load(args.audio_file,
                                         sample_rate=sample_rate)

        # Split into vocals and accompaniment.
        separator = Separator('spleeter:2stems')

        # Perform the separation:
        prediction = separator.separate(waveform)

        # Vocal stem only is persisted.
        vocals = prediction['vocals']
        audio_adapter.save(f"{base_path}/vocals.wav", vocals,
                           separator._sample_rate, "wav", "16k")

        logger.info('音声認識処理終了: {0}', base_path,
                    decoration=MLogger.DECORATION_BOX)

        return True
    except Exception as e:
        logger.critical("音声認識で予期せぬエラーが発生しました。", e,
                        decoration=MLogger.DECORATION_BOX)
        return False
def initialize_components(self):
    """Return (components, names) from Spleeter separation of the mix.

    Results are cached on disk as ``<audio basename>.pt`` under
    ``self.spleeter_sources_path``; ``self.recompute`` forces a fresh
    separation, otherwise the pickled prediction is reloaded.
    """
    SPLEETER_SR = 44100  # rate the pretrained Spleeter models operate at
    cache_file = os.path.join(self.spleeter_sources_path,
                              os.path.basename(self._audio_path) + ".pt")
    if self.recompute:
        engine = Separator(self.model_name, multiprocess=False)
        mix = librosa.resample(self._original_mix, self.target_sr,
                               SPLEETER_SR)
        prediction = engine.separate(np.expand_dims(mix, axis=1))
        pickle_dump(prediction, cache_file)
    else:
        prediction = pickle_load(cache_file)
    names = list(prediction.keys())
    # Mono-mix each stem and bring it back to the working sample rate.
    components = [
        librosa.resample(np.mean(prediction[key], axis=1),
                         SPLEETER_SR, self.target_sr)
        for key in prediction
    ]
    return components, names
def spleet_wav(songpath, outfolder, num_stems):
    """Split `songpath` into `num_stems` stems (2/4/5) and write 16-bit
    WAVs into `outfolder`; the piano stem is overlaid onto "other" and its
    separate file removed afterwards."""
    def stem_path(instrument):
        # <outfolder>/<songname>_<instrument>_16-bit.wav
        return outfolder + "/" + songname + "_" + instrument + "_16-bit.wav"

    rate, audio = wavfile.read(songpath)
    songname = os.path.basename(os.path.normpath(songpath))
    warnings.filterwarnings('ignore')
    # Using embedded configuration... stems can be 2, 4, 5 (number of
    # instruments in the network).
    separator = Separator('spleeter:' + str(num_stems) + 'stems')
    # Perform the separation
    prediction = separator.separate(audio)
    rate = 44100
    for instrument in prediction:
        target = stem_path(instrument)
        print("Saving", instrument, "as: ", target)
        wavio.write(target, prediction[instrument].astype(np.int16), rate,
                    sampwidth=2)
    print("Overwriting other.wav with merged version")
    piano = AudioSegment.from_wav(stem_path("piano"))
    other = AudioSegment.from_wav(stem_path("other"))
    piano.overlay(other).export(stem_path("other"), format="wav")
    os.remove(stem_path("piano"))
    print("done merging other and piano")
# Batch-separate every <dataset_dir>/<song>/Mixture.mp3 into mono
# Vocal.wav / Inst.wav files written next to the mixture.
dataset_dir = sys.argv[1]
from spleeter.separator import Separator
import warnings
separator = Separator('spleeter:2stems')
for song_dir in os.listdir(dataset_dir):
    mixture = os.path.join(dataset_dir, song_dir, "Mixture.mp3")
    audio, sr = librosa.core.load(mixture, sr=None, mono=True)
    # The pretrained model expects 44.1 kHz input.
    if sr != 44100:
        audio = librosa.core.resample(y=audio, orig_sr=sr, target_sr=44100)
    stems = separator.separate(np.expand_dims(audio, axis=1))
    import soundfile
    for stem_key, out_name in (("vocals", "Vocal.wav"),
                               ("accompaniment", "Inst.wav")):
        mono = np.clip(librosa.core.to_mono(stems[stem_key].T), -1.0, 1.0)
        soundfile.write(os.path.join(dataset_dir, song_dir, out_name),
                        mono, 44100, subtype='PCM_16')
def transcribe(self, input_audio, model_path=None, output="./"):
    """Transcribe vocal notes in the audio.

    This function transcribes onset, offset, and pitch of the vocal in the
    audio. This module is responsible for predicting onset and offset time
    of each note, and pitches are estimated by the `vocal-contour`
    submodule.

    Parameters
    ----------
    input_audio: Path
        Path to the raw audio file (.wav).
    model_path: Path
        Path to the trained model or the supported transcription mode.
    output: Path (optional)
        Path for writing out the transcribed MIDI file. Default to the
        current path.

    Returns
    -------
    midi: pretty_midi.PrettyMIDI
        The transcribed vocal notes.

    Outputs
    -------
    This function will output three files as listed below:

    - <song>.mid: the MIDI file with complete transcription results in
      piano soundfont.
    - <song>_f0.csv: pitch contour information of the vocal.
    - <song>_trans.wav: the rendered pitch contour audio.

    See Also
    --------
    omnizart.cli.vocal.transcribe: CLI entry point of this function.
    omnizart.vocal_contour.transcribe: Pitch estimation function.
    """
    logger.info("Separating vocal track from the audio...")
    separator = Separator('spleeter:2stems')
    # Tricky way to avoid the annoying tensorflow graph being finalized
    # issue.
    separator._params["stft_backend"] = "librosa"  # pylint: disable=protected-access
    wav, fs = load_audio(input_audio, mono=False)
    pred = separator.separate(wav)

    logger.info("Loading model...")
    model, model_settings = self._load_model(model_path)

    logger.info("Extracting feature...")
    # Mono-mix the separated vocal stem before feature extraction.
    wav = librosa.to_mono(pred["vocals"].squeeze().T)
    feature = _extract_vocal_cfp(
        wav,
        fs,
        down_fs=model_settings.feature.sampling_rate,
        hop=model_settings.feature.hop_size,
        fr=model_settings.feature.frequency_resolution,
        fc=model_settings.feature.frequency_center,
        tc=model_settings.feature.time_center,
        g=model_settings.feature.gamma,
        bin_per_octave=model_settings.feature.bins_per_octave)

    logger.info("Predicting...")
    # `pred` is reused: from here on it holds the model output, not the
    # spleeter stems.
    pred = predict(feature, model)

    logger.info("Infering notes...")
    interval = infer_interval(
        pred,
        ctx_len=model_settings.inference.context_length,
        threshold=model_settings.inference.threshold,
        min_dura=model_settings.inference.min_duration,
        t_unit=model_settings.feature.hop_size)

    logger.info("Extracting pitch contour")
    # Pitch is estimated by the vocal-contour submodule on the *original*
    # audio, not the separated stem.
    agg_f0 = vcapp.app.transcribe(
        input_audio,
        model_path=model_settings.inference.pitch_model,
        output=output)

    logger.info("Inferencing MIDI...")
    midi = infer_midi(interval, agg_f0,
                      t_unit=model_settings.feature.hop_size)

    self._output_midi(output=output, input_audio=input_audio, midi=midi)
    logger.info("Transcription finished")
    return midi
class SpleeterSeparator(ABCSeparator):
    """Spleeter separator uses the spleeter library to separate music
    sources. """

    def __init__(self, stems: int, chunk_size=2):
        """
        Args:
            stems (int): total files to generate (2/3/5).
            chunk_size (int): chunk size in minutes (converted to seconds
                below) indicating the duration of an individual chunk used
                when splitting the audio.

        NOTE: Longer audio file takes more memory. Hence, splitting the
        audio is a workaround.
        """
        # specified stem loads a specific model
        # hence, it should be specified which model
        # to load.
        self.stems = stems
        # chunk length stored as a whole number of seconds
        self.chunk_size = int(chunk_size * 60)
        self._separator = Separator(f"spleeter:{self.stems}stems")
        # spleeter specific config
        self._audio_adapter = get_default_audio_adapter()

    def _chunk(self, waveform, sr):
        # Yield consecutive self.chunk_size-second slices of `waveform`.
        # NOTE(review): samples beyond the last whole second
        # (len(waveform) % sr) are dropped; the commented-out block below
        # suggests remainder handling was started but never finished —
        # confirm whether that tail loss is acceptable.
        chunks = []  # unused; kept from the original implementation
        length = len(waveform) // sr  # duration in whole seconds
        remainder = len(waveform) % sr  # trailing samples (currently unused)
        print(len(waveform), len(waveform) / sr)
        for c in range(0, length, self.chunk_size):
            print(c)
            # Slicing past the end is safe: numpy returns a shorter chunk.
            chunk = waveform[c * sr:(c + self.chunk_size) * sr]
            print(len(chunk))
            yield chunk
        """
        if remainder:
            chunk = waveform[(c + 1)*sr + remainder:]
            print(len(chunk), "remainder")
            yield chunk
        """

    def separate(self, audio: Union[str, np.ndarray], sample_rate=44_100):
        """Separate audio into specified stems.

        Note:
            Spleeter uses tensorflow backend. Hence, the corresponding
            installed device will automatically be used (CPU/GPU).
            Minimum VRAM/RAM requirement: 4GB (for small audio,
            <6 minutes).

        Args:
            audio (str, array): path to the original signal or the signal
                itself.
            sample_rate (int): sampling rate of the file.

        Returns:
            signal (Signal): separated signals.

        Raises:
            tf.errors.ResourceExhaustedError: When memory gets exhausted.
        """
        if isinstance(audio, np.ndarray):
            waveform = audio
        else:
            waveform, _ = self._audio_adapter.load(audio,
                                                   sample_rate=sample_rate)
        print(waveform.shape)
        # predict in chunks, accumulating per-stem lists of arrays
        prediction = {}
        for chunk in self._chunk(waveform, sample_rate):
            chunk_prediction = self._separator.separate(chunk)
            for chunk_key, chunk_value in chunk_prediction.items():
                if chunk_key not in prediction:
                    prediction[chunk_key] = []
                prediction.get(chunk_key).append(chunk_value)
        # merge chunk predictions back into one array per stem
        prediction = {k: np.vstack(v) for k, v in prediction.items()}
        print(list(v.shape for v in prediction.values()))
        signal = Signal(prediction.keys(), prediction.values())
        return signal
print("converting") else: print("not converting") sep = Separator('./2stem-finetune-realtime.json', MWF=False, stft_backend='tensorflow', multiprocess=False) class Spleeter_Server(LADSPA_TCPServer): def process(self, channel, sample_rate, data): if np.max(data) == np.min(data) == 0: return data if should_process: processed = sep.separate(data.astype('float64').reshape((-1, 1))) return processed['vocals'].astype('float32')[:, 0] else: return data if __name__ == "__main__": signal.signal(signal.SIGUSR1, handler) print("warming up") sep.separate(np.zeros((1024, 2))) print("run kill -SIGUSR1 %d to toggle the service on/off" % os.getpid()) print("serving on :18083") Spleeter_Server.serve_forever(18083)
# audio_loader = get_default_audio_adapter() # sample_rate = 44100 # waveform, _ = audio_loader.load('/path/to/audio/file', sample_rate=sample_rate) # Perform the separation : # for f in tqdm.tqdm(glob.glob("./seperate/*.wav")): f = "audio_example.mp3" y,sr = librosa.load(librosa.util.example_audio_file(),sr=None,mono=False) yx = np.hstack([y]) duration = yx.shape[1]/sr print(duration) time.sleep(4) start_time = time.time() silce_t = duration/10 prediction = separator.separate(yx.T) for i in tqdm.tqdm(range(10)): start = int(sr*silce_t*i) end = int(sr*silce_t*(i+1)) silce = yx[:,start:end] prediction = separator.separate(silce.T) time.sleep(4) print(duration,time.time()-start_time)
def main(argv=None):
    """Replace selected time ranges of a clip's audio with their
    Spleeter-processed version.

    docopt arguments: <input> source file, -o output file, <range> list of
    "start~end" time ranges, --pass number of separation passes, --inverse
    to keep the accompaniment instead of the vocals. Each range is sliced
    out, run through Spleeter, and mixed back over the (muted) original
    with a single ffmpeg filter graph.
    """
    args = docopt(__doc__, argv=argv)
    fi = Clip(args['<input>'])
    fo = args['-o']
    # Parse "start~end" strings into (start, end) second tuples.
    # NOTE: the generator variable is named `range`, shadowing the builtin
    # only inside the generator expression.
    ranges = list(
        tuple(ptime(t) for t in range.split('~'))
        for range in args['<range>'])
    loader = AudioAdapter.default()
    sample_rate = 44100
    separator = Separator('spleeter:2stems')
    segments = {}
    for start, end in ranges:
        print(f'Processing range {start}-{end}...')
        options = ['-vn', '-r', str(sample_rate), '-f', 'wav']
        clip = fi.slice(start, end - start, output_options=options)[0]
        # Optionally run several separation passes over the same segment,
        # feeding each pass's output into the next.
        for i in range(int(args['--pass'])):
            waveform, _ = loader.load(clip.path, sample_rate=sample_rate)
            prediction = separator.separate(waveform)
            output = tmpfile('wav')
            target = 'accompaniment' if args['--inverse'] else 'vocals'
            loader.save(output, prediction[target], sample_rate)
            clip = Clip(output, tmpfile=output)
        segments[start] = clip
    print('Writing output file...')
    # Mute ranges in the original audio track
    # asetnsamples is required, source: https://superuser.com/a/1230890
    filters = '[0:a]asetnsamples=8192,'
    filters += ','.join(f"volume=0:enable='between(t,{start},{end})'"
                        for start, end in ranges)
    filters += '[main]'
    # Delay processed segments so each lines up with its original position
    for i, (start, end) in enumerate(ranges):
        delay = int(start * 1000)
        filters += f';[{i+1}]'
        filters += 'asetnsamples=8192'
        filters += f',adelay={delay}|{delay},apad[delay{i+1}]'
    # Mix muted original track and all processed segments
    filters += ';[main]'
    for i, (start, end) in enumerate(ranges):
        filters += f'[delay{i+1}]'
    filters += f'amix=inputs={len(ranges) + 1}:duration=first'
    # amix attenuates each input; scale the volume back up
    filters += f',volume={len(ranges) + 1}'
    filters += '[audio]'
    command = ['ffmpeg', '-i', fi.path]
    for start, segment in segments.items():
        command += ['-i', segment.path]
    # Copy codecs from the original video
    ainfo = fi.ffprobe('stream=codec_name,bit_rate', 'a')['streams'][0]
    command += [
        '-c:v', 'copy', '-c:a', ainfo['codec_name'], '-b:a',
        ainfo['bit_rate'], '-strict', '-2'
    ]
    command += [
        '-filter_complex', filters, '-map', '0:v', '-map', '[audio]', fo
    ]
    if run(command).returncode != 0:
        # Remove the partial output before reporting failure.
        if os.path.exists(fo):
            os.unlink(fo)
        raise Exception('ffmpeg exited with non-zero code')
class separateQThread(QThread):
    """Worker thread: slices a video's audio into 1-minute chunks, runs
    Spleeter vocal separation on each, and detects voiced regions from the
    variance of the vocal stem over 20 ms windows."""

    position = Signal(int)    # number of 1-minute chunks finished
    percent = Signal(float)   # progress percentage
    voiceList = Signal(list)  # [[start_ms, end_ms], ...] detected voice spans
    avgList = Signal(list)    # per-20ms mean values of the vocal stem
    finish = Signal(bool)     # emitted once at the very end

    def __init__(self, videoPath, duration, before, after, multiThread,
                 parent=None):
        super(separateQThread, self).__init__(parent)
        self.videoPath = videoPath
        # presumably total duration in milliseconds (60000 per chunk below)
        # — TODO confirm against the caller
        self.duration = duration
        self.beforeCnt = int(before) // 20  # leading padding, in 20 ms steps
        self.afterCnt = int(after) // 20    # trailing padding, in 20 ms steps
        self.separate = Separator('spleeter:2stems',
                                  stft_backend='tensorflow',
                                  multiprocess=multiThread)
        self.audioLoader = get_default_audio_adapter()

    def run(self):
        cuts = self.duration // 60000 + 1  # number of 1-minute chunks
        for cut in range(cuts):
            # Extract one minute of audio with the bundled ffmpeg.
            cmd = [
                'utils/ffmpeg.exe', '-y', '-i', self.videoPath, '-vn',
                '-ss', str(cut * 60), '-t', '60', 'temp_audio.m4a'
            ]
            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            p.wait()
            # Scrape ffmpeg's output for the 'Audio:' line to discover the
            # stream's sample rate.
            for line in p.stdout.readlines():
                try:
                    line = line.decode('gb18030', 'ignore')
                    if 'Audio:' in line:
                        break
                except:
                    pass
            for hz in line.split(','):
                if 'Hz' in hz:
                    hz = int(hz.split('Hz')[0])
                    break
            hz20 = hz // 50  # samples per 20 ms window
            waveform, _ = self.audioLoader.load('temp_audio.m4a')
            prediction = self.separate.separate(waveform)
            msList = []
            varList = []
            voiceList = []
            avgList = []
            # Only the vocals stem is analysed.
            for cnt, l in enumerate(prediction['vocals']):
                for i in l:
                    msList.append(i)
                if not cnt % hz20:  # take variance/mean every 20 ms
                    varList.append(np.var(msList))   # variance within the 20 ms window
                    avgList.append(np.mean(msList))  # mean within the 20 ms window
                    msList = []
            med = np.median(varList)  # median of all variances in this minute
            cnt = self.beforeCnt  # user-configured leading padding in 20 ms steps
            start = 0  # voice-segment start time
            end = 0    # voice-segment end time
            avgVarList = []  # smoothed variance values
            for varCnt in range(len(varList) - 5):
                # average each variance together with the next four values
                avgVarList.append(np.mean(varList[varCnt:varCnt + 5]))
            avgVarList += varList[-4:]  # append the last four unsmoothed values
            while cnt < len(avgVarList) - self.afterCnt:  # scan for voiced regions
                if avgVarList[cnt] >= med:  # smoothed variance above the minute's median
                    # start = current time minus the user's leading padding
                    start = cut * 60000 + (cnt - self.beforeCnt) * 20
                    cnt += self.afterCnt  # extend by the trailing padding
                    if cnt < len(avgVarList):  # still inside this minute → scan forward
                        finishToken = False
                        while not finishToken:
                            try:  # any out-of-range lookup simply ends the scan
                                # advance until smoothed variance < median
                                while avgVarList[cnt] >= med:
                                    cnt += 1
                                # pad once more, then re-check; accept if
                                # still below the median
                                cnt += self.afterCnt
                                if avgVarList[cnt] < med:
                                    finishToken = True
                            except:
                                break
                    end = cut * 60000 + cnt * 20  # end = where the scan stopped
                    voiceList.append([start, end])  # span for the signal emit
                else:
                    cnt += 1  # no voice detected here, advance one step
            self.position.emit(cut + 1)
            self.percent.emit((cut + 1) / cuts * 100)
            self.voiceList.emit(voiceList)
            self.avgList.emit(avgList)
        # (debug plotting, kept from the original)
        # plt.subplot(311)
        # plt.plot([x for x in range(len(avgList))], avgList)
        # plt.subplot(312)
        # plt.plot([x for x in range(len(avgVarList))], avgVarList)
        # plt.axhline(med, label='median')
        # plt.subplot(313)
        # x = []
        # y = []
        # modifyVoice = []
        # for l in voiceList:
        #     modifyVoice += l
        # trig = False
        # for i in range(self.duration):
        #     for l in modifyVoice:
        #         if i > l:
        #             trig = not trig
        #     x.append(i)
        #     if not trig:
        #         y.append(0)
        #     else:
        #         y.append(1)
        # plt.plot(x, y)
        # plt.legend()
        # plt.show()
        self.finish.emit(True)
'add padding zeros to each waveform' zero_padded_waveform = np.zeros( (clipped_waveform.shape[0] + padding_length * 2, clipped_waveform.shape[1])) zero_padded_waveform[padding_length:-padding_length] = clipped_waveform 'we may also use the extra data' if split_waveform_index - padding_length >= 0 and split_waveform_index + window_size + padding_length < len( waveform): zero_padded_waveform[:padding_length] = waveform[ split_waveform_index - padding_length:split_waveform_index] zero_padded_waveform[-padding_length:] = waveform[ split_waveform_index + window_size:split_waveform_index + window_size + padding_length] 'sperate using spleeter' prediction = separator.separate(zero_padded_waveform) 'clip padded part, throw them away' prediction['vocals'] = prediction['vocals'][padding_length:-padding_length] prediction['accompaniment'] = prediction['accompaniment'][ padding_length:-padding_length] 'merge results together' vocal_res.extend(prediction['vocals']) accompan_res.extend(prediction['accompaniment']) def interval_to_info(interval_seq): res = [] start = 0 in_interval = False for (index, label) in zip(range(len(interval_seq)), interval_seq): if label >= 1 and not in_interval:
def separate(waveform):
    """Split `waveform` into vocals/accompaniment with Spleeter 2-stems.

    Note: a fresh Separator (and model load) happens on every call.
    """
    return Separator('spleeter:2stems').separate(waveform)
class AudioDetect:
    """Speech-segment detection pipeline: Spleeter isolates the vocal
    track, then inaSpeechSegmenter labels it, and speech intervals are
    reported as JSON."""

    def __init__(self, model_path_1, model_path_2):
        # Frequency-domain source separation; isolating the voice only
        # needs 2 stems: accompaniment.wav (backing track) and vocals.wav
        # (extracted voice).
        self.spleeter = Separator('spleeter:2stems', model_path_1)
        # Build the TF predictor eagerly so the first request is fast.
        self.spleeter._get_predictor()
        self.ina_speech_segmenter = Segmenter(detect_gender=False,
                                              model_dir=model_path_2)  ######
        logging.info("init done")

    def file_base_name(self, file_path):
        # Base name of the file, without directory or extension.
        return Path(file_path).resolve().stem

    def spleeter_volcals_file_name(self, input_file, output_dir):
        # [sic "volcals"] Path of the vocals stem written for input_file.
        input_base_name = self.file_base_name(input_file)
        return output_dir + "/" + input_base_name + "/vocals.wav"

    # get
    def do_spleeter_from_buffer(self, input_buffer):
        # In-memory variant: returns the vocals waveform directly.
        waveform = buffer_utils.buffer_to_wave_for_spleeter(
            input_buffer, 44100)
        sources = self.spleeter.separate(waveform)
        return sources['vocals']

    def do_spleeter(self, input_file, out_dir):
        # Stem files are written under out_dir.
        self.spleeter.separate_to_file(
            input_file, out_dir,
            filename_format='(unknown)/{instrument}.{codec}')
        return True

    def do_segment_from_buffer(self, input_buffer):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            mspec, loge, difflen = buffer_utils.feat_from_spleeter_vocals_for_segment_two_transcode(
                input_buffer)
            segmention = self.ina_speech_segmenter.segment_feats(
                mspec, loge, difflen, 0)
            return (True, segmention)

    def do_segment(self, input, output_dir):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Segment the vocals file that do_spleeter wrote earlier.
            segmention = self.ina_speech_segmenter(
                self.spleeter_volcals_file_name(input, output_dir))
            return (True, segmention)

    def process_segmention(self, result_dic, segmention):
        # Merge consecutive equal labels and keep only "speech" runs as
        # {"type", "startSec", "endSec"} entries in result_dic["segments"].
        last_lable = ""
        last_start = -1
        last_end = -1
        segments = []
        for segment in segmention:
            label = segment[0]
            label = self.map_label(label)
            start = round(float(segment[1]), 2)
            end = round(float(segment[2]), 2)
            if last_lable == "":
                # First segment: just start a run.
                last_lable = label
                last_start = start
                last_end = end
                continue
            if last_lable == label:
                # Same label: extend the current run.
                last_end = end
                continue
            else:
                # Label changed: flush the finished run if it was speech.
                if last_lable == "speech":
                    segments.append({
                        "type": "speech",
                        "startSec": last_start,
                        "endSec": last_end
                    })
                last_lable = label
                last_start = start
                last_end = end
        # Flush the final run.
        if last_lable == "speech":
            segments.append({
                "type": "speech",
                "startSec": last_start,
                "endSec": last_end
            })
        result_dic["segments"] = segments

    def map_label(self, label):
        # Both music and speech count as "speech" here; everything else is
        # mapped to "noEnergy".
        speech_labels = ["music", "speech"]
        if label in speech_labels:
            return "speech"
        return "noEnergy"

    def process_from_buffer(self, input_buffer, input_file):
        # Full pipeline on an in-memory buffer; returns a JSON string.
        result_dic = {}
        result_dic.clear()
        input_base_name = os.path.basename(input_file)
        result_dic["fileName"] = input_base_name
        vocals_data = self.do_spleeter_from_buffer(input_buffer)
        if vocals_data is None:
            logging.error("separate failed")
            return json.dumps(result_dic, ensure_ascii=False)
        result, segmention = self.do_segment_from_buffer(
            vocals_data)  # make sure vocals_data is 16kHz
        if not result:
            logging.error("segment failed")
            return json.dumps(result_dic, ensure_ascii=False)
        self.process_segmention(result_dic, segmention)
        return json.dumps(result_dic, ensure_ascii=False)

    def process(self, input, output):
        # Full pipeline on a file; returns a JSON string.
        result_dic = {}
        result_dic.clear()
        input_base_name = os.path.basename(input)
        result_dic["fileName"] = input_base_name
        if not self.do_spleeter(input, output):  ### step 1
            logging.error("separate failed")
            return json.dumps(result_dic, ensure_ascii=False)
        result, segmention = self.do_segment(input, output)  ### step 2
        if not result:
            logging.error("segment failed")
            return json.dumps(result_dic, ensure_ascii=False)
        self.process_segmention(result_dic, segmention)
        return json.dumps(result_dic, ensure_ascii=False)
class ApplicationWindow(QtWidgets.QMainWindow):
    """Qt main window: loads WAV/CSV files, splits music into vocals and
    accompaniment with Spleeter, and splits an ECG trace into a clean
    signal plus an arrhythmia component via spectral soft-masking."""

    def __init__(self):
        super(ApplicationWindow, self).__init__()
        pg.setConfigOption('background', 'k')
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.separator = Separator('spleeter:2stems')
        self.Input_X = []
        self.Input_Y = []
        self.music = []
        self.vocals = []
        self.Ecg = []
        self.Sample_Rate = 1000  # sample rate, taken from the file source
        #----------------------------------------------------------------------------------------------------------------
        self.graphic_View_Array = [
            self.ui.Original_GV, self.ui.Vocals_GV, self.ui.Music_GV,
            self.ui.Original_ECG, self.ui.ECG, self.ui.Arrhythmia
        ]
        # Hide axes and disable mouse interaction on every plot view.
        for x in self.graphic_View_Array:
            x.getPlotItem().hideAxis('bottom')
            x.getPlotItem().hideAxis('left')
            x.setMouseEnabled(x=False, y=False)
        self.playArray = [
            self.ui.Play_Music, self.ui.Play_original, self.ui.Play_vocals
        ]
        #----------------------------------------------------------------------------------------------------------------
        self.ui.Import.clicked.connect(self.Import)
        self.ui.Import_ECG.clicked.connect(self.Import_ECG)
        #----------------------------------------------------------------------------------------------------------------'
        self.stopArray = [self.ui.Stop, self.ui.Stop2, self.ui.Stop3]
        for x in self.stopArray:
            x.clicked.connect(self.Stop)
        #----------------------------------------------------------------------------------------------------------------
        #----------------------------------------------------------------------------------------------------------------'
        self.ui.Save_music.clicked.connect(lambda: self.Save_music(self.music))
        self.ui.Save_vocals.clicked.connect(
            lambda: self.Save_vocals(self.vocals))

    #----------------------------------------------------------------------------------------------------------------
    def Import(self):
        # File picker: WAVs are split into music/vocals; CSVs are treated
        # as ECG traces.
        filePaths = QtWidgets.QFileDialog.getOpenFileNames(
            self, 'Multiple File', "~/Desktop", '*')
        for filePath in filePaths:
            for f in filePath:
                if f == "*" or f == None:
                    break
                ext = os.path.splitext(f)[-1].lower()  # Check file extension
                if ext == ".wav":
                    self.Input_Y, frame_rate = self.ReadFromWav(f)
                    self.Input_X = np.arange(0, len(self.Input_Y))
                    # Plot channel 0 of the original signal.
                    self.plot(self.Input_X, self.Input_Y[:, 0],
                              self.ui.Original_GV, 'r')
                    self.ui.Play_original.clicked.connect(
                        lambda: self.Play_Wav(self.Input_Y))
                    self.vocals, self.music = self.split(self.Input_Y)
                    # Trim plot edges (200 head / 2000 tail samples).
                    self.plot(self.Input_X[200:len(self.Input_X) - 2000],
                              self.music[:, 0][200:len(self.Input_X) - 2000],
                              self.ui.Music_GV, 'w')
                    self.plot(self.Input_X[200:len(self.Input_X) - 2000],
                              self.vocals[:, 0][200:len(self.Input_X) - 2000],
                              self.ui.Vocals_GV, 'w')
                    self.ui.Play_Music.clicked.connect(
                        lambda: self.Play_Wav(self.music))
                    self.ui.Play_vocals.clicked.connect(
                        lambda: self.Play_Wav(self.vocals))
                if ext == ".csv":
                    # assumes the CSV has an 'ECG' column — TODO confirm
                    ECG_data = pd.read_csv(f)
                    self.Ecg = [data for data in ECG_data.ECG]
                    self.Ecg = np.array(self.Ecg)
                    x = np.arange(0, len(self.Ecg))
                    self.plot(x, self.Ecg, self.ui.Original_GV, 'r')
                    self.split_ECG(self.Ecg)

    def Import_ECG(self):
        # Same as Import, but only accepts ECG CSVs and plots into the
        # dedicated ECG view.
        filePaths = QtWidgets.QFileDialog.getOpenFileNames(
            self, 'Multiple File', "~/Desktop", '*')
        for filePath in filePaths:
            for f in filePath:
                if f == "*" or f == None:
                    break
                ext = os.path.splitext(f)[-1].lower()  # Check file extension
                if ext == ".csv":
                    ECG_data = pd.read_csv(f)
                    self.Ecg = [data for data in ECG_data.ECG]
                    self.Ecg = np.array(self.Ecg)
                    x = np.arange(0, len(self.Ecg))
                    self.plot(x, self.Ecg, self.ui.Original_ECG, 'r')
                    self.split_ECG(self.Ecg)

    #----------------------------------------------------------------------------------------------------------------
    def ReadFromWav(self, file):
        # Returns (float-scaled signal, sample rate).
        (freq, sig) = wav.read(file)
        # Scale integer PCM down to a small float range.
        sig = (sig.astype(np.float32)) / 100000
        return (sig, freq)

    #------------------------------------------------------------------------------------------------------------------------
    def split_ECG(self, ecg):
        # Split the ECG into a clean component and an arrhythmia component
        # using nearest-neighbour filtering plus soft masks on the STFT
        # magnitude.
        Data, phase = librosa.magphase(librosa.stft(ecg))
        Filter = librosa.decompose.nn_filter(
            Data,
            aggregate=np.median,
            metric='cosine',
            width=int(librosa.time_to_frames(2, sr=self.Sample_Rate)))
        # The filter output must not exceed the input magnitude.
        Filter = np.minimum(Data, Filter)
        margin_i, margin_v = 2, 10
        power = 2
        mask_i = librosa.util.softmask(Filter,
                                       margin_i * (Data - Filter),
                                       power=power)
        mask_v = librosa.util.softmask(Data - Filter,
                                       margin_v * Filter,
                                       power=power)
        pure_arrhythmia = (mask_v * Data) * phase
        pure_ECG = (mask_i * Data) * phase
        arrhythmia = librosa.istft(pure_arrhythmia)
        ECG = librosa.istft(pure_ECG) * 1.5  # gain boost for display
        x_A = np.arange(0, len(arrhythmia))
        x_E = np.arange(0, len(ECG))
        # pure ECG
        self.plot(x_E, ECG, self.ui.ECG, 'w')
        # Pure arrhythmia
        self.plot(x_A, arrhythmia, self.ui.Arrhythmia, 'w')

    def split(self, WavData):
        # Music/vocal separation via Spleeter; returns (vocals, music).
        splitted = self.separator.separate(WavData)
        Music = (splitted.get('accompaniment'))
        Vocals = (splitted.get('vocals'))
        return Vocals, Music

    #------------------------------------------------------------------------------------------------------------------------
    def Play_Wav(self, array):
        # Only play when some audio has been imported.
        if len(self.Input_Y) != 0:
            sd.play(array)
        else:
            pass

    def Stop(self):
        sd.stop()

    #------------------------------------------------------------------------------------------------------------------------
    def plot(self, x, y, gv, color):
        # Clear the view, fit the range to the data, and draw.
        gv.clear()
        gv.plotItem.getViewBox().setRange(xRange=x, yRange=y)
        gv.plot(x, y, pen=color)

    #------------------------------------------------------------------------------------------------------------------------
    def Save_music(self, arr):
        if len(self.music) > 0:
            write("outputs/music.wav", 44100, arr)

    def Save_vocals(self, arr):
        if len(self.vocals) > 0:
            write("outputs/vocals.wav", 44100, arr)
class SpleeterSeparator:
    """Performs source separation using Spleeter API."""

    def __init__(self, config=None):
        """Default constructor.

        :param config: Separator config (dict with keys audio_bitrate,
            audio_format, sample_rate, spleeter_stem), defaults to None
        """
        if config is None:
            self.audio_bitrate = '256k'
            self.audio_format = 'mp3'
            self.sample_rate = 44100
            self.spleeter_stem = 'config/4stems-16kHz.json'
        else:
            self.audio_bitrate = config['audio_bitrate']
            self.audio_format = config['audio_format']
            self.sample_rate = config['sample_rate']
            self.spleeter_stem = config['spleeter_stem']
        # Use librosa backend as it is less memory intensive
        self.separator = Separator(self.spleeter_stem,
                                   stft_backend='librosa',
                                   multiprocess=False)
        self.audio_adapter = get_default_audio_adapter()

    def create_static_mix(self, parts, input_path, output_path):
        """Creates a static mix by performing source separation and adding
        the parts to be kept into a single track.

        :param parts: Mapping of part name ('vocals', 'drums', 'bass',
            'other') to a truthy "keep" flag
        :param input_path: Path to source file
        :param output_path: Path to output file
        :raises e: FFMPEG error
        """
        waveform, _ = self.audio_adapter.load(input_path,
                                              sample_rate=self.sample_rate)
        prediction = self.separator.separate(waveform)
        out = np.zeros_like(prediction['vocals'])
        part_count = 0
        # Add up parts that were requested
        for key in prediction:
            if parts[key]:
                out += prediction[key]
                part_count += 1
        # NOTE(review): part_count is counted but never used — a sibling
        # implementation divides by it to average the kept stems; confirm
        # whether the plain sum here is intentional.
        self.audio_adapter.save(output_path, out,
                                self.separator._sample_rate,
                                self.audio_format, self.audio_bitrate)

    def separate_into_parts(self, input_path, output_path):
        """Creates a dynamic mix: writes each stem as <instrument>.mp3
        under output_path asynchronously, then waits up to 600 s for the
        separation workers to finish.

        :param input_path: path to the source audio file
        :param output_path: directory receiving the per-instrument files
        """
        self.separator.separate_to_file(input_path,
                                        output_path,
                                        self.audio_adapter,
                                        codec='mp3',
                                        bitrate=self.audio_bitrate,
                                        filename_format='{instrument}.{codec}',
                                        synchronous=False)
        self.separator.join(600)
# import zipfile
# import uuid
# BUG FIX: os.path.join is used below but `os` was never imported
# (NameError at runtime).
import os

from flask import Flask, flash, request, redirect, url_for, send_from_directory
from spleeter.separator import Separator
from spleeter.audio.adapter import AudioAdapter

app = Flask(__name__)

if __name__ == '__main__':
    # Demo: split one file into 5 stems and save each as a 128k mp3 under
    # ./output. (The Flask app itself is not started here.)
    separator = Separator('spleeter:5stems')
    audio_loader = AudioAdapter.default()
    sample_rate = 44100
    waveform, _ = audio_loader.load("audio/Never Catch Me.mp3",
                                    sample_rate=sample_rate)
    prediction = separator.separate(waveform, audio_descriptor='')
    print(prediction)
    for instrument, data in prediction.items():
        audio_loader.save(os.path.join("output", f'{instrument}.mp3'),
                          data, sample_rate, 'mp3', '128k')

# ALLOWED_EXTENSIONS = ['mp3', 'wav']
# virtualenv E:\Code\spleeter-back-end\venv -p C:/Users/aidan/AppData/Local/Programs/Python/Python38/python.exe
# virtualenv --python=E:\Code\spleeter-back-end\venv C:/Users/aidan/AppData/Local/Programs/Python/Python38/python.exe
# .\venv\Scripts\activate