def evaluate_sdr_chorale(evaluation_dataset, model, chorale, source, models, regularization=0):
    import museval

    dataset_sr = dataset_sample_rates[evaluation_dataset]
    frame_size = dataset_sr  # 1-second evaluation frames

    # Load the estimated and the reference source for this chorale/part.
    estimate_filename = get_estimate_filename(evaluation_dataset, model, chorale, source,
                                              models.loc[model].multi_source)
    estimate, estimate_sr = soundfile.read(estimate_filename)
    reference, reference_sr = read_chorale(evaluation_dataset, chorale, source)
    assert reference_sr == estimate_sr == dataset_sr
    assert reference.shape == estimate.shape

    model_params = models.loc[model]
    reference_sources = np.array([reference])
    estimated_sources = np.array([estimate])

    # BSSEval SDR per frame (shape: (nsrc, nwin)); ISR/SIR/SAR are not used here.
    sdr, _isr, _sir, _sar = museval.evaluate(reference_sources, estimated_sources,
                                             padding=False, win=frame_size, hop=frame_size)
    # Frame-wise SNR, with and without regularization, for comparison.
    snr = evaluate_snr(reference_sources, estimated_sources, frame_size)
    snr_reg = evaluate_snr(reference_sources, estimated_sources, frame_size, regularization)
    assert sdr.shape == snr.shape == snr_reg.shape

    return pandas.DataFrame({
        'sdr_bsseval': sdr[0],
        'sdr_mine': snr[0],
        'sdr_reg': snr_reg[0],
        'frame': range(sdr.shape[1])
    })
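The frame-wise evaluate_snr helper referenced above is not included in this snippet. The sketch below is a hypothetical stand-in, assuming a per-frame SNR of 10*log10(signal energy / (error energy + regularization)); the name, signature, and placement of the regularization term mirror the call site but are not the author's implementation.

import numpy as np

def evaluate_snr(reference_sources, estimated_sources, frame_size, regularization=0):
    # Hypothetical frame-wise SNR in dB: 10 * log10(||ref||^2 / (||ref - est||^2 + reg)).
    n_sources, n_samples = reference_sources.shape[0], reference_sources.shape[1]
    n_frames = n_samples // frame_size
    snr = np.empty((n_sources, n_frames))
    for f in range(n_frames):
        ref = reference_sources[:, f * frame_size:(f + 1) * frame_size]
        est = estimated_sources[:, f * frame_size:(f + 1) * frame_size]
        axes = tuple(range(1, ref.ndim))
        signal_energy = np.sum(ref ** 2, axis=axes)
        error_energy = np.sum((ref - est) ** 2, axis=axes)
        snr[:, f] = 10 * np.log10(signal_energy / (error_energy + regularization))
    return snr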
def bss_eval_sdr_v4_2s(src_list, pred_src_list, do_remove_abnormal=True):
    import museval

    # Crop the references to the (possibly shorter) prediction length.
    len_cropped = pred_src_list.shape[-1]
    src_list = src_list[:, :len_cropped]

    # BSSEval v4 with non-overlapping 2-second windows at 44.1 kHz.
    sdr, isr, sir, sar = museval.evaluate(src_list, pred_src_list,
                                          win=44100 * 2, hop=44100 * 2,
                                          mode="v4", padding=True)
    if do_remove_abnormal:
        sdr = [remove_abnormal(x) for x in sdr]
        isr = [remove_abnormal(x) for x in isr]
        sir = [remove_abnormal(x) for x in sir]
        sar = [remove_abnormal(x) for x in sar]
    return sdr, isr, sir, sar
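remove_abnormal is likewise not defined in this snippet. A minimal sketch under the assumption that "abnormal" means the non-finite values (NaN for silent frames, +/-inf in degenerate cases) that BSSEval can produce; the name and behaviour are assumptions.

import numpy as np

def remove_abnormal(x):
    # Hypothetical helper: drop NaN/inf frame values so that later statistics
    # (mean, median) are not dominated by undefined frames.
    x = np.asarray(x, dtype=float)
    return x[np.isfinite(x)]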
def eval(reference_dir, estimates_dir, output_dir=None, target='vocals'):
    scores = {}
    reference_glob = os.path.join(reference_dir, '*.wav')
    for reference_file in tqdm(glob(reference_glob), desc='reference_files'):
        track_name = os.path.basename(reference_file).split('.wav')[0]
        estimate_file = os.path.join(estimates_dir, f'{track_name}.wav')
        if os.path.exists(estimate_file):
            reference = []
            estimates = []
            ref_audio, rate = sf.read(reference_file, always_2d=True)
            est_audio, rate = sf.read(estimate_file, always_2d=True)
            reference.append(ref_audio)
            estimates.append(est_audio)
            # Default museval settings: BSSEval v4, 1-second windows.
            SDR, ISR, SIR, SAR = museval.evaluate(
                reference,
                estimates,
            )
            m, s = divmod(ref_audio.shape[0] // rate, 60)
            # Median over evaluation frames, ignoring NaNs from silent frames.
            scores[track_name] = {
                'duration': f'{m}:{s}',
                "SDR(dB)": np.nanmedian(SDR[0].tolist()),
                "Target": target
            }
        else:
            print(f'\nEstimated file of {track_name} not found')

    df = pd.DataFrame.from_dict(scores, orient='index')
    if output_dir is None:
        df.to_csv(f'{target}_results.csv')
    else:
        df.to_csv(f'{output_dir}/{target}_results.csv')
    print(f'Avg of {target}: {df["SDR(dB)"].mean()}')
    return scores
# Frame-wise evaluation over fixed-length segments: a segment is only scored when
# all four signals (reference/estimated vocals and accompaniment) exceed the
# amplitude threshold t, to avoid near-silent frames.
for i in range(0, ln - seglen, seglen):
    if (np.mean(np.abs(yr_vocals[i:i + seglen])) > t) and (np.mean(
            np.abs(yr_accomp[i:i + seglen])) > t) and (np.mean(
            np.abs(ye_vocals[i:i + seglen])) > t) and (np.mean(
            np.abs(ye_accomp[i:i + seglen])) > t):
        references = np.concatenate(
            (np.reshape(yr_vocals[i:i + seglen], (1, seglen)),
             np.reshape(yr_accomp[i:i + seglen], (1, seglen))),
            axis=0)
        estimates = np.concatenate(
            (np.reshape(ye_vocals[i:i + seglen], (1, seglen)),
             np.reshape(ye_accomp[i:i + seglen], (1, seglen))),
            axis=0)
        [SDR, _, SIR, SAR] = museval.evaluate(references, estimates)  # sdr, isr, sir, sar
        vocal_SDR.append(SDR[0])
        vocal_SIR.append(SIR[0])
        vocal_SAR.append(SAR[0])
        print("Current vocal SDR median/mad/mean/std",
              np.median(np.asarray(vocal_SDR)), robust.mad(np.asarray(vocal_SDR)),
              np.mean(np.asarray(vocal_SDR)), np.std(np.asarray(vocal_SDR)))
        print("Current vocal SIR median/mad/mean/std",
              np.median(np.asarray(vocal_SIR)), robust.mad(np.asarray(vocal_SIR)),
              np.mean(np.asarray(vocal_SIR)), np.std(np.asarray(vocal_SIR)))
        print("Current vocal SAR median/mad/mean/std",
              np.median(np.asarray(vocal_SAR)), robust.mad(np.asarray(vocal_SAR)),
              np.mean(np.asarray(vocal_SAR)), np.std(np.asarray(vocal_SAR)))
def evaluate(model,
             musdb_path,
             eval_folder,
             workers=2,
             device="cpu",
             rank=0,
             save=False,
             shifts=0,
             split=False,
             overlap=0.25,
             is_wav=False,
             world_size=1):
    """
    Evaluate model using museval. Run the model on a single GPU, the bottleneck
    being the call to museval.
    """
    output_dir = eval_folder / "results"
    output_dir.mkdir(exist_ok=True, parents=True)
    json_folder = eval_folder / "results/test"
    json_folder.mkdir(exist_ok=True, parents=True)

    # we load tracks from the original musdb set
    test_set = musdb.DB(musdb_path, subsets=["test"], is_wav=is_wav)
    src_rate = 44100  # hardcoded for now...

    for p in model.parameters():
        p.requires_grad = False
        p.grad = None

    pendings = []
    with futures.ProcessPoolExecutor(workers or 1) as pool:
        for index in tqdm.tqdm(range(rank, len(test_set), world_size), file=sys.stdout):
            track = test_set.tracks[index]
            out = json_folder / f"{track.name}.json.gz"
            if out.exists():
                continue

            mix = th.from_numpy(track.audio).t().float()
            ref = mix.mean(dim=0)  # mono mixture
            mix = (mix - ref.mean()) / ref.std()
            mix = convert_audio(mix, src_rate, model.samplerate, model.audio_channels)
            estimates = apply_model(model, mix.to(device),
                                    shifts=shifts, split=split, overlap=overlap)
            estimates = estimates * ref.std() + ref.mean()
            estimates = estimates.transpose(1, 2)

            references = th.stack([
                th.from_numpy(track.targets[name].audio).t()
                for name in model.sources
            ])
            references = convert_audio(references, src_rate,
                                       model.samplerate, model.audio_channels)
            references = references.transpose(1, 2).numpy()
            estimates = estimates.cpu().numpy()

            win = int(1. * model.samplerate)
            hop = int(1. * model.samplerate)
            if save:
                folder = eval_folder / "wav/test" / track.name
                folder.mkdir(exist_ok=True, parents=True)
                for name, estimate in zip(model.sources, estimates):
                    wavfile.write(str(folder / (name + ".wav")), 44100, estimate)

            if workers:
                pendings.append((track.name,
                                 pool.submit(museval.evaluate, references, estimates,
                                             win=win, hop=hop)))
            else:
                pendings.append((track.name,
                                 museval.evaluate(references, estimates, win=win, hop=hop)))
            del references, mix, estimates, track

        for track_name, pending in tqdm.tqdm(pendings, file=sys.stdout):
            print(track_name)
            if workers:
                pending = pending.result()
                print('pending')
            sdr, isr, sir, sar = pending
            print('track_store')
            track_store = museval.TrackStore(win=44100, hop=44100, track_name=track_name)
            for idx, target in enumerate(model.sources):
                print(target)
                values = {
                    "SDR": sdr[idx].tolist(),
                    "SIR": sir[idx].tolist(),
                    "ISR": isr[idx].tolist(),
                    "SAR": sar[idx].tolist()
                }
                track_store.add_target(target_name=target, values=values)
            json_path = json_folder / f"{track_name}.json.gz"
            gzip.open(json_path, "w").write(track_store.json.encode('utf-8'))

    if world_size > 1:
        distributed.barrier()
def score(self, loader, framewise=False, save_dir=None):
    """
    Score the model.

    Args
    ----
    loader : PyTorch DataLoader.
    """
    self.model.eval()

    class_sdr = defaultdict(list)
    class_sir = defaultdict(list)
    class_sar = defaultdict(list)

    # only perform framewise evaluation at testing time
    if self.n_fft == 1025:
        rate = 22050
        hop = 512
        win = 2048
    elif self.n_fft == 2049:
        rate = 44100
        hop = 1024
        win = 4096
    if not framewise:
        rate = np.inf

    if save_dir:
        class_map = {0: 'bass', 1: 'drums', 2: 'other', 3: 'vocals'}
        mus = musdb.DB(root_dir="data/musdb18")

    # list of batches
    preds, ys, cs, ts, _, nm = self.predict(loader)

    # for each batch
    for b_preds, b_ys, b_cs, b_ts, b_nm in tqdm(list(zip(preds, ys, cs, ts, nm))):
        # for each sample
        for pred, y, c, t, n in zip(b_preds, b_ys, b_cs, b_ts, b_nm):
            pred_recons = []
            y_recons = []
            pred_cs = []
            pred_recons_dict = defaultdict(list)
            y_recons_dict = defaultdict(list)

            # for each class
            for i, (c_pred, c_y, c_c) in enumerate(zip(pred, y, c)):
                # if the class exists in the source signal
                if c_c == 1 and np.abs(c_y).sum() > 0:
                    c_pred = c_pred[..., :t]
                    c_y = c_y[..., :t]

                    # predictions can be over multiple channels
                    pred_recon = []
                    y_recon = []
                    for c_pred_chan, c_y_chan in zip(c_pred, c_y):
                        pred_recon += [istft(c_pred_chan, hop_length=hop, win_length=win)]
                        y_recon += [istft(c_y_chan, hop_length=hop, win_length=win)]
                    pred_recon = np.stack(pred_recon, axis=-1)
                    y_recon = np.stack(y_recon, axis=-1)

                    # accumulate list of reconstructions for stacking
                    pred_recons += [pred_recon]
                    y_recons += [y_recon]
                    pred_cs += [i]

                    if save_dir:
                        pred_recons_dict[class_map[i]] = pred_recon
                        y_recons_dict[class_map[i]] = y_recon

            # possible to sample from targets that are all zeros
            if pred_recons:
                pred_recons = np.stack(pred_recons)

                # possible to predict all zeros...
                # TODO: Figure out how to handle this case properly
                if np.abs(pred_recons.sum()) > 0:
                    y_recons = np.stack(y_recons)

                    # nclasses x time
                    if self.eval_version == 'v3':
                        sdr, sir, sar, _ = bss_eval_sources(
                            y_recons, pred_recons, compute_permutation=False)
                    elif self.eval_version == 'v4':
                        if save_dir:
                            name = loader.dataset.metadata.at[
                                int(n.cpu().numpy()), 'urlId']
                            track = mus.load_mus_tracks(tracknames=[name])[0]
                            sdr, isr, sir, sar = evaluate(
                                y_recons, pred_recons,
                                win=rate, hop=rate, padding=True)
                            data = self._to_evalstore(
                                sdr, sir, isr, sar, rate, rate, class_map)
                            self._save_framewise(data, save_dir, track)
                            continue
                        else:
                            sdr, isr, sir, sar = evaluate(
                                y_recons, pred_recons,
                                win=rate, hop=rate, padding=True)
                            cmb_sdr = np.concatenate([x for x in sdr])
                            sdr = np.nanmean(sdr, axis=1)
                            sir = np.nanmean(sir, axis=1)
                            sar = np.nanmean(sar, axis=1)

                    for m1, m2, m3, cl in zip(sdr, sir, sar, pred_cs):
                        class_sdr[cl] += [m1]
                        class_sir[cl] += [m2]
                        class_sar[cl] += [m3]

    class_sdr_out = defaultdict(list)
    class_sir_out = defaultdict(list)
    class_sar_out = defaultdict(list)

    class_sdr_out['median'] = {k: np.round(np.median(v), 2)
                               for k, v in class_sdr.items()}
    class_sdr_out['mean'] = {k: np.round(np.mean(v), 2)
                             for k, v in class_sdr.items()}
    class_sir_out['median'] = {k: np.round(np.median(v), 2)
                               for k, v in class_sir.items()}
    class_sir_out['mean'] = {k: np.round(np.mean(v), 2)
                             for k, v in class_sir.items()}
    class_sar_out['median'] = {k: np.round(np.median(v), 2)
                               for k, v in class_sar.items()}
    class_sar_out['mean'] = {k: np.round(np.mean(v), 2)
                             for k, v in class_sar.items()}

    return class_sdr_out, class_sir_out, class_sar_out, cmb_sdr
def evaluate_test_dataset(test_dataset, models_to_evaluate, nmf):
    models = read_models(os.environ['RESULTS_SHEET_ID'])
    parts_dir = f'{dataset_base}/{test_dataset}/audio_mono'
    evaluation_dir = 'nmf_evaluation' if nmf else 'test'
    dataset_dir = f'{evaluation_dir}/{test_dataset}'
    if nmf:
        dataset_sr = 22050
    else:
        dataset_sr = dataset_sample_rates[test_dataset]
    frame_size_seconds = 1
    frame_size_samples = frame_size_seconds * dataset_sr
    print(
        f'Evaluating on dataset {test_dataset}, evaluation frame size: {frame_size_samples} samples.'
    )

    chorale_reference_sources = {}
    for chorale in test_chorales:
        chorale_reference_sources[chorale] = read_sources(
            f'{parts_dir}/chorale_{chorale}_%s.wav', all_source_names, dataset_sr)

    Metrics = NamedTuple('Metrics', [
        ('sdr', np.ndarray),
        ('isr', np.ndarray),
        ('sir', np.ndarray),
        ('sar', np.ndarray),
    ])

    for model in models_to_evaluate:
        full_model_name, model_name, checkpoint, extracted_sources, multi_source = get_model_params(
            model, models, nmf, dataset_dir)
        print(f'Model {model}:')
        model_test_dir = f'{dataset_dir}/{full_model_name}'
        chorale_metrics: Dict[str, Metrics] = {}
        print(f'Evaluating sources: {", ".join(extracted_sources)}')
        model_source_indices = [
            all_source_names.index(s) for s in extracted_sources
        ]

        for chorale in test_chorales:
            print(f'\tChorale {chorale}')
            reference_sources = chorale_reference_sources[chorale]
            estimates_template = f'{model_test_dir}/{chorale}/{get_estimates_template(nmf, multi_source, chorale)}'
            # Initialize estimates to random because `museval.evaluate` raises an exception
            # if any estimate is all-zeros. However, we do want to supply all _reference_
            # sources in order to correctly calculate SIR (interference from other sources),
            # and the shape of `estimated_sources` and `reference_sources` must be identical.
            estimated_sources = np.random.uniform(-1, 1, reference_sources.shape)
            estimated_sources[model_source_indices] = read_sources(
                estimates_template, extracted_sources, dataset_sr)
            # Return shape of each metric: (nsrc, nwin)
            sdr, isr, sir, sar = museval.evaluate(reference_sources,
                                                  estimated_sources,
                                                  padding=False,
                                                  win=frame_size_samples,
                                                  hop=frame_size_samples)
            chorale_metrics[chorale] = Metrics(sdr, isr, sir, sar)

        chorale_source_metrics_dfs = []
        for chorale, metrics in chorale_metrics.items():
            for source, source_index in zip(extracted_sources, model_source_indices):
                columns = {
                    'model': model_name,
                    'checkpoint': checkpoint,
                    'chorale': chorale,
                    'source': source
                }
                for metric, values in metrics._asdict().items():
                    columns[metric] = values[source_index]
                df = pandas.DataFrame(columns)
                df.insert(3, 'frame', df.index)
                chorale_source_metrics_dfs.append(df)

        model_metrics = pandas.concat(chorale_source_metrics_dfs, ignore_index=True)
        output_path = f'{model_test_dir}/evaluation.csv'
        model_metrics.to_csv(output_path)
def evaluation(dnn_model,
               device,
               test_tracks,
               writer,
               full_evaluation=True,
               trained_on="vocals"):
    """
    Performs the evaluation on the provided tracks and logs the results to the
    TensorBoard SummaryWriter events.

    Parameters
    ----------
    dnn_model : Generalised_Recurrent_Model
        Model to use for prediction
    test_tracks : list[Track]
        List of tracks to be evaluated.
    device : torch.device
        Device to use
    writer : SummaryWriter
        Summary writer for writing TensorBoard summaries
    full_evaluation : bool
        True if full evaluation is to be performed, False if only one track
        needs to be evaluated
    trained_on : str
        Target the model was trained on, "vocals" or "accompaniment"
    """
    # setting the evaluation mode
    dnn_model.eval()

    # gradients are not needed for evaluation, so turn them off
    with torch.no_grad():
        sdr_means = []
        sir_means = []
        isr_means = []
        sar_means = []

        # iterate over the test tracks
        for track_number, track in enumerate(test_tracks):
            # getting predicted estimates of accompaniment and vocals
            acc_estimate, vocals_estimate = predict(dnn_model,
                                                    device,
                                                    data=track.mixture.data,
                                                    sr=track.mixture.sr,
                                                    trained_on=trained_on)

            # adding it to a list for evaluating metrics
            estimates_list = np.array([vocals_estimate, acc_estimate])
            reference_list = np.array([
                np.copy(track.sources["vocals"].data),
                np.copy(track.sources["accompaniment"].data)
            ])

            # evaluating the metrics
            SDR, ISR, SIR, SAR = museval.evaluate(reference_list, estimates_list)

            # mean of each metric over evaluation frames
            SDR_mean = np.mean(SDR, axis=1)
            SIR_mean = np.mean(SIR, axis=1)
            ISR_mean = np.mean(ISR, axis=1)
            SAR_mean = np.mean(SAR, axis=1)
            # print(track_number, ": ", SDR_mean.shape, ", ", SDR_mean.shape)

            # logging metrics for vocals
            writer.add_scalar('vocals/SDR_mean', SDR_mean[0], track_number)
            writer.add_scalar('vocals/SIR_mean', SIR_mean[0], track_number)
            writer.add_scalar('vocals/SAR_mean', SAR_mean[0], track_number)
            writer.add_scalar('vocals/ISR_mean', ISR_mean[0], track_number)

            # logging metrics for accompaniment
            writer.add_scalar('accompaniment/SDR_mean', SDR_mean[1], track_number)
            writer.add_scalar('accompaniment/SIR_mean', SIR_mean[1], track_number)
            writer.add_scalar('accompaniment/SAR_mean', SAR_mean[1], track_number)
            writer.add_scalar('accompaniment/ISR_mean', ISR_mean[1], track_number)

            # appending the per-track means
            sdr_means.append(SDR_mean)
            sir_means.append(SIR_mean)
            isr_means.append(ISR_mean)
            sar_means.append(SAR_mean)

            # saving the first sample
            if track_number == 0:
                mono_vocals_estimate_normalized = lib.util.normalize(
                    sp.to_mono(vocals_estimate))
                writer.add_audio(tag="vocals",
                                 snd_tensor=torch.from_numpy(
                                     mono_vocals_estimate_normalized),
                                 global_step=1,
                                 sample_rate=track.mixture.sr)
                mono_acc_estimate_normalized = lib.util.normalize(
                    sp.to_mono(acc_estimate))
                writer.add_audio(
                    tag="accompaniment",
                    snd_tensor=torch.from_numpy(mono_acc_estimate_normalized),
                    global_step=1,
                    sample_rate=track.mixture.sr)
                print("FIRST TRACK EVALUATION COMPLETE")

                if not full_evaluation:
                    print("ENDING FULL EVALUATION!!")
                    break
        # END OF FOR of test samples

        # calculating mean over all tracks and saving it
        sdr_total_mean = np.mean(np.array(sdr_means), axis=0)
        sir_total_mean = np.mean(np.array(sir_means), axis=0)
        isr_total_mean = np.mean(np.array(isr_means), axis=0)
        sar_total_mean = np.mean(np.array(sar_means), axis=0)

        # min
        sdr_total_min = np.min(np.array(sdr_means), axis=0)
        sir_total_min = np.min(np.array(sir_means), axis=0)
        isr_total_min = np.min(np.array(isr_means), axis=0)
        sar_total_min = np.min(np.array(sar_means), axis=0)

        # max
        sdr_total_max = np.max(np.array(sdr_means), axis=0)
        sir_total_max = np.max(np.array(sir_means), axis=0)
        isr_total_max = np.max(np.array(isr_means), axis=0)
        sar_total_max = np.max(np.array(sar_means), axis=0)

        writer.add_text(
            'Accompaniment',
            "SDR: " + str(sdr_total_min[1]) + " +- " + str(sdr_total_max[1]) +
            ", mean: " + str(sdr_total_mean[1]) +
            " \nSIR: " + str(sir_total_min[1]) + " +- " + str(sir_total_max[1]) +
            ", mean: " + str(sir_total_mean[1]) +
            " \nISR: " + str(isr_total_min[1]) + " +- " + str(isr_total_max[1]) +
            ", mean: " + str(isr_total_mean[1]) +
            " \nSAR: " + str(sar_total_min[1]) + " +- " + str(sar_total_max[1]) +
            ", mean: " + str(sar_total_mean[1]), 0)

        writer.add_text(
            'Vocals',
            "SDR: " + str(sdr_total_min[0]) + " +- " + str(sdr_total_max[0]) +
            ", mean: " + str(sdr_total_mean[0]) +
            " \nSIR: " + str(sir_total_min[0]) + " +- " + str(sir_total_max[0]) +
            ", mean: " + str(sir_total_mean[0]) +
            " \nISR: " + str(isr_total_min[0]) + " +- " + str(isr_total_max[0]) +
            ", mean: " + str(isr_total_mean[0]) +
            " \nSAR: " + str(sar_total_min[0]) + " +- " + str(sar_total_max[0]) +
            ", mean: " + str(sar_total_mean[0]), 0)
def evaluate_mia(ref, est, track_name, source_names, eval_silence, conf):
    references = ref.copy()
    estimates = est.copy()

    # If evaluating silence, skip examples with a silent source
    skip = False
    silence_frames = pd.DataFrame({
        'target': [],
        'PES': [],
        'EPS': [],
        'track': []
    })
    if eval_silence:
        PES, EPS, _, __ = eval_silent_frames(
            true_source=references,
            predicted_source=estimates,
            window_size=int(conf['win'] * conf['sample_rate']),
            hop_size=int(conf['hop'] * conf['sample_rate']))
        for i, target in enumerate(source_names):
            reference_energy = np.sum(references[i, :, :]**2)
            # estimate_energy = np.sum(estimates[i, :, :]**2)
            if reference_energy == 0:  # or estimate_energy == 0:
                skip = True
                sdr = isr = sir = sar = (np.ones((1, )) * (-np.inf),
                                         np.ones((1, )) * (-np.inf))
                print("skip {}, {} source is all zero".format(track_name, target))

        print("mean over evaluation frames, mean over channels")
        for i, target in enumerate(source_names):
            silence_frames = silence_frames.append(
                {
                    'target': target,
                    'PES': PES[i],
                    'EPS': EPS[i],
                    'track': track_name
                },
                ignore_index=True)
            print(
                target + ' ==>',
                silence_frames.loc[silence_frames['target'] == target].mean(
                    axis=0, skipna=True))

    # Compute metrics for a given song using window and hop size
    if not skip:
        sdr, isr, sir, sar = museval.evaluate(
            references,
            estimates,
            win=int(conf['win'] * conf['sample_rate']),
            hop=int(conf['hop'] * conf['sample_rate']))

    # Save results over the track
    track_store = museval.TrackStore(win=conf['win'],
                                     hop=conf['hop'],
                                     track_name=track_name)
    for index, target in enumerate(source_names):
        values = {
            "SDR": sdr[index].tolist(),
            "SIR": sir[index].tolist(),
            "ISR": isr[index].tolist(),
            "SAR": sar[index].tolist()
        }
        track_store.add_target(target_name=target, values=values)
    track_store.validate()
    return track_store, silence_frames
def evaluate(references, estimates, output_dir, track_name, sample_rate,
             win=1.0, hop=1.0, mode='v4'):
    """
    Compute the BSS_eval metrics as well as PES and EPS. It follows the design
    concept of museval.eval_mus_track.

    :param references: dict of reference sources {target_name: signal}, signal
        has shape (nb_timesteps, nb_channels)
    :param estimates: dict of user estimates {target_name: signal}, signal has
        shape (nb_timesteps, nb_channels)
    :param output_dir: path to output directory used to save evaluation results
    :param track_name: name that is assigned to the TrackStore object for the
        evaluated track
    :param win: evaluation window length in seconds, default 1
    :param hop: evaluation window hop length in seconds, default 1
    :param sample_rate: sample rate of test tracks (should be the same rate the
        model has been trained on)
    :param mode: BSSEval version, defaults to `v4`
    :return:
        bss_eval_data: museval.TrackStore object containing bss_eval evaluation scores
        silent_frames_data: Pandas data frame containing EPS and PES scores
    """
    eval_targets = list(estimates.keys())

    estimates_list = []
    references_list = []
    for target in eval_targets:
        estimates_list.append(estimates[target])
        references_list.append(references[target])

    # evaluate bss_eval and EPS, PES metrics
    # save in TrackStore object
    bss_eval_data = museval.TrackStore(win=win, hop=hop, track_name=track_name)

    # skip examples with a silent source because BSSeval metrics are not defined in this case
    skip = False
    for target in eval_targets:
        reference_energy = np.sum(references[target]**2)
        estimate_energy = np.sum(estimates[target]**2)
        if reference_energy == 0 or estimate_energy == 0:
            skip = True
            SDR = ISR = SIR = SAR = (np.ones((1, )) * (-np.inf),
                                     np.ones((1, )) * (-np.inf))
            print("skip {}, {} source is all zero".format(track_name, target))

    if not skip:
        SDR, ISR, SIR, SAR = museval.evaluate(references_list,
                                              estimates_list,
                                              win=int(win * sample_rate),
                                              hop=int(hop * sample_rate),
                                              mode=mode,
                                              padding=True)

    # add evaluation of EPS and PES
    PES, EPS, _, __ = silent_frames_evaluation.eval_silent_frames(
        true_source=np.array(references_list),
        predicted_source=np.array(estimates_list),
        window_size=int(win * sample_rate),
        hop_size=int(hop * sample_rate))

    # iterate over all targets
    for i, target in enumerate(eval_targets):
        values = {
            "SDR": SDR[i].tolist(),
            "SIR": SIR[i].tolist(),
            "ISR": ISR[i].tolist(),
            "SAR": SAR[i].tolist(),
        }
        bss_eval_data.add_target(target_name=target, values=values)

    silent_frames_data = pd.DataFrame({
        'target': [],
        'PES': [],
        'EPS': [],
        'track': []
    })
    for i, target in enumerate(eval_targets):
        silent_frames_data = silent_frames_data.append(
            {
                'target': target,
                'PES': PES[i],
                'EPS': EPS[i],
                'track': track_name
            },
            ignore_index=True)

    # save evaluation results if output directory is defined
    if output_dir:
        # validate against the schema
        bss_eval_data.validate()

        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(
                    os.path.join(output_dir, track_name.replace('/', '_')) +
                    '.json', 'w+') as f:
                f.write(bss_eval_data.json)
        except IOError:
            pass

    return bss_eval_data, silent_frames_data
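For reference, a minimal self-contained example of the museval calls used throughout these snippets, run on synthetic stereo signals; the shapes, sample rate, and target names here are purely illustrative.

import numpy as np
import museval

rate = 44100
n_sources, n_samples, n_channels = 2, 10 * rate, 2

# References and noisy estimates with shape (nsrc, nsampl, nchan).
references = np.random.randn(n_sources, n_samples, n_channels)
estimates = references + 0.1 * np.random.randn(n_sources, n_samples, n_channels)

# BSSEval v4 over non-overlapping 1-second windows; each metric has shape (nsrc, nwin).
sdr, isr, sir, sar = museval.evaluate(references, estimates,
                                      win=rate, hop=rate, mode='v4', padding=True)
print(sdr.shape)  # (2, 10)

# Collect the frame-wise values per target in a TrackStore, as the functions above do.
store = museval.TrackStore(win=1.0, hop=1.0, track_name='synthetic_example')
for idx, name in enumerate(['vocals', 'accompaniment']):
    store.add_target(target_name=name,
                     values={"SDR": sdr[idx].tolist(), "SIR": sir[idx].tolist(),
                             "ISR": isr[idx].tolist(), "SAR": sar[idx].tolist()})
store.validate()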