Example #1
def evaluate_sdr_chorale(evaluation_dataset,
                         model,
                         chorale,
                         source,
                         models,
                         regularization=0):
    import museval
    dataset_sr = dataset_sample_rates[evaluation_dataset]
    frame_size = dataset_sr  # 1-second evaluation frames
    estimate_filename = get_estimate_filename(evaluation_dataset, model,
                                              chorale, source,
                                              models.loc[model].multi_source)
    estimate, estimate_sr = soundfile.read(estimate_filename)
    reference, reference_sr = read_chorale(evaluation_dataset, chorale, source)
    assert reference_sr == estimate_sr == dataset_sr
    assert reference.shape == estimate.shape
    model_params = models.loc[model]
    # museval expects arrays shaped (nsrc, nsamples[, nchan]); wrap the single source.
    reference_sources = np.array([reference])
    estimated_sources = np.array([estimate])
    # Per-source, per-window metrics; only SDR is used below.
    sdr, _isr, _sir, _sar = museval.evaluate(reference_sources,
                                             estimated_sources,
                                             padding=False,
                                             win=frame_size,
                                             hop=frame_size)
    snr = evaluate_snr(reference_sources, estimated_sources, frame_size)
    snr_reg = evaluate_snr(reference_sources, estimated_sources, frame_size,
                           regularization)
    assert sdr.shape == snr.shape == snr_reg.shape
    return pandas.DataFrame({
        'sdr_bsseval': sdr[0],
        'sdr_mine': snr[0],
        'sdr_reg': snr_reg[0],
        'frame': range(sdr.shape[1])
    })
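For context, museval.evaluate returns one row per source and one column per evaluation window, which is what the DataFrame construction above relies on. A minimal sketch with synthetic mono signals (the sample rate and signals are made up for illustration):

import numpy as np
import museval

sr = 22050                                  # hypothetical sample rate
ref = np.random.randn(5 * sr)               # 5 s mono reference
est = ref + 0.05 * np.random.randn(5 * sr)  # noisy estimate

sdr, isr, sir, sar = museval.evaluate(np.array([ref]), np.array([est]),
                                      win=sr, hop=sr, padding=False)
print(sdr.shape)  # (1, 5): one source, one value per 1-second frame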
Example #2
def bss_eval_sdr_v4_2s(src_list, pred_src_list, do_remove_abnormal=True):
    import museval
    # Crop the references to the prediction length; inputs are (nsrc, nsamples).
    len_cropped = pred_src_list.shape[-1]
    src_list = src_list[:, :len_cropped]
    # BSSEval v4 with 2-second, non-overlapping frames at 44.1 kHz.
    sdr, isr, sir, sar = museval.evaluate(src_list,
                                          pred_src_list,
                                          win=44100 * 2,
                                          hop=44100 * 2,
                                          mode="v4",
                                          padding=True)
    if do_remove_abnormal:
        sdr = [remove_abnormal(x) for x in sdr]
        isr = [remove_abnormal(x) for x in isr]
        sir = [remove_abnormal(x) for x in sir]
        sar = [remove_abnormal(x) for x in sar]
    return sdr, isr, sir, sar
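A hypothetical call using synthetic mono stems shaped (nsrc, nsamples), as the function above expects; do_remove_abnormal is disabled so the project's remove_abnormal helper is not needed:

import numpy as np

rng = np.random.default_rng(0)
src = rng.standard_normal((2, 44100 * 6))           # 2 sources, 6 s at 44.1 kHz
pred = src + 0.1 * rng.standard_normal(src.shape)   # slightly corrupted estimates

sdr, isr, sir, sar = bss_eval_sdr_v4_2s(src, pred, do_remove_abnormal=False)
print(np.nanmedian(sdr, axis=1))  # per-source median SDR over 2-second frames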
Example #3
import os
from glob import glob

import museval
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm


def eval(reference_dir, estimates_dir, output_dir=None, target='vocals'):

    scores = {}
    reference_glob = os.path.join(reference_dir, '*.wav')

    for reference_file in tqdm(glob(reference_glob), desc='reference_files'):

        track_name = os.path.basename(reference_file).split('.wav')[0]
        estimate_file = os.path.join(estimates_dir, f'{track_name}.wav')

        if os.path.exists(estimate_file):

            reference = []
            estimates = []

            ref_audio, rate = sf.read(reference_file, always_2d=True)
            est_audio, rate = sf.read(estimate_file, always_2d=True)

            reference.append(ref_audio)
            estimates.append(est_audio)

            # museval.evaluate defaults: 1-second windows (win=hop=44100 samples).
            SDR, ISR, SIR, SAR = museval.evaluate(
                reference,
                estimates,
            )
            # Track duration in minutes:seconds (zero-padded seconds).
            m, s = divmod(ref_audio.shape[0] // rate, 60)
            scores[track_name] = {
                'duration': f'{m}:{s:02d}',
                "SDR(dB)": np.nanmedian(SDR[0].tolist()),
                "Target": target
            }
        else:
            print(f'\nEstimated file of {track_name} not found')

    df = pd.DataFrame.from_dict(scores, orient='index')
    if output_dir is None:
        df.to_csv(f'{target}_results.csv')
    else:
        df.to_csv(f'{output_dir}/{target}_results.csv')

    print(f'Avg of {target}: {df["SDR(dB)"].mean()}')
    return scores
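A hypothetical invocation, assuming two directories that contain WAV files with matching names (the paths are placeholders):

scores = eval(reference_dir='data/references/vocals',
              estimates_dir='data/estimates/vocals',
              output_dir='results',
              target='vocals')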
Example #4
        for i in range(0, ln - seglen, seglen):
            # Skip segments where any reference or estimated stem is (near-)silent.
            if (np.mean(np.abs(yr_vocals[i:i + seglen])) > t
                    and np.mean(np.abs(yr_accomp[i:i + seglen])) > t
                    and np.mean(np.abs(ye_vocals[i:i + seglen])) > t
                    and np.mean(np.abs(ye_accomp[i:i + seglen])) > t):
                references = np.concatenate(
                    (np.reshape(yr_vocals[i:i + seglen], (1, seglen)),
                     np.reshape(yr_accomp[i:i + seglen], (1, seglen))),
                    axis=0)
                estimates = np.concatenate(
                    (np.reshape(ye_vocals[i:i + seglen], (1, seglen)),
                     np.reshape(ye_accomp[i:i + seglen], (1, seglen))),
                    axis=0)
                # museval.evaluate returns (sdr, isr, sir, sar); ISR is unused here.
                SDR, _, SIR, SAR = museval.evaluate(references, estimates)
                vocal_SDR.append(SDR[0])
                vocal_SIR.append(SIR[0])
                vocal_SAR.append(SAR[0])

        print("Current vocal SDR median/mad/mean/std",
              np.median(np.asarray(vocal_SDR)),
              robust.mad(np.asarray(vocal_SDR)),
              np.mean(np.asarray(vocal_SDR)), np.std(np.asarray(vocal_SDR)))
        print("Current vocal SIR median/mad/mean/std",
              np.median(np.asarray(vocal_SIR)),
              robust.mad(np.asarray(vocal_SIR)),
              np.mean(np.asarray(vocal_SIR)), np.std(np.asarray(vocal_SIR)))
        print("Current vocal SAR median/mad/mean/std",
              np.median(np.asarray(vocal_SAR)),
              robust.mad(np.asarray(vocal_SAR)),
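This fragment relies on names defined earlier in its script. A minimal sketch of the assumed setup (values are illustrative, and `robust` is presumed to be statsmodels' robust module, which provides `mad`):

import numpy as np
from statsmodels import robust  # robust.mad: median absolute deviation

# yr_vocals/yr_accomp and ye_vocals/ye_accomp are 1-D reference/estimate stems
# loaded earlier in the script; ln = len(yr_vocals).
seglen = 44100      # 1-second segments (assumed)
t = 1e-3            # silence threshold on mean absolute amplitude (assumed)
vocal_SDR, vocal_SIR, vocal_SAR = [], [], []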
Example #5
def evaluate(model,
             musdb_path,
             eval_folder,
             workers=2,
             device="cpu",
             rank=0,
             save=False,
             shifts=0,
             split=False,
             overlap=0.25,
             is_wav=False,
             world_size=1):
    """
    Evaluate model using museval. Run the model
    on a single GPU, the bottleneck being the call to museval.
    """
    output_dir = eval_folder / "results"
    output_dir.mkdir(exist_ok=True, parents=True)
    json_folder = eval_folder / "results/test"
    json_folder.mkdir(exist_ok=True, parents=True)

    # we load tracks from the original musdb set
    test_set = musdb.DB(musdb_path, subsets=["test"], is_wav=is_wav)
    src_rate = 44100  # hardcoded for now...

    for p in model.parameters():
        p.requires_grad = False
        p.grad = None

    pendings = []
    with futures.ProcessPoolExecutor(workers or 1) as pool:
        for index in tqdm.tqdm(range(rank, len(test_set), world_size),
                               file=sys.stdout):
            track = test_set.tracks[index]

            out = json_folder / f"{track.name}.json.gz"
            if out.exists():
                continue

            mix = th.from_numpy(track.audio).t().float()
            ref = mix.mean(dim=0)  # mono mixture
            mix = (mix - ref.mean()) / ref.std()
            mix = convert_audio(mix, src_rate, model.samplerate,
                                model.audio_channels)
            estimates = apply_model(model,
                                    mix.to(device),
                                    shifts=shifts,
                                    split=split,
                                    overlap=overlap)
            estimates = estimates * ref.std() + ref.mean()

            estimates = estimates.transpose(1, 2)
            references = th.stack([
                th.from_numpy(track.targets[name].audio).t()
                for name in model.sources
            ])
            references = convert_audio(references, src_rate, model.samplerate,
                                       model.audio_channels)
            references = references.transpose(1, 2).numpy()
            estimates = estimates.cpu().numpy()
            win = int(1. * model.samplerate)
            hop = int(1. * model.samplerate)
            if save:
                folder = eval_folder / "wav/test" / track.name
                folder.mkdir(exist_ok=True, parents=True)
                for name, estimate in zip(model.sources, estimates):
                    wavfile.write(str(folder / (name + ".wav")), 44100,
                                  estimate)

            if workers:
                pendings.append((track.name,
                                 pool.submit(museval.evaluate,
                                             references,
                                             estimates,
                                             win=win,
                                             hop=hop)))
            else:
                pendings.append((track.name,
                                 museval.evaluate(references,
                                                  estimates,
                                                  win=win,
                                                  hop=hop)))
            del references, mix, estimates, track

        for track_name, pending in tqdm.tqdm(pendings, file=sys.stdout):
            print(track_name)
            if workers:
                pending = pending.result()
            print('pending')
            sdr, isr, sir, sar = pending
            print('track_store')
            track_store = museval.TrackStore(win=44100,
                                             hop=44100,
                                             track_name=track_name)
            for idx, target in enumerate(model.sources):
                print(target)
                values = {
                    "SDR": sdr[idx].tolist(),
                    "SIR": sir[idx].tolist(),
                    "ISR": isr[idx].tolist(),
                    "SAR": sar[idx].tolist()
                }

                track_store.add_target(target_name=target, values=values)
                json_path = json_folder / f"{track_name}.json.gz"
                with gzip.open(json_path, "w") as f:
                    f.write(track_store.json.encode('utf-8'))
    if world_size > 1:
        distributed.barrier()
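For reference, museval.evaluate expects arrays laid out as (nsrc, nsamples, nchannels), which is why the (nsrc, nchannels, nsamples) tensors above are transposed before the call. A small synthetic sketch of that layout:

import numpy as np
import museval

refs = np.random.randn(4, 3 * 44100, 2)            # 4 sources, 3 s, stereo
ests = refs + 0.1 * np.random.randn(*refs.shape)

sdr, isr, sir, sar = museval.evaluate(refs, ests, win=44100, hop=44100)
print(sdr.shape)  # (4, 3): one row per source, one column per 1-second window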
Example #6
    def score(self, loader, framewise=False, save_dir=None):
        """
        Score the model.

        Args
        ----
          loader : PyTorch DataLoader.

        """
        self.model.eval()
        class_sdr = defaultdict(list)
        class_sir = defaultdict(list)
        class_sar = defaultdict(list)

        # only perform framewise evaluation at testing time
        if self.n_fft == 1025:
            rate = 22050
            hop = 512
            win = 2048
        elif self.n_fft == 2049:
            rate = 44100
            hop = 1024
            win = 4096
        if not framewise:
            rate = np.inf

        if save_dir:
            class_map = {0: 'bass', 1: 'drums', 2: 'other', 3: 'vocals'}
            mus = musdb.DB(root_dir="data/musdb18")

        # list of batches
        preds, ys, cs, ts, _, nm = self.predict(loader)

        # for each batch
        for b_preds, b_ys, b_cs, b_ts, b_nm in tqdm(list(zip(preds, ys, cs, ts, nm))):
            # for each sample
            for pred, y, c, t, n in zip(b_preds, b_ys, b_cs, b_ts, b_nm):
                pred_recons = []
                y_recons = []
                pred_cs = []
                pred_recons_dict = defaultdict(list)
                y_recons_dict = defaultdict(list)
                # for each class
                for i, (c_pred, c_y, c_c) in enumerate(zip(pred, y, c)):
                    # if the class exists in the source signal
                    if c_c == 1 and np.abs(c_y).sum() > 0:
                        c_pred = c_pred[..., :t]
                        c_y = c_y[..., :t]
                        # predictions can be over multiple channels
                        pred_recon = []
                        y_recon = []
                        for c_pred_chan, c_y_chan in zip(c_pred, c_y):
                            pred_recon += [istft(
                                c_pred_chan, hop_length=hop, win_length=win)]
                            y_recon += [istft(
                                c_y_chan, hop_length=hop, win_length=win)]
                        pred_recon = np.stack(pred_recon, axis=-1)
                        y_recon = np.stack(y_recon, axis=-1)
                        # accumulate list of reconstructions for stacking
                        pred_recons += [pred_recon]
                        y_recons += [y_recon]
                        pred_cs += [i]
                        if save_dir:
                            pred_recons_dict[class_map[i]] = pred_recon
                            y_recons_dict[class_map[i]] = y_recon
                # possible to sample from targets that are all zeros
                if pred_recons:
                    pred_recons = np.stack(pred_recons)
                    # possible to predict all zeros...
                    # TODO: Figure out how to handle this case properly
                    if np.abs(pred_recons.sum()) > 0:
                        y_recons = np.stack(y_recons)
                        # nclassex x time
                        if self.eval_version == 'v3':
                            sdr, sir, sar, _ = bss_eval_sources(
                                y_recons, pred_recons,
                                compute_permutation=False)
                        elif self.eval_version == 'v4':
                            if save_dir:
                                name = loader.dataset.metadata.at[
                                    int(n.cpu().numpy()), 'urlId']
                                track = mus.load_mus_tracks(
                                    tracknames=[name])[0]
                                sdr, isr, sir, sar = evaluate(
                                    y_recons, pred_recons, win=rate, hop=rate,
                                    padding=True)
                                data = self._to_evalstore(
                                    sdr, sir, isr, sar, rate, rate, class_map)
                                self._save_framewise(data, save_dir, track)
                                continue
                            else:
                                sdr, isr, sir, sar = evaluate(
                                    y_recons, pred_recons, win=rate, hop=rate,
                                    padding=True)
                                cmb_sdr = np.concatenate([x for x in sdr])
                                sdr = np.nanmean(sdr, axis=1)
                                sir = np.nanmean(sir, axis=1)
                                sar = np.nanmean(sar, axis=1)
                        for m1, m2, m3, cl in zip(sdr, sir, sar, pred_cs):
                            class_sdr[cl] += [m1]
                            class_sir[cl] += [m2]
                            class_sar[cl] += [m3]

        class_sdr_out = defaultdict(list)
        class_sir_out = defaultdict(list)
        class_sar_out = defaultdict(list)

        class_sdr_out['median'] = {k: np.round(np.median(v), 2)
                                   for k, v in class_sdr.items()}
        class_sdr_out['mean'] = {k: np.round(np.mean(v), 2)
                                 for k, v in class_sdr.items()}
        class_sir_out['median'] = {k: np.round(np.median(v), 2)
                                   for k, v in class_sir.items()}
        class_sir_out['mean'] = {k: np.round(np.mean(v), 2)
                                 for k, v in class_sir.items()}
        class_sar_out['median'] = {k: np.round(np.median(v), 2)
                                   for k, v in class_sar.items()}
        class_sar_out['mean'] = {k: np.round(np.mean(v), 2)
                                 for k, v in class_sar.items()}

        # NOTE: cmb_sdr holds the framewise SDR values from the last sample that
        # went through the v4 (non-save) branch; it is undefined otherwise.
        return class_sdr_out, class_sir_out, class_sar_out, cmb_sdr
Example #7
def evaluate_test_dataset(test_dataset, models_to_evaluate, nmf):
    models = read_models(os.environ['RESULTS_SHEET_ID'])
    parts_dir = f'{dataset_base}/{test_dataset}/audio_mono'
    evaluation_dir = 'nmf_evaluation' if nmf else 'test'
    dataset_dir = f'{evaluation_dir}/{test_dataset}'
    if nmf:
        dataset_sr = 22050
    else:
        dataset_sr = dataset_sample_rates[test_dataset]
    frame_size_seconds = 1
    frame_size_samples = frame_size_seconds * dataset_sr
    print(
        f'Evaluating on dataset {test_dataset}, evaluation frame size: {frame_size_samples} samples.'
    )

    chorale_reference_sources = {}
    for chorale in test_chorales:
        chorale_reference_sources[chorale] = read_sources(
            f'{parts_dir}/chorale_{chorale}_%s.wav', all_source_names,
            dataset_sr)

    Metrics = NamedTuple('Metrics', [
        ('sdr', np.ndarray),
        ('isr', np.ndarray),
        ('sir', np.ndarray),
        ('sar', np.ndarray),
    ])
    for model in models_to_evaluate:
        full_model_name, model_name, checkpoint, extracted_sources, multi_source = get_model_params(
            model, models, nmf, dataset_dir)
        print(f'Model {model}:')
        model_test_dir = f'{dataset_dir}/{full_model_name}'
        chorale_metrics: Dict[str, Metrics] = {}
        print(f'Evaluating sources: {", ".join(extracted_sources)}')
        model_source_indices = [
            all_source_names.index(s) for s in extracted_sources
        ]
        for chorale in test_chorales:
            print(f'\tChorale {chorale}')
            reference_sources = chorale_reference_sources[chorale]
            estimates_template = f'{model_test_dir}/{chorale}/{get_estimates_template(nmf, multi_source, chorale)}'
            # Initialize estimates to random noise because `museval.evaluate` raises an
            # exception if any estimate is all-zeros. However, we do want to supply all
            # _reference_ sources in order to correctly calculate SIR (interference from
            # other sources), and the shapes of `estimated_sources` and
            # `reference_sources` must be identical.
            estimated_sources = np.random.uniform(-1, 1,
                                                  reference_sources.shape)
            estimated_sources[model_source_indices] = read_sources(
                estimates_template, extracted_sources, dataset_sr)
            # Return shape of each metric: (nsrc, nwin)
            sdr, isr, sir, sar = museval.evaluate(reference_sources,
                                                  estimated_sources,
                                                  padding=False,
                                                  win=frame_size_samples,
                                                  hop=frame_size_samples)
            chorale_metrics[chorale] = Metrics(sdr, isr, sir, sar)

        chorale_source_metrics_dfs = []
        for chorale, metrics in chorale_metrics.items():
            for source, source_index in zip(extracted_sources,
                                            model_source_indices):
                columns = {
                    'model': model_name,
                    'checkpoint': checkpoint,
                    'chorale': chorale,
                    'source': source
                }
                for metric, values in metrics._asdict().items():
                    columns[metric] = values[source_index]

                df = pandas.DataFrame(columns)
                df.insert(3, 'frame', df.index)
                chorale_source_metrics_dfs.append(df)

        model_metrics = pandas.concat(chorale_source_metrics_dfs,
                                      ignore_index=True)
        output_path = f'{model_test_dir}/evaluation.csv'
        model_metrics.to_csv(output_path)
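The random initialisation above works around museval's silence check: to the best of my knowledge, museval.evaluate raises a ValueError when any estimated (or reference) source is entirely zero. A small sketch of that behaviour:

import numpy as np
import museval

refs = np.random.randn(2, 44100, 1)
ests = refs.copy()
ests[1] = 0.0  # second estimate left entirely silent

try:
    museval.evaluate(refs, ests, win=44100, hop=44100)
except ValueError as err:
    print('museval rejected the all-zero estimate:', err)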
Example #8
def evaluation(dnn_model,
               device,
               test_tracks,
               writer,
               full_evaluation=True,
               trained_on="vocals"):
    """
    This function performs the evaluation on the provided tracks and saves the logs the tensorboard summarywriter events.

    Parameters
    ----------
    dnn_model : Generalised_Recurrent_Model
        Model to use for prediction
    test_tracks : list[Track]
        list of tracks to be evaluated.
    device : torch.device
        device to use
    writer : SummaryWriter
        summary writer for writing TensorBoard summaries
    full_evaluation : bool
        True if full evaluation is to be performed, False if only one track needs to be evaluated
    trained_on : str
        Labels of the trained model "vocals" or "accompaniment"
    """
    # setting the evaluation mode
    dnn_model.eval()

    # gradients are not needed for evaluation, so turn them off
    with torch.no_grad():
        sdr_means = []
        sir_means = []
        isr_means = []
        sar_means = []

        # iterate over the test tracks
        for track_number, track in enumerate(test_tracks):
            # getting predicted estimates of accompaniment and vocals
            acc_estimate, vocals_estimate = predict(dnn_model,
                                                    device,
                                                    data=track.mixture.data,
                                                    sr=track.mixture.sr,
                                                    trained_on=trained_on)

            # adding it to list for evaluating metrics
            estimates_list = np.array([vocals_estimate, acc_estimate])
            reference_list = np.array([
                np.copy(track.sources["vocals"].data),
                np.copy(track.sources["accompaniment"].data)
            ])

            # evaluating the metrics
            SDR, ISR, SIR, SAR = museval.evaluate(reference_list,
                                                  estimates_list)

            # getting mean of the metrics
            SDR_mean = np.mean(SDR, axis=1)
            SIR_mean = np.mean(SIR, axis=1)
            ISR_mean = np.mean(ISR, axis=1)
            SAR_mean = np.mean(SAR, axis=1)
            # print(track_number, ": ", SDR_mean.shape, ", ", SDR_mean.shape)

            # logging METRICS for vocals
            writer.add_scalar('vocals/SDR_mean', SDR_mean[0], track_number)
            writer.add_scalar('vocals/SIR_mean', SIR_mean[0], track_number)
            writer.add_scalar('vocals/SAR_mean', SAR_mean[0], track_number)
            writer.add_scalar('vocals/ISR_mean', ISR_mean[0], track_number)

            # logging METRICS for accompaniment
            writer.add_scalar('accompaniment/SDR_mean', SDR_mean[1],
                              track_number)
            writer.add_scalar('accompaniment/SIR_mean', SIR_mean[1],
                              track_number)
            writer.add_scalar('accompaniment/SAR_mean', SAR_mean[1],
                              track_number)
            writer.add_scalar('accompaniment/ISR_mean', ISR_mean[1],
                              track_number)

            # appending it to the means of the all tracks
            sdr_means.append(SDR_mean)
            sir_means.append(SIR_mean)
            isr_means.append(ISR_mean)
            sar_means.append(SAR_mean)
            # saving the first sample
            if track_number == 0:
                mono_vocals_estimate_normalized = lib.util.normalize(
                    sp.to_mono(vocals_estimate))
                writer.add_audio(tag="vocals",
                                 snd_tensor=torch.from_numpy(
                                     mono_vocals_estimate_normalized),
                                 global_step=1,
                                 sample_rate=track.mixture.sr)
                mono_acc_estimate_normalized = lib.util.normalize(
                    sp.to_mono(acc_estimate))
                writer.add_audio(
                    tag="accompaniment",
                    snd_tensor=torch.from_numpy(mono_acc_estimate_normalized),
                    global_step=1,
                    sample_rate=track.mixture.sr)
                print("FIRST TRACK EVALUATION COMPLETE")
            if not full_evaluation:
                print("STOPPING AFTER THE FIRST TRACK (full_evaluation=False)")
                break
            # end of loop over test tracks

        # calculating mean over all tracks and saving it
        sdr_total_mean = np.mean(np.array(sdr_means), axis=0)
        sir_total_mean = np.mean(np.array(sir_means), axis=0)
        isr_total_mean = np.mean(np.array(isr_means), axis=0)
        sar_total_mean = np.mean(np.array(sar_means), axis=0)
        # min
        sdr_total_min = np.min(np.array(sdr_means), axis=0)
        sir_total_min = np.min(np.array(sir_means), axis=0)
        isr_total_min = np.min(np.array(isr_means), axis=0)
        sar_total_min = np.min(np.array(sar_means), axis=0)
        # max
        sdr_total_max = np.max(np.array(sdr_means), axis=0)
        sir_total_max = np.max(np.array(sir_means), axis=0)
        isr_total_max = np.max(np.array(isr_means), axis=0)
        sar_total_max = np.max(np.array(sar_means), axis=0)
        writer.add_text(
            'Accompaniment',
            f"SDR: {sdr_total_min[1]} +- {sdr_total_max[1]}, mean: {sdr_total_mean[1]}"
            f"  \nSIR: {sir_total_min[1]} +- {sir_total_max[1]}, mean: {sir_total_mean[1]}"
            f"  \nISR: {isr_total_min[1]} +- {isr_total_max[1]}, mean: {isr_total_mean[1]}"
            f"  \nSAR: {sar_total_min[1]} +- {sar_total_max[1]}, mean: {sar_total_mean[1]}",
            0)
        writer.add_text(
            'Vocals',
            f"SDR: {sdr_total_min[0]} +- {sdr_total_max[0]}, mean: {sdr_total_mean[0]}"
            f"  \nSIR: {sir_total_min[0]} +- {sir_total_max[0]}, mean: {sir_total_mean[0]}"
            f"  \nISR: {isr_total_min[0]} +- {isr_total_max[0]}, mean: {isr_total_mean[0]}"
            f"  \nSAR: {sar_total_min[0]} +- {sar_total_max[0]}, mean: {sar_total_mean[0]}",
            0)
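A hypothetical call, assuming a trained model and musdb-style Track objects are already available (the names below are placeholders from the surrounding project):

import torch
from torch.utils.tensorboard import SummaryWriter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter(log_dir="runs/eval_vocals")  # hypothetical log directory

# dnn_model and test_tracks come from the surrounding project.
evaluation(dnn_model, device, test_tracks, writer,
           full_evaluation=True, trained_on="vocals")
writer.close()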
Example #9
def evaluate_mia(ref, est, track_name, source_names, eval_silence, conf):
    references = ref.copy()
    estimates = est.copy()

    # When evaluating silence, compute PES/EPS and skip BSSEval for tracks
    # that contain an all-zero source
    skip = False
    silence_frames = pd.DataFrame({
        'target': [],
        'PES': [],
        'EPS': [],
        'track': []
    })
    if eval_silence:
        PES, EPS, _, __ = eval_silent_frames(
            true_source=references,
            predicted_source=estimates,
            window_size=int(conf['win'] * conf['sample_rate']),
            hop_size=int(conf['hop'] * conf['sample_rate']))

        for i, target in enumerate(source_names):
            reference_energy = np.sum(references[i, :, :]**2)
            # estimate_energy = np.sum(estimates[i, :, :]**2)
            if reference_energy == 0:  # or estimate_energy == 0:
                skip = True
                sdr = isr = sir = sar = (np.ones((1,)) * -np.inf,
                                         np.ones((1,)) * -np.inf)
                print("skip {}, {} source is all zero".format(
                    track_name, target))

        print("mean over evaluation frames, mean over channels")
        for i, target in enumerate(source_names):
            silence_frames = silence_frames.append(
                {
                    'target': target,
                    'PES': PES[i],
                    'EPS': EPS[i],
                    'track': track_name
                },
                ignore_index=True)
            print(
                target + ' ==>',
                silence_frames.loc[silence_frames['target'] == target].mean(
                    axis=0, skipna=True))

    # Compute metrics for the track using the configured window and hop size
    if not skip:
        sdr, isr, sir, sar = museval.evaluate(
            references,
            estimates,
            win=int(conf['win'] * conf['sample_rate']),
            hop=int(conf['hop'] * conf['sample_rate']))

    # Save results over the track
    track_store = museval.TrackStore(win=conf['win'],
                                     hop=conf['hop'],
                                     track_name=track_name)
    for index, target in enumerate(source_names):
        values = {
            "SDR": sdr[index].tolist(),
            "SIR": sir[index].tolist(),
            "ISR": isr[index].tolist(),
            "SAR": sar[index].tolist()
        }
        track_store.add_target(target_name=target, values=values)
    track_store.validate()

    return track_store, silence_frames
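The conf mapping is only indexed with 'win', 'hop' and 'sample_rate' here. A hypothetical configuration and call with synthetic stereo arrays shaped (nsrc, nsamples, nchan), assuming the module-level imports (museval, numpy, pandas) used above are in place:

import numpy as np

conf = {'win': 1.0, 'hop': 1.0, 'sample_rate': 44100}  # 1-second frames (assumed)
source_names = ['vocals', 'accompaniment']

ref = np.random.randn(2, 4 * 44100, 2)           # synthetic stereo references
est = ref + 0.1 * np.random.randn(*ref.shape)    # synthetic estimates

track_store, silence_frames = evaluate_mia(
    ref, est, track_name='demo_track', source_names=source_names,
    eval_silence=False, conf=conf)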
Example #10
def evaluate(references,
             estimates,
             output_dir,
             track_name,
             sample_rate,
             win=1.0,
             hop=1.0,
             mode='v4'):
    """
    Compute the BSS_eval metrics as well as PES and EPS. It is following the design concept of museval.eval_mus_track
    :param references: dict of reference sources {target_name: signal}, signal has shape: (nb_timesteps, np_channels)
    :param estimates: dict of user estimates {target_name: signal}, signal has shape: (nb_timesteps, np_channels)
    :param output_dir: path to output directory used to save evaluation results
    :param track_name: name that is assigned to TrackStore object for evaluated track
    :param win: evaluation window length in seconds, default 1
    :param hop: evaluation window hop length in second, default 1
    :param sample_rate: sample rate of test tracks (should be same as rate the model has been trained on)
    :param mode: BSSEval version, default to `v4`
    :return:
        bss_eval_data: museval.TrackStore object containing bss_eval evaluation scores
        silent_frames_data: Pandas data frame containing EPS and PES scores
    """

    eval_targets = list(estimates.keys())

    estimates_list = []
    references_list = []
    for target in eval_targets:
        estimates_list.append(estimates[target])
        references_list.append(references[target])

    # evaluate BSS_eval and PES/EPS metrics
    # save in TrackStore object
    bss_eval_data = museval.TrackStore(win=win, hop=hop, track_name=track_name)

    # skip examples with a silent source because BSSeval metrics are not defined in this case
    skip = False
    for target in eval_targets:
        reference_energy = np.sum(references[target]**2)
        estimate_energy = np.sum(estimates[target]**2)
        if reference_energy == 0 or estimate_energy == 0:
            skip = True
            SDR = ISR = SIR = SAR = (np.ones((1,)) * -np.inf,
                                     np.ones((1,)) * -np.inf)
            print("skip {}, {} source is all zero".format(track_name, target))

    if not skip:

        SDR, ISR, SIR, SAR = museval.evaluate(references_list,
                                              estimates_list,
                                              win=int(win * sample_rate),
                                              hop=int(hop * sample_rate),
                                              mode=mode,
                                              padding=True)

    # add evaluation of EPS and PES
    PES, EPS, _, __ = silent_frames_evaluation.eval_silent_frames(
        true_source=np.array(references_list),
        predicted_source=np.array(estimates_list),
        window_size=int(win * sample_rate),
        hop_size=int(hop * sample_rate))

    # iterate over all targets
    for i, target in enumerate(eval_targets):
        values = {
            "SDR": SDR[i].tolist(),
            "SIR": SIR[i].tolist(),
            "ISR": ISR[i].tolist(),
            "SAR": SAR[i].tolist(),
        }

        bss_eval_data.add_target(target_name=target, values=values)

    silent_frames_data = pd.DataFrame({
        'target': [],
        'PES': [],
        'EPS': [],
        'track': []
    })
    for i, target in enumerate(eval_targets):
        silent_frames_data = silent_frames_data.append(
            {
                'target': target,
                'PES': PES[i],
                'EPS': EPS[i],
                'track': track_name
            },
            ignore_index=True)

    # save evaluation results if output directory is defined
    if output_dir:
        # validate against the schema
        bss_eval_data.validate()

        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(
                    os.path.join(output_dir, track_name.replace('/', '_')) +
                    '.json', 'w+') as f:
                f.write(bss_eval_data.json)
        except IOError:
            pass

    return bss_eval_data, silent_frames_data
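A hypothetical call with two synthetic mono targets. This assumes the project's silent_frames_evaluation module (used above for PES/EPS) and the module-level imports (museval, numpy, pandas) are available; output_dir=None skips writing the JSON results:

import numpy as np

sr = 44100
references = {
    'vocals': np.random.randn(4 * sr, 1),
    'accompaniment': np.random.randn(4 * sr, 1),
}
estimates = {name: sig + 0.1 * np.random.randn(*sig.shape)
             for name, sig in references.items()}

bss_eval_data, silent_frames_data = evaluate(
    references, estimates, output_dir=None, track_name='demo_track',
    sample_rate=sr)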