def performance_metrics(df_labels, df_embeddings_verification, track_embedding,
                        cfg, frame_list, iteration):
    speaker_list = df_labels.columns.tolist()
    df_precision = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_roc = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_recall = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_far = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_frr = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    der = []
    metric = DiarizationErrorRate(skip_overlap=True, collar=cfg.audio.collar)
    groundtruth = load_rttm(cfg.audio.rttm_path)[cfg.audio.uri[iteration]]
    for threshold in cfg.audio.threshold:
        df_output = multi_speaker_verification(
            track_embedding=track_embedding,
            df_labels=df_labels,
            df_embeddings_verification=df_embeddings_verification,
            threshold=threshold)
        for speaker in speaker_list:
            try:
                df_precision.loc[threshold, speaker] = precision_score(
                    df_labels[speaker], df_output[speaker], average='binary')
            except Exception:
                df_precision.loc[threshold, speaker] = 0
            try:
                df_recall.loc[threshold, speaker] = recall_score(
                    df_labels[speaker], df_output[speaker], average='binary')
            except Exception:
                df_recall.loc[threshold, speaker] = 0
            try:
                df_roc.loc[threshold, speaker] = roc_auc_score(
                    df_labels[speaker], df_output[speaker], average=None)
            except Exception:
                df_roc.loc[threshold, speaker] = 0
            try:
                far, frr = FAR_FRR(y_true=df_labels[speaker],
                                   y_pred=df_output[speaker])
                df_far.loc[threshold, speaker] = far
                df_frr.loc[threshold, speaker] = frr
            except Exception:
                df_far.loc[threshold, speaker] = 0
                df_frr.loc[threshold, speaker] = 0
        # Evaluate this threshold's hypothesis; metric[:] then returns the
        # components accumulated by the metric so far.
        metric(groundtruth,
               merge_frames(df_outputs=df_output,
                            frame_list=frame_list,
                            filename=cfg.audio.uri[iteration] + '_' + str(threshold)),
               detailed=True)
        components = metric[:]
        der.append(components)
    return df_precision, df_recall, df_roc, df_far, df_frr, der
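# FAR_FRR is not defined in the snippet above. A minimal sketch under the
# usual definitions (false acceptance rate over true negatives, false
# rejection rate over true positives); the name and signature come from the
# call site, the body is an assumption.
import numpy as np

def FAR_FRR(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    negatives = y_true == 0
    positives = y_true == 1
    # FAR: fraction of non-target frames that were accepted.
    far = float((y_pred[negatives] == 1).mean()) if negatives.any() else 0.0
    # FRR: fraction of target frames that were rejected.
    frr = float((y_pred[positives] == 0).mean()) if positives.any() else 0.0
    return far, frr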
def DER(df_labels, df_outputs, frame_list, cfg, collar):
    speaker_list = df_labels.columns.tolist()
    rttm_segment = load_rttm(cfg.audio.rttm_path)[cfg.audio.uri[0]]
    E_MISS = 0
    E_FA = 0
    E_Spk = 0
    reference_length = 0
    for i, frame in enumerate(frame_list):
        frame_start, frame_end = float(frame[0]), float(frame[1])
        segments = []
        for segment in rttm_segment.get_timeline():
            if list(rttm_segment.get_labels(segment))[0] in speaker_list:
                intersection = max(
                    0, min(frame_end, segment.end) - max(frame_start, segment.start))
                if intersection > collar:
                    segments.append(segment)
                reference_length = reference_length + intersection
        if len(segments) == 0:
            # No reference speech in this frame: any active output is a false alarm.
            if 1 in df_outputs.iloc[i, :].to_numpy():
                E_FA = E_FA + (frame_end - frame_start)
        if len(segments) > 0:
            if 1 not in df_outputs.iloc[i, :].to_numpy():
                # Reference speech but no active output: missed speech.
                E_MISS = E_MISS + (frame_end - frame_start)
            else:
                active_speakers = []
                for interval in segments:
                    intersection = max(
                        0, min(frame_end, interval.end) - max(frame_start, interval.start))
                    active_speakers.append(
                        list(rttm_segment.get_labels(interval))[0])
                # Speaker confusion: a reference speaker the system missed ...
                for active_spk in active_speakers:
                    if df_outputs.loc[i, active_spk] == 0:
                        E_Spk = E_Spk + (
                            segments[active_speakers.index(active_spk)].end -
                            segments[active_speakers.index(active_spk)].start)
                # ... or a speaker activated outside their reference speech.
                inactive_speakers = list(set(speaker_list) - set(active_speakers))
                for spk in inactive_speakers:
                    if df_outputs.loc[i, spk] == 1:
                        E_Spk = E_Spk + (frame_end - frame_start)
    print('reference length:', reference_length)
    print('missed speech:', E_MISS)
    print('false alarm:', E_FA)
    print('speaker confusion:', E_Spk)
    return (E_MISS + E_Spk + E_FA) / reference_length
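# Usage sketch for the frame-level DER above; every value here is made up
# for illustration. df_outputs has one binary column per speaker and one
# row per frame, frame_list holds (start, end) pairs, and cfg follows the
# same layout as in the other snippets.
frame_list = [(0.0, 0.5), (0.5, 1.0), (1.0, 1.5)]
error_rate = DER(df_labels=df_labels, df_outputs=df_outputs,
                 frame_list=frame_list, cfg=cfg, collar=0.25)
print('frame-level DER: {:.3f}'.format(error_rate))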
def gecko(args):
    hypotheses_path = args['<hypotheses_path>']
    uri = args['<uri>']
    colors = get_colors(uri)
    distances = {}
    if Path(hypotheses_path).exists():
        hypotheses = load_rttm(hypotheses_path)
        hypothesis = hypotheses[uri]
    else:
        # protocol
        protocol = get_protocol(args['<hypotheses_path>'])
        reference = get_file(protocol, uri)
        hypothesis = reference['annotation']
        annotated = get_annotated(reference)
    hypotheses_path = Path(hypotheses_path)
    protocol = args['--database.task.protocol']
    features = None
    if protocol:
        protocol = get_protocol(protocol)
        embeddings = args['--embeddings']
        reference, features = get_file(protocol, uri, embeddings=embeddings)
        if args['--map']:
            print(f"mapping {uri} with {protocol}")
            diarizationErrorRate = DiarizationErrorRate()
            annotated = get_annotated(reference)
            optimal_mapping = diarizationErrorRate.optimal_mapping(
                reference['annotation'], hypothesis, annotated)
            hypothesis = hypothesis.rename_labels(mapping=optimal_mapping)
    hypothesis = update_labels(hypothesis, distances)
    # tag unsure clusters
    distances_per_speaker = get_distances_per_speaker(
        features, hypothesis) if features else {}
    if args['--tag_na']:
        whole_file = Segment(0., annotated.segments_boundaries_[-1])
        not_annotated = annotated.gaps(whole_file).to_annotation(na())
        hypothesis = hypothesis.crop(annotated).update(not_annotated)
    gecko_json = annotation_to_GeckoJSON(hypothesis, distances_per_speaker, colors)
    dir_path = hypotheses_path.parent if hypotheses_path.exists() else Path(".")
    json_path = os.path.join(dir_path, f'{uri}.json')
    with open(json_path, 'w') as file:
        json.dump(gecko_json, file)
    print(f"successfully dumped {json_path}")
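# Usage sketch for gecko(); the argument dict mimics the docopt interface
# used above, and every value here is hypothetical.
gecko({
    '<hypotheses_path>': 'hypotheses/ES2011a.Mix-Headset.rttm',
    '<uri>': 'ES2011a.Mix-Headset',
    '--database.task.protocol': None,
    '--embeddings': None,
    '--map': False,
    '--tag_na': False,
})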
def speakers(args):
    hypotheses_path = args['<hypotheses_path>']
    uri = args['<uri>']
    if Path(hypotheses_path).exists():
        hypotheses = load_rttm(hypotheses_path)
        hypothesis = hypotheses[uri]
    else:
        # protocol
        distances = {}
        protocol = get_protocol(args['<hypotheses_path>'])
        reference = get_file(protocol, uri)
        hypothesis = reference['annotation']
        annotated = get_annotated(reference)
    print(uri)
    print(f"Number of speakers: {len(hypothesis.labels())}")
    print(f"Chart:\n{hypothesis.chart()}")
def from_rttm(cls, path: Union[str, Path]) -> 'Continuum':
    """
    Load annotations from an RTTM file. Each file's URI field is used as
    an annotation's annotator.

    Parameters
    ----------
    path: Path or str
        Path to the RTTM file storing annotations

    Returns
    -------
    continuum : Continuum
        New continuum object loaded from the RTTM file
    """
    annotations = load_rttm(str(path))
    continuum = cls()
    for uri, annot in annotations.items():
        continuum.add_annotation(uri, annot)
    return continuum
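# Usage sketch (the path is hypothetical): each uri in the RTTM file
# becomes one annotator of the resulting continuum.
continuum = Continuum.from_rttm('annotations/meeting.rttm')
print(continuum.num_annotators)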
def get_friends_per_speaker(rttm_files, BABYTRAIN):
    """
    Given a list of .rttm files, return a dictionary whose:
    - key is a speaker.
    - value is the set of speakers that appear in the same files as the key.
    If BABYTRAIN is True, skip speakers whose name does not start with '!'.
    """
    friends_per_speaker = {}
    for rttm in rttm_files:
        basename = os.path.splitext(os.path.basename(rttm))[0]
        data = pyda.load_rttm(rttm)
        if data != {}:
            annotation = data[basename]
            # Get the list of speakers participating in this file
            speakers = annotation.labels()
            if BABYTRAIN:
                speakers = [s for s in speakers if s.startswith("!")]
            # For each speaker, add his/her friends.
            for speaker in speakers:
                if speaker not in friends_per_speaker.keys():
                    friends_per_speaker[speaker] = set(speakers)
                else:
                    friends_per_speaker[speaker] |= set(speakers)
    # Replace friends of investigators by the empty set
    if BABYTRAIN:
        for k, v in friends_per_speaker.items():
            if k.startswith("!INV"):
                friends_per_speaker[k] = set()
    return friends_per_speaker
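# Usage sketch with hypothetical file names: map every speaker to the set
# of speakers they share at least one file with.
rttm_files = ['data/file1.rttm', 'data/file2.rttm']
friends = get_friends_per_speaker(rttm_files, BABYTRAIN=False)
for speaker, friends_set in sorted(friends.items()):
    print(speaker, sorted(friends_set))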
def get_der(cfg, rttm, output_annotations):
    metric = DiarizationErrorRate(skip_overlap=True, collar=cfg.audio.collar)
    # Derive the uri from the RTTM file name; stripping the extension from
    # the basename is robust to dots in parent directories, unlike slicing
    # the full path up to the first dot.
    basename = rttm.rsplit('/', 1)[-1]
    groundtruth = load_rttm(rttm)[basename[:basename.rfind('.')]]
    der = metric(groundtruth, output_annotations, detailed=False)
    return der
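# Usage sketch; the path is hypothetical and output_annotations is the
# pyannote.core.Annotation produced by the system under test.
der = get_der(cfg, '/data/rttm/ES2011a.Mix-Headset.rttm', output_annotations)
print('DER: {:.3f}'.format(der))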
def main():
    arguments = docopt(__doc__, version='Evaluation')

    collar = float(arguments['--collar'])
    skip_overlap = arguments['--skip-overlap']
    tolerance = float(arguments['--tolerance'])

    # protocol
    protocol_name = arguments['<database.task.protocol>']

    preprocessors = dict()
    if arguments['overlap']:
        if skip_overlap:
            msg = ('Option --skip-overlap is not supported '
                   'when evaluating overlapped speech detection.')
            sys.exit(msg)
        preprocessors = {'annotation': to_overlap}

    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=preprocessors)

    # subset (train, development, or test)
    subset = arguments['--subset']

    if arguments['spotting']:
        hypothesis_json = arguments['<hypothesis.json>']
        with open(hypothesis_json, mode='r') as fp:
            hypotheses = json.load(fp)
        output_prefix = hypothesis_json[:-5]

        latencies = [float(l) for l in arguments['--latency']]

        filters = arguments['--filter']
        if filters:
            from sympy import sympify, lambdify, symbols
            speech = symbols('speech')
            filter_funcs = [
                lambdify([speech], sympify(expression))
                for expression in filters
            ]
            filter_func = lambda speech: \
                any(~func(speech) for func in filter_funcs)
        else:
            filter_func = None

        spotting(protocol, subset, latencies, hypotheses, output_prefix,
                 filter_func=filter_func)
        sys.exit(0)

    hypothesis_rttm = arguments['<hypothesis.rttm>']
    try:
        hypotheses = load_rttm(hypothesis_rttm)
    except FileNotFoundError:
        msg = f'Could not find file {hypothesis_rttm}.'
        sys.exit(msg)
    except Exception:
        msg = (f'Failed to load {hypothesis_rttm}, please check its format '
               f'(only RTTM files are supported).')
        sys.exit(msg)

    if arguments['detection']:
        detection(protocol, subset, hypotheses,
                  collar=collar, skip_overlap=skip_overlap)

    if arguments['overlap']:
        detection(protocol, subset, hypotheses,
                  collar=collar, skip_overlap=skip_overlap)

    if arguments['segmentation']:
        segmentation(protocol, subset, hypotheses, tolerance=tolerance)

    if arguments['diarization']:
        greedy = arguments['--greedy']
        diarization(protocol, subset, hypotheses, greedy=greedy,
                    collar=collar, skip_overlap=skip_overlap)

    if arguments['identification']:
        identification(protocol, subset, hypotheses,
                       collar=collar, skip_overlap=skip_overlap)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str,  # positional argument
                        help="Relative path to the database folder, containing "
                             "train, dev, and test sub-folders.")
    parser.add_argument("-d", "--duration", type=int, default=60,
                        help="Duration of a trial. (Defaults to 60 seconds)")
    parser.add_argument('--bbt', action='store_true',
                        help="Indicates whether the corpus is BabyTrain or not. "
                             "If true, skip all the speakers whose name doesn't "
                             "start with '!'.")
    args = parser.parse_args()

    # Parameters
    DURATION_TRIAL = args.duration
    DATABASE_PATH = os.path.join(os.getcwd(), args.path)  # needs to loop through dev and test
    BABYTRAIN = args.bbt

    # Header
    trials_txt = ("target_speaker\tfile_basename\tbeginning_time\tend_time\t"
                  "duration_total_speech\tduration_overlapping_speech\n")

    # Extract target and non-target trials
    rttm_files = utils.get_dev_test_rttm(DATABASE_PATH)

    # First, get the dictionary of all the friends of every speaker.
    friends_per_speaker = get_friends_per_speaker(rttm_files, BABYTRAIN)

    for rttm in rttm_files:
        basename = os.path.splitext(os.path.basename(rttm))[0]
        data = pyda.load_rttm(rttm)
        if data != {}:
            annotation = data[basename]
            participants = annotation.labels()
            if BABYTRAIN:
                participants = [p for p in participants if p.startswith("!")]
            all_friends = get_friends_of_participants(friends_per_speaker,
                                                      participants)
            last_offset = annotation.get_timeline()[-1][1]
            for end in range(DURATION_TRIAL, int(last_offset), DURATION_TRIAL):
                beg = end - DURATION_TRIAL
                chunk = annotation.crop(Segment(beg, end))
                targets = chunk.labels()
                if BABYTRAIN:
                    targets = [t for t in targets if t.startswith("!")]
                overlapping_chunk = utils.overlapping_annotation(chunk)

                # A speaker is a target speaker for a chunk c
                # when he/she is speaking in c.
                for target in targets:
                    tot_speech = chunk.label_duration(target)
                    overlapping_speech = overlapping_chunk.label_duration(target)
                    trials_txt += "%s\t%s\t%d\t%d\t%.3f\t%.3f\n" % (
                        target, basename, beg, end, tot_speech,
                        overlapping_speech)

                non_targets = list(set(all_friends) - set(targets))
                # A speaker is a non-target speaker for a chunk c when he/she
                # is not speaking in c, but speaks somewhere in a file where
                # one of the target speakers also participates.
                for non_target in non_targets:
                    trials_txt += "%s\t%s\t%d\t%d\t%.1f\t%.1f\n" % (
                        non_target, basename, beg, end, 0.0, 0.0)

    with open(os.path.join(DATABASE_PATH,
                           "trials_%d.txt" % DURATION_TRIAL), "w") as f:
        f.write(trials_txt[:-1])
    print("trials.txt generated in %s" % DATABASE_PATH)
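# utils.overlapping_annotation is not shown above. A plausible sketch: keep
# only the regions of an annotation where at least two speakers talk at
# once, so label_duration() then measures overlapping speech per speaker.
# This is an assumed implementation, not the original helper.
from pyannote.core import Annotation, Timeline

def overlapping_annotation(annotation: Annotation) -> Annotation:
    overlap = Timeline(uri=annotation.uri)
    tracks = list(annotation.itertracks())
    for i, (seg1, _) in enumerate(tracks):
        for seg2, _ in tracks[i + 1:]:
            if seg1.intersects(seg2):
                overlap.add(seg1 & seg2)  # pairwise intersection
    # Crop the annotation down to the overlapped regions only.
    return annotation.crop(overlap.support())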
        device=device, verbose=verbose, rate=encoder_rate)
    if save_test_emb:
        np.save(inf_path + '{}_embeds.npy'.format(case), embed)
        np.save(inf_path + '{}_embeds_times.npy'.format(case), info[0])
    timelst = Diarize(scotus_ral, embed, info[0],
                      sim_thresh=diar, score_thresh=score)
    diar_to_rttm(timelst, case, di_path)
    rttmto_RALrttm(case, scotus_ral, rttm_path, di_path)
    predict = case + '_rdsv.rttm'
    ral_label = case + '_ral.rttm'
    predictions = load_rttm(di_path + predict)[case]
    groundtruths = load_rttm(di_path + ral_label)[case]
    der.append(metric(groundtruths, predictions,
                      detailed=True)['diarization error rate'])
    size.append(sz)

bycase = list(zip([item.split('.')[0] for item in set_dict['t']], der, size))
desc = stats.describe(der)
settings = ['Param:', encoder_rate, '|', diar, '-', score]
with open(test_eval_path, 'w') as f:
    write = csv.writer(f)
    write.writerow(settings)
    write.writerows(bycase)
    write.writerow(desc)
from pyannote.database.util import load_rttm
from pyannote.core import Segment, notebook
from pyannote.audio.features import RawAudio
import torch
from pyannote.metrics.diarization import DiarizationErrorRate

Audio_File = {
    'uri': 'ES2011a.Mix-Headset',
    'audio': '/home/lucas/PycharmProjects/Data/pyannote/amicorpus/ES2011a/audio/ES2011a.Mix-Headset.wav'
}

groundtruth = load_rttm(
    '/home/lucas/PycharmProjects/Data/pyannote/AMI/MixHeadset.development.rttm'
)[Audio_File['uri']]

# Print the label of each reference segment.
for segment in groundtruth.get_timeline():
    print(list(groundtruth.get_labels(segment))[0])

# Pretrained AMI diarization pipeline from pyannote-audio's torch.hub entry point.
pipeline = torch.hub.load('pyannote/pyannote-audio', 'dia_ami')
diarization = pipeline(Audio_File)

metric = DiarizationErrorRate(collar=0.25, skip_overlap=True)
der = metric(groundtruth, diarization)
print(der)
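# Optional follow-up: the same metric call can also report its components
# ('false alarm', 'missed detection', 'confusion', ...) via detailed=True,
# as the other snippets in this collection do.
detail = metric(groundtruth, diarization, detailed=True)
print('false alarm:', detail['false alarm'])
print('missed detection:', detail['missed detection'])
print('confusion:', detail['confusion'])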
def performance_metrics(df_labels, df_embeddings_verification, track_embedding,
                        cfg, frame_list, iteration):
    speaker_list = df_labels.columns.tolist()
    df_precision = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_roc = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_recall = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_far = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    df_frr = pd.DataFrame(columns=speaker_list, index=cfg.audio.threshold)
    der = []
    metric = DiarizationErrorRate(skip_overlap=True, collar=cfg.audio.collar)
    groundtruth = load_rttm(cfg.audio.rttm_path)[cfg.audio.uri[iteration]]
    for threshold in cfg.audio.threshold:
        df_output = speaker_verification(
            track_embedding=track_embedding,
            df_labels=df_labels,
            df_embeddings_verification=df_embeddings_verification,
            threshold=threshold)
        for speaker in speaker_list:
            try:
                df_precision.loc[threshold, speaker] = precision_score(
                    df_labels[speaker], df_output[speaker], average='binary')
            except Exception:
                df_precision.loc[threshold, speaker] = 0
            try:
                df_recall.loc[threshold, speaker] = recall_score(
                    df_labels[speaker], df_output[speaker], average='binary')
            except Exception:
                df_recall.loc[threshold, speaker] = 0
            try:
                df_roc.loc[threshold, speaker] = roc_auc_score(
                    df_labels[speaker], df_output[speaker], average=None)
            except Exception:
                df_roc.loc[threshold, speaker] = 0
            try:
                far, frr = FAR_FRR(y_true=df_labels[speaker],
                                   y_pred=df_output[speaker])
                df_far.loc[threshold, speaker] = far
                df_frr.loc[threshold, speaker] = frr
            except Exception:
                df_far.loc[threshold, speaker] = 0
                df_frr.loc[threshold, speaker] = 0
        # Evaluate this threshold's hypothesis; metric[:] then returns the
        # components accumulated by the metric so far.
        metric(groundtruth,
               merge_frames(df_outputs=df_output,
                            frame_list=frame_list,
                            filename=cfg.audio.uri[iteration] + '_' + str(threshold)),
               detailed=True)
        components = metric[:]
        der.append(components)
    return df_precision, df_recall, df_roc, df_far, df_frr, der
def main():
    arguments = docopt(__doc__, version="Evaluation")

    collar = float(arguments["--collar"])
    skip_overlap = arguments["--skip-overlap"]
    tolerance = float(arguments["--tolerance"])

    # protocol
    protocol_name = arguments["<database.task.protocol>"]

    preprocessors = dict()
    if arguments["overlap"]:
        if skip_overlap:
            msg = ("Option --skip-overlap is not supported "
                   "when evaluating overlapped speech detection.")
            sys.exit(msg)
        preprocessors = {"annotation": to_overlap}

    protocol = get_protocol(protocol_name, preprocessors=preprocessors)

    # subset (train, development, or test)
    subset = arguments["--subset"]

    if arguments["spotting"]:
        hypothesis_json = arguments["<hypothesis.json>"]
        with open(hypothesis_json, mode="r") as fp:
            hypotheses = json.load(fp)
        output_prefix = hypothesis_json[:-5]

        latencies = [float(l) for l in arguments["--latency"]]

        filters = arguments["--filter"]
        if filters:
            from sympy import sympify, lambdify, symbols
            speech = symbols("speech")
            filter_funcs = [
                lambdify([speech], sympify(expression))
                for expression in filters
            ]
            filter_func = lambda speech: any(~func(speech) for func in filter_funcs)
        else:
            filter_func = None

        spotting(
            protocol, subset, latencies, hypotheses, output_prefix,
            filter_func=filter_func,
        )
        sys.exit(0)

    hypothesis_rttm = arguments["<hypothesis.rttm>"]
    try:
        hypotheses = load_rttm(hypothesis_rttm)
    except FileNotFoundError:
        msg = f"Could not find file {hypothesis_rttm}."
        sys.exit(msg)
    except Exception:
        msg = (f"Failed to load {hypothesis_rttm}, please check its format "
               f"(only RTTM files are supported).")
        sys.exit(msg)

    if arguments["detection"]:
        detection(protocol, subset, hypotheses,
                  collar=collar, skip_overlap=skip_overlap)

    if arguments["overlap"]:
        detection(protocol, subset, hypotheses,
                  collar=collar, skip_overlap=skip_overlap)

    if arguments["segmentation"]:
        segmentation(protocol, subset, hypotheses, tolerance=tolerance)

    if arguments["diarization"]:
        greedy = arguments["--greedy"]
        diarization(
            protocol, subset, hypotheses, greedy=greedy,
            collar=collar, skip_overlap=skip_overlap,
        )

    if arguments["identification"]:
        identification(protocol, subset, hypotheses,
                       collar=collar, skip_overlap=skip_overlap)
def __init__(self, sad_rttm: Path = None):
    self.sad_rttm = sad_rttm
    # Mapping from file uri to its speech activity detection annotation;
    # note that load_rttm requires an actual path, so sad_rttm must be given.
    self.sad_ = load_rttm(self.sad_rttm)
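# Usage sketch; the enclosing class is not shown above, so SadPipeline is a
# placeholder name and the path is hypothetical. load_rttm returns a
# {uri: Annotation} dict, so sad_ can be indexed by file uri.
pipeline = SadPipeline(sad_rttm=Path('sad/development.rttm'))
speech_regions = pipeline.sad_['ES2011a.Mix-Headset'].get_timeline()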
def main():
    argparser = argparse.ArgumentParser()
    argparser.add_argument('system', type=str,
                           help="Path to the system's output")
    argparser.add_argument('protocol', type=str,
                           help='The protocol on which you want to evaluate '
                                'your system')
    argparser.add_argument('subset', type=str,
                           help='The subset of the database on which you want '
                                'to evaluate your system.\n'
                                'Choose between [train, test, development].\n'
                                'Default is test.')
    argparser.add_argument('--vad', action='store_false',
                           help='(OPTIONAL) Enable if evaluating a VAD system; '
                                'this way only speech/non-speech metrics '
                                'will be reported.')
    args = argparser.parse_args()

    # Create timeline for both reference & system
    system = load_rttm(args.system)

    # Get the reference using a pyannote protocol
    protocol = get_protocol(args.protocol)
    items = list(getattr(protocol, args.subset)())
    reference = {item['uri']: item['annotation'] for item in items}

    results = dict()
    for uri in reference:
        # prefix r: reference
        # prefix s: system
        r_annot = reference[uri]
        # In case the uri was not evaluated, skip it and go to the next one
        try:
            s_annot = system[uri]
        except KeyError:
            continue

        r_labels = {lab: r_annot.label_timeline(lab) for lab in r_annot.labels()}
        s_labels = {lab: s_annot.label_timeline(lab) for lab in s_annot.labels()}

        if not args.vad:
            mapping = get_mapping(r_annot, s_annot)
        else:
            mapping = None

        # accumulate results, reference side
        dur = get_speech_duration(r_annot, uri)
        print(uri)
        print(dur)
        correct, miss_spk, miss_speech = accumulate_reference(
            r_labels, s_labels, mapping, dur)
        # Both "correct" should be the same
        _, FA_spk, FA_speech = accumulate_system(r_labels, s_labels, mapping, dur)
        results[uri] = (correct, FA_spk, FA_speech, miss_spk, miss_speech)

    # Evaluate each wav referenced in system:
    # If not vad: for each label (FEM, MAL, CHI, KCHI), measure the time in
    # correct / false alarm speaker, false alarm speech / missed speaker /
    # missed speech
    write_evaluation(results, args.vad)
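# get_mapping is not defined in this snippet. A plausible sketch using the
# optimal label mapping from pyannote.metrics; this is an assumption, not
# the original helper.
from pyannote.metrics.diarization import DiarizationErrorRate

def get_mapping(r_annot, s_annot):
    # Map each system label to the reference label it best matches.
    return DiarizationErrorRate().optimal_mapping(r_annot, s_annot)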