def _xxx_iter(self, subset):

    if not isinstance(subset, list):
        subsets = [subset]
    else:
        subsets = subset

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('verification')

    for subset in subsets:
        subset_data = data.get_group(subset)
        for uri, rows in subset_data.groupby('uri'):
            annotation = Annotation(uri=uri)
            for row in rows.itertuples():
                segment = Segment(row.start, row.end)
                annotation[segment] = row.speaker
            annotated = annotation.get_timeline()
            current_file = {
                'uri': uri,
                'database': 'VoxCeleb',
                'annotation': annotation,
                'annotated': annotated,
            }
            yield current_file
def predict(self, features, min_duration=None, constraint=None):
    """
    Parameters
    ----------
    min_duration : float or dict, optional
        Minimum duration for each label, in seconds.
    """
    constraint_ = self._constraint(constraint, features)
    consecutive = self._consecutive(min_duration, features)

    X = self.X(features, unknown='keep')
    sliding_window = features.sliding_window

    converted_y = self.classifier_.predict(
        X, consecutive=consecutive, constraint=constraint_)

    annotation = Annotation()

    diff = list(np.where(np.diff(converted_y))[0])
    diff = [-1] + diff + [len(converted_y)]

    for t, T in pairwise(diff):
        segment = sliding_window.rangeToSegment(t, T - t)
        annotation[segment] = converted_y[t + 1]

    translation = self.label_converter_.inverse_mapping()

    return annotation.translate(translation)
def _xxx_iter(self, subset):

    if not isinstance(subset, list):
        subsets = [subset]
    else:
        subsets = subset

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('verification')

    # segment                           uri                       start  end   speaker       verification  identification
    # A.J._Buckley/1zcIwhmdeo4_0000001  A.J._Buckley/1zcIwhmdeo4  14.7   22.8  A.J._Buckley  dev           trn

    for subset in subsets:
        subset_data = data.get_group(subset)
        for uri, datum in subset_data.iterrows():
            annotation = Annotation(uri=uri)
            segment = Segment(0., datum.end - datum.start)
            annotation[segment] = datum.speaker
            annotated = annotation.get_timeline()
            current_file = {
                'uri': uri,
                'database': 'VoxCeleb',
                'annotation': annotation,
                'annotated': annotated,
            }
            yield current_file
def vad_construct_pyannote_object_per_file(
    vad_table_filepath: str, groundtruth_RTTM_file: str
) -> Tuple[Annotation, Annotation]:
    """
    Construct a Pyannote object for evaluation.

    Args:
        vad_table_filepath (str): path of vad rttm-like table.
        groundtruth_RTTM_file (str): path of groundtruth rttm file.

    Returns:
        reference (pyannote.Annotation): groundtruth
        hypothesis (pyannote.Annotation): prediction
    """
    pred = pd.read_csv(vad_table_filepath, sep=" ", header=None)
    label = pd.read_csv(groundtruth_RTTM_file, sep=" ", delimiter=None, header=None)
    label = label.rename(columns={3: "start", 4: "dur", 7: "speaker"})

    # construct reference
    reference = Annotation()
    for index, row in label.iterrows():
        reference[Segment(row['start'], row['start'] + row['dur'])] = row['speaker']

    # construct hypothesis
    hypothesis = Annotation()
    for index, row in pred.iterrows():
        hypothesis[Segment(float(row[0]), float(row[0]) + float(row[1]))] = 'Speech'

    return reference, hypothesis
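# The (reference, hypothesis) pair returned above plugs directly into
# pyannote.metrics detection metrics. A minimal usage sketch; the two file
# paths are placeholders, not files from the original project.
from pyannote.metrics.detection import DetectionErrorRate

reference, hypothesis = vad_construct_pyannote_object_per_file(
    'vad_table.txt', 'groundtruth.rttm')
print(DetectionErrorRate()(reference, hypothesis))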
def vad_metrics(predictions, reference_segments, sr=22050,
                window_length=int(np.floor(0.032 * 22050)),
                hop_length=int(np.floor(0.016 * 22050))):
    frame_times = librosa.frames_to_time(range(len(predictions)), sr=sr,
                                         hop_length=hop_length,
                                         n_fft=window_length)
    predicted_segments = voice_segments(predictions, frame_times)

    hypothesis = Annotation()
    for seg in predicted_segments:
        hypothesis[Segment(seg[0], seg[1])] = 1

    reference = Annotation()
    for seg in reference_segments:
        reference[Segment(seg[0], seg[1])] = 1

    precision = DetectionPrecision()(reference, hypothesis)
    error = DetectionErrorRate()(reference, hypothesis)
    recall = DetectionRecall()(reference, hypothesis)
    accuracy = DetectionAccuracy()(reference, hypothesis)

    metrics = {
        "precision": precision,
        "error": error,
        "recall": recall,
        "accuracy": accuracy,
    }
    print(metrics)
    return metrics
def predict(self, features, min_duration=None, constraint=None):
    """
    Parameters
    ----------
    min_duration : float or dict, optional
        Minimum duration for each label, in seconds.
    """
    constraint_ = self._constraint(constraint, features)
    consecutive = self._consecutive(min_duration, features)

    X = self.X(features, unknown="keep")
    sliding_window = features.sliding_window

    converted_y = self.classifier_.predict(
        X, consecutive=consecutive, constraint=constraint_)

    annotation = Annotation()

    diff = list(np.where(np.diff(converted_y))[0])
    diff = [-1] + diff + [len(converted_y)]

    for t, T in pairwise(diff):
        segment = sliding_window.rangeToSegment(t, T - t)
        annotation[segment] = converted_y[t + 1]

    translation = self.label_converter_.inverse_mapping()

    return annotation.translate(translation)
def DER(outfile, AudioDataSet, annotationlist, audioLength):
    reference = Annotation()
    if not AudioDataSet == 'DiaExample':
        # one transcriber XML file per speaker: A, B, C, D
        for label, xml_file in zip('ABCD', annotationlist):
            root = ET.parse(xml_file).getroot()
            for child in root.findall('segment'):
                start = float(child.get('transcriber_start'))
                end = float(child.get('transcriber_end'))
                reference[Segment(start, end)] = label
    else:
        reference = Annotation()
        reference[Segment(0.15, 3.41)] = 'A'
        reference[Segment(3.83, 5.82)] = 'A'
        reference[Segment(6.75, 11.10)] = 'B'
        reference[Segment(11.32, 15.8)] = 'C'
        reference[Segment(15.9, 18.8)] = 'B'
        reference[Segment(18.8, 27.8)] = 'C'
        reference[Segment(27.8, 34.4)] = 'B'
        reference[Segment(34.4, 42)] = 'D'

    hypothesis = Annotation()
    with open(outfile, 'r') as f:
        for line in f:
            fields = line.split(' ')
            start = float(fields[3])
            end = start + float(fields[4])
            hypothesis[Segment(start, end)] = fields[5][0:-1]

    metric = DiarizationErrorRate()
    metricPurity = DiarizationPurity()
    uem = Timeline([Segment(0, audioLength)])

    print('DER: %.2f %%' % (metric(reference, hypothesis, uem=uem) * 100))
    print('Cluster Purity: %.2f %%' %
          (metricPurity(reference, hypothesis, uem=uem) * 100))

    return metric, reference, hypothesis
def load_speaker(self, uri):
    speaker = Annotation(uri=uri)
    path = self.get_audio_path(uri)
    with open(path, 'r') as fp:
        for line in fp:
            start, duration, name, _, _ = line.strip().split()
            start = float(start)
            end = start + float(duration)
            speaker[Segment(start, end)] = name
    return speaker.smooth()
def _turn_level(self, current_file: dict, speech_turns: Annotation) -> Annotation:
    """Apply clustering at speech turn level

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_turns : `Annotation`
        Speech turns. Should only contain `str` labels.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Clustering result.
    """
    assert_string_labels(speech_turns, "speech_turns")

    embedding = self._embedding(current_file)

    labels = speech_turns.labels()
    X, clustered_labels, skipped_labels = [], [], []
    for l, label in enumerate(labels):

        timeline = speech_turns.label_timeline(label, copy=False)

        # be more and more permissive until we have
        # at least one embedding for current speech turn
        for mode in ["strict", "center", "loose"]:
            x = embedding.crop(timeline, mode=mode)
            if len(x) > 0:
                break

        # skip labels so small we don't have any embedding for it
        if len(x) < 1:
            skipped_labels.append(label)
            continue

        clustered_labels.append(label)
        X.append(np.mean(x, axis=0))

    # apply clustering of label embeddings
    clusters = self.clustering(np.vstack(X))

    # map each clustered label to its cluster (between 1 and N_CLUSTERS)
    mapping = {label: k for label, k in zip(clustered_labels, clusters)}

    # map each skipped label to its own cluster
    # (between -1 and -N_SKIPPED_LABELS)
    for l, label in enumerate(skipped_labels):
        mapping[label] = -(l + 1)

    # do the actual mapping
    return speech_turns.rename_labels(mapping=mapping)
def test_combi_categorical_dissimilarity():
    continuum = Continuum()
    annotation = Annotation()
    annotation[Segment(1, 5)] = 'Carol'
    annotation[Segment(6, 8)] = 'Bob'
    annotation[Segment(12, 18)] = 'Carol'
    annotation[Segment(7, 20)] = 'Alice'
    continuum.add_annotation('liza', annotation)
    annotation = Annotation()
    annotation[Segment(2, 6)] = 'Carol'
    annotation[Segment(7, 8)] = 'Bob'
    annotation[Segment(12, 18)] = 'Alice'
    annotation[Segment(8, 10)] = 'Alice'
    annotation[Segment(7, 19)] = 'Jeremy'
    continuum.add_annotation('pierrot', annotation)

    categories = ['Carol', 'Bob', 'Alice', 'Jeremy']
    cat = np.array([[0, 0.5, 0.3, 0.7],
                    [0.5, 0., 0.6, 0.4],
                    [0.3, 0.6, 0., 0.7],
                    [0.7, 0.4, 0.7, 0.]])
    combi_dis = CombinedCategoricalDissimilarity(
        categories=categories,
        delta_empty=0.5,
        cat_dissimilarity_matrix=cat,
        alpha=3,
        beta=1)

    list_dis = []
    for liza_unit in continuum['liza']:
        for pierrot_unit in continuum['pierrot']:
            unit_alignment = UnitaryAlignment(
                (("liza", liza_unit), ("pierrot", pierrot_unit)))
            list_dis.append(unit_alignment.compute_disorder(combi_dis))
    print(len(list_dis))

    assert list_dis == pytest.approx([
        0.09375, 5.11, 2.69375, 6.15, 8.790000000000001, 1.75,
        0.16666666666666666, 1.3020408163265305, 1.8, 6.3,
        2.0237024221453286, 1.4020408163265305, 0.3524, 0.8066666666666665,
        0.20360110803324097, 7.260000000000002, 7.137755102040815,
        0.5166666666666666, 3.525, 0.15
    ], 0.001)

    unit_align_a = UnitaryAlignment(
        (("liza", Unit(Segment(1, 5), "Carol")),
         ("pierrot", Unit(Segment(7, 19), "Jeremy"))))
    unit_align_b = UnitaryAlignment((
        ("pierrot", Unit(Segment(7, 19), "Jeremy")),
        ("liza", Unit(Segment(1, 5), "Carol")),
    ))
    assert (unit_align_a.compute_disorder(combi_dis)
            == unit_align_b.compute_disorder(combi_dis))

    same_align = UnitaryAlignment(
        (("liza", Unit(Segment(1, 5), "Carol")),
         ("pierrot", Unit(Segment(1, 5), "Carol"))))
    assert same_align.compute_disorder(combi_dis) == np.float32(0.0)
def test_bug_16():
    reference = Annotation()
    reference[Segment(0, 10)] = 'A'
    hypothesis = Annotation()

    metric = DiarizationErrorRate(collar=1)
    total = metric(reference, hypothesis, detailed=True)['total']
    npt.assert_almost_equal(total, 9, decimal=3)

    metric = DiarizationErrorRate(collar=0)
    total = metric(reference, hypothesis, detailed=True)['total']
    npt.assert_almost_equal(total, 10, decimal=3)
def init_annotations(self):
    ref, hyp = {}, {}
    for ivecset in self.ivecs:
        if ivecset.size() > 0:
            name = ivecset.name
            # dirty trick, will be removed, watch out
            if 'beamformed' in name:
                name = re.sub('beamformed/', '', name)
            # # # # # # # # # # # # # # # # # # # # #
            name = re.sub('/.*', '', name)
            ref[name], hyp[name] = Annotation(), Annotation()
    return ref, hyp
def convert_labels(y_true, y_pred):
    reference = Annotation()
    hypothesis = Annotation()
    for i, (r, h) in enumerate(zip(y_true, y_pred)):
        segment = Segment(i, i + 1)
        if h != SILENCE:
            hypothesis[segment] = h
        if r != SILENCE:
            reference[segment] = r
    return hypothesis, reference
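# Hedged usage sketch for convert_labels: each frame index becomes a
# one-second segment, and SILENCE frames are left out of the annotations.
# SILENCE is assumed to be a module-level constant; the value below is an
# illustration only.
SILENCE = 'SIL'
y_true = ['A', 'A', SILENCE, 'B']
y_pred = ['A', SILENCE, 'B', 'B']
hypothesis, reference = convert_labels(y_true, y_pred)
# reference covers frames 0, 1 and 3; hypothesis covers frames 0, 2 and 3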
def test_extrude():
    annotation = Annotation()
    annotation[Segment(0, 10)] = "A"
    annotation[Segment(15, 20)] = "A"
    annotation[Segment(20, 35)] = "B"
    annotation[Segment(15, 25)] = "C"
    annotation[Segment(30, 35)] = "C"

    extrusion_tl = Timeline([Segment(5, 12), Segment(14, 25)])

    intersection_expected = Annotation()
    intersection_expected[Segment(0, 5)] = "A"
    intersection_expected[Segment(25, 35)] = "B"
    intersection_expected[Segment(30, 35)] = "C"
    assert (annotation.extrude(extrusion_tl, mode="intersection")
            == intersection_expected)

    loose_expected = Annotation()
    loose_expected[Segment(30, 35)] = "C"
    assert annotation.extrude(extrusion_tl, mode="loose") == loose_expected

    strict_expected = Annotation()
    strict_expected[Segment(0, 10)] = "A"
    strict_expected[Segment(20, 35)] = "B"
    strict_expected[Segment(30, 35)] = "C"
    assert annotation.extrude(extrusion_tl, mode="strict") == strict_expected
def calculate_der(reference_filename, hypothesis_filename):
    lbls = Util.read_audacity_labels(reference_filename)
    reference = Annotation()
    for lbl in lbls:
        reference[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    predicted_lbls = Util.read_audacity_labels(hypothesis_filename)
    hypothesis = Annotation()
    for lbl in predicted_lbls:
        if lbl.label != 'non_speech':
            hypothesis[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    metric = DiarizationErrorRate()
    der = metric(reference, hypothesis)
    return der
def _partition(self, timeline, coverage):

    # boundaries (as set of timestamps)
    boundaries = set([])
    for segment in timeline:
        boundaries.add(segment.start)
        boundaries.add(segment.end)

    # partition (as timeline)
    partition = Annotation()
    for start, end in pairwise(sorted(boundaries)):
        segment = Segment(start, end)
        partition[segment] = '_'

    return partition.crop(coverage, mode='intersection').anonymize_tracks()
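# Standalone illustration of the boundary-partition idea above, under the
# assumption that pairwise is the usual itertools recipe. Overlapping
# segments [0, 4] and [2, 6] yield the partition [0, 2], [2, 4], [4, 6].
from itertools import tee

from pyannote.core import Segment, Timeline


def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)


timeline = Timeline([Segment(0, 4), Segment(2, 6)])
boundaries = sorted({t for s in timeline for t in (s.start, s.end)})
print([Segment(start, end) for start, end in pairwise(boundaries)])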
def test_crop_strict(annotation):
    expected = Annotation(
        uri='TheBigBangTheory.Season01.Episode01', modality='speaker')
    expected[Segment(5.5, 7), '_'] = 'Leonard'
    actual = annotation.crop(Segment(5, 9), mode='strict')
    assert actual == expected, str(actual)
def test_from_json(annotation):
    # Check that we can reconstruct an annotation from the dict
    # returned by for_json.
    data = annotation.for_json()
    actual = Annotation.from_json(data)
    expected = annotation
    assert actual == expected
def test_from_records(annotation):
    # Check that we can reconstruct an annotation from the
    # output of itertracks.
    records = annotation.itertracks(yield_label=True)
    actual = Annotation.from_records(records)
    expected = annotation
    assert actual == expected
def reference():
    reference = Annotation()
    reference[Segment(0, 5)] = 'A'
    reference[Segment(6, 10)] = 'B'
    reference[Segment(12, 14)] = 'A'
    reference[Segment(15, 20)] = 'C'
    return reference
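# A minimal sketch (not part of the original fixtures) of how such a
# reference is typically consumed: build a hypothesis and score it with
# pyannote.metrics. The hypothesis labels below are made up for illustration.
from pyannote.metrics.diarization import DiarizationErrorRate

hypothesis = Annotation()
hypothesis[Segment(0, 5)] = 'a'
hypothesis[Segment(6, 10)] = 'b'
hypothesis[Segment(12, 20)] = 'a'

metric = DiarizationErrorRate()
print(metric(reference(), hypothesis))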
def clip_to_annotations(clip_number, lena_mappings, human_mappings):
    """
    Returns (human_annotation, lena_annotation)
    """
    df = pd.read_csv(METADATA_PATH, index_col='ClipNumber')
    its_filename = df.loc[clip_number].ProcessingFile
    chat_filename = 'e{}.cha'.format(its_filename.split('.')[0])
    textgrid_filename = 'Clip{}.TextGrid'.format(clip_number)

    lena_dict = lena_chat_to_dict(os.path.join(CHAT_PATH, chat_filename))
    textgrid_dict = textgrid_to_dict(os.path.join(TEXTGRID_PATH, textgrid_filename))

    # remap
    lena_dict = remap(lena_dict, lena_mappings)
    textgrid_dict = remap(textgrid_dict, human_mappings)

    # set default (silence) class
    lena_annotation = dict_to_annotation(lena_dict, lena_mappings['SIL'])
    human_annotation = dict_to_annotation(textgrid_dict, human_mappings['Silence'])

    start_time = df.loc[clip_number].StartTimeS
    end_time = start_time + 300  # 5 minutes

    # The crop doesn't begin at 0, but at start_time, so we need to shift it left.
    lena_cropped = lena_annotation.crop(Segment(start_time, end_time))
    lena_annotation_shifted = Annotation()
    for segment, track, label in lena_cropped.itertracks(yield_label=True):
        shifted_segment = Segment(segment.start - start_time,
                                  segment.end - start_time)
        lena_annotation_shifted[shifted_segment, track] = label

    return human_annotation, lena_annotation_shifted
def run(self):

    with self.in_subtitles().open('r') as fp:
        transcription = pyannote.core.json.load(fp)

    annotation = Annotation()
    label = 0
    for start, end, edge in transcription.ordered_edges_iter(data=True):
        if 'subtitle' not in edge:
            continue
        segment = Segment(start, end)
        annotation[segment] = label
        label += 1

    annotation = annotation.anonymize_labels(generator='string')

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(annotation, fp)
def _xxx_iter(self, subset):

    data = self._load_data(subset)

    AnnotatedGroups = data['annotated'].groupby(by='uri')
    AnnotationGroups = data['annotation'].groupby(by='uri')

    for raw_uri, annotated in AnnotatedGroups:

        uri = f'{raw_uri}.Mix-Headset'

        segments = []
        for segment in annotated.itertuples():
            segments.append(Segment(start=segment.start, end=segment.end))

        annotation = Annotation(uri=uri)
        for t, turn in enumerate(
                AnnotationGroups.get_group(raw_uri).itertuples()):
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            annotation[segment, t] = turn.speaker

        current_file = {
            'database': 'Test',
            'uri': uri,
            'annotated': Timeline(uri=uri, segments=segments),
            'annotation': annotation,
        }

        yield current_file
def _as_scores(self, raw, features, segmentation):

    if isinstance(segmentation, Timeline):
        annotation = Annotation(uri=segmentation.uri)
        for segment in segmentation:
            annotation[segment] = '?'
        segmentation = annotation

    # convert to pyannote-style & aggregate over each segment
    scores = Scores(uri=segmentation.uri,
                    modality=segmentation.modality,
                    annotation=segmentation,
                    labels=list(self.label_converter_))

    sliding_window = features.sliding_window
    for segment, track in segmentation.itertracks():
        # extract raw for all features in segment and aggregate
        i_start, i_duration = sliding_window.segmentToRange(segment)
        p = np.mean(raw[i_start:i_start + i_duration, :], axis=0)
        for i, label in enumerate(self.label_converter_):
            scores[segment, track, label] = p[i]

    return scores
def reference():
    reference = Annotation()
    reference[Segment(0, 10)] = 'A'
    reference[Segment(12, 20)] = 'B'
    reference[Segment(24, 27)] = 'A'
    reference[Segment(30, 40)] = 'C'
    return reference
def reference_with_overlap():
    reference = Annotation()
    reference[Segment(0, 13)] = 'A'
    reference[Segment(12, 20)] = 'B'
    reference[Segment(24, 27)] = 'A'
    reference[Segment(30, 40)] = 'C'
    return reference
def __call__(self):

    # list of chronologically sorted list of shots
    graph = self._threads_graph()
    threads = [sorted(cc) for cc in nx.connected_components(graph)]

    annotation = Annotation()
    labelGenerator = getLabelGenerator()

    # chronologically sorted threads (based on their first shot)
    for thread in sorted(threads, key=lambda thread: thread[0]):
        label = next(labelGenerator)
        for shot in thread:
            annotation[shot] = label

    return annotation.smooth()
def _decode(
    self,
    current_file: ProtocolFile,
    hypothesis: Annotation,
    scores: SlidingWindowFeature,
    labels: Iterable,
) -> Annotation:

    N, K = scores.data.shape

    if self.allow_overlap:
        active_speakers = scores.data > 0.5
    else:
        if self.lock_speech:
            active_speakers = np.argmax(scores.data, axis=1) + 1
        else:
            active_speakers = np.argmax(scores.data, axis=1)

    # reconstruct annotation
    new_hypothesis = one_hot_decoding(
        active_speakers, scores.sliding_window, labels=labels)
    new_hypothesis.uri = hypothesis.uri

    if self.lock_speech:
        speech = hypothesis.get_timeline().support()
        new_hypothesis = new_hypothesis.crop(speech)

    return new_hypothesis
def load_mdtm(file_mdtm):
    """Load MDTM file

    Parameters
    ----------
    file_mdtm : `str`
        Path to MDTM file.

    Returns
    -------
    annotations : `dict`
        Speaker diarization as a {uri: pyannote.core.Annotation} dictionary.
    """
    names = ['uri', 'NA1', 'start', 'duration', 'NA2', 'NA3', 'NA4', 'speaker']
    dtype = {'uri': str, 'start': float, 'duration': float, 'speaker': str}
    data = pd.read_csv(file_mdtm, names=names, dtype=dtype,
                       delim_whitespace=True)

    annotations = dict()
    for uri, turns in data.groupby('uri'):
        annotation = Annotation(uri=uri)
        for i, turn in turns.iterrows():
            segment = Segment(turn.start, turn.start + turn.duration)
            annotation[segment, i] = turn.speaker
        annotations[uri] = annotation

    return annotations
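# Hypothetical usage of load_mdtm; the path is a placeholder. Each value in
# the returned dictionary is a pyannote.core.Annotation keyed by its uri.
annotations = load_mdtm('diarization.mdtm')
for uri, annotation in annotations.items():
    print(uri, annotation.labels())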
def rttm_to_annotation(input_rttm,
                       collapse_to_speech=False,
                       class_to_keep=None):
    """
    Given a path to a rttm file, create the corresponding Annotation object
    containing the triplets (t_beg, t_end, activity)

    Parameters
    ----------
    input_rttm
        A path to a rttm file that must exist.

    Returns
    -------
    An Annotation object.
    """
    anno = Annotation(uri=input_rttm)
    if os.path.isfile(input_rttm):
        with open(input_rttm) as fn:
            for line in fn:
                row = line.split('\t')
                t_beg, t_dur, spkr = float(row[3]), float(row[4]), row[7]
                if row[7] == "":
                    raise ValueError("Speaker role is empty in %s"
                                     % os.path.basename(input_rttm))
                if class_to_keep is not None and spkr == class_to_keep:
                    # Keep only class of interest
                    anno[Segment(t_beg, t_beg + t_dur)] = spkr
                elif class_to_keep is None:
                    # Keep all classes
                    anno[Segment(t_beg, t_beg + t_dur)] = spkr
    return anno
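# Hypothetical call (the path and class name are placeholders): keep only
# one class from a tab-separated RTTM file. When the file does not exist,
# the function silently returns an empty Annotation.
anno = rttm_to_annotation('segments.rttm', class_to_keep='SPEECH')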
def preprocess(self, openface):
    """
    Parameters
    ----------
    openface : str
        Path to Openface features
    """
    # TODO : option to only keep 'detections'
    # (make sure it does not alter 'starting_point' segments)

    names = ['time', 'track']
    for i in range(128):
        names += ['d{0}'.format(i)]
    data = read_table(openface, delim_whitespace=True,
                      header=None, names=names)

    features = data.groupby('track')

    starting_point = Annotation(modality='face')
    for track, segment in features.apply(self._to_segment).iteritems():
        if not segment:
            continue
        starting_point[segment, track] = track

    return starting_point, features
def load_mdtm(file_mdtm):
    """Load MDTM file

    Parameters
    ----------
    file_mdtm : `str`
        Path to MDTM file.

    Returns
    -------
    annotations : `dict`
        Speaker diarization as a {uri: pyannote.core.Annotation} dictionary.
    """
    names = ["uri", "NA1", "start", "duration", "NA2", "NA3", "NA4", "speaker"]
    dtype = {"uri": str, "start": float, "duration": float, "speaker": str}
    data = pd.read_csv(
        file_mdtm,
        names=names,
        dtype=dtype,
        delim_whitespace=True,
        keep_default_na=False,
    )

    annotations = dict()
    for uri, turns in data.groupby("uri"):
        annotation = Annotation(uri=uri)
        for i, turn in turns.iterrows():
            segment = Segment(turn.start, turn.start + turn.duration)
            annotation[segment, i] = turn.speaker
        annotations[uri] = annotation

    return annotations
def _partition(self, timeline, coverage):

    # boundaries (as set of timestamps)
    boundaries = set([])
    for segment in timeline:
        boundaries.add(segment.start)
        boundaries.add(segment.end)

    # partition (as timeline)
    partition = Annotation()
    for start, end in pairwise(sorted(boundaries)):
        segment = Segment(start, end)
        partition[segment] = '_'

    cropped = partition.crop(coverage, mode='intersection')
    return cropped.anonymize_tracks()
def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / rate
    extent = Segment(0., duration)

    with self.in_speaker().open('r') as fp:
        speaker = pyannote.core.json.load(fp)

    timeline = Timeline()
    for segment, _ in speaker.itertracks():
        timeline.add(segment)

    # fill gaps
    for gap in timeline.gaps(extent):
        if gap.duration < self.fill_gaps:
            timeline.add(gap)

    timeline = timeline.coverage()

    # dump as annotation...
    if self.to_annotation:

        annotation = Annotation()
        for s, segment in enumerate(timeline):
            annotation[segment] = s
        annotation = annotation.anonymize_labels(generator='string')

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)

    # ... or as timeline
    else:

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(timeline, fp)
def trn_iter(self):

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('identification').get_group('trn')

    for uri, rows in data.groupby('uri'):
        annotation = Annotation(uri=uri)
        for row in rows.itertuples():
            segment = Segment(row.start, row.end)
            annotation[segment] = row.speaker
        annotated = annotation.get_timeline()
        current_file = {
            'uri': uri,
            'database': 'VoxCeleb',
            'annotation': annotation,
            'annotated': annotated,
        }
        yield current_file
def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / rate
    extent = Segment(0., duration)

    with self.in_speaker().open('r') as fp:
        speaker = pyannote.core.json.load(fp)

    segmentation = Annotation()
    for segment, _ in speaker.itertracks():
        segmentation[segment] = 'speech'
    segmentation = segmentation.smooth()

    for gap in segmentation.get_timeline().gaps(extent):
        segmentation[gap] = 'non_speech'
    segmentation = segmentation.smooth()

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(segmentation, fp)
def iter_triplets(self, from_annotation):
    """Yield (anchor, positive, negative) segment triplets

    Parameters
    ----------
    from_annotation : Annotation
        Annotation from which triplets are obtained.
    """
    t = RandomTrackTriplets(per_label=self.per_label,
                            yield_label=self.yield_label)

    annotation = Annotation(uri=from_annotation.uri,
                            modality=from_annotation.modality)
    for segment, track, label in from_annotation.itertracks(label=True):
        if segment.duration < self.duration:
            continue
        annotation[segment, track] = label

    if len(annotation.labels()) < 2:
        return

    triplets = t.iter_triplets(annotation)

    for triplet in triplets:
        a, p, n = [item[0] for item in triplet]
        if self.duration:
            a, p, n = [self.pick(s) for s in (a, p, n)]
        if self.yield_label:
            a_, p_, n_ = [item[2] for item in triplet]
            yield (a, a_), (p, p_), (n, n_)
        else:
            yield a, p, n
def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / rate
    extent = Segment(0., duration)

    with self.in_subtitles().open('r') as fp:
        transcription = pyannote.core.json.load(fp)

    annotation = Annotation()
    for start, end, edge in transcription.ordered_edges_iter(data=True):
        if 'subtitle' not in edge:
            continue
        segment = Segment(start, end)
        annotation[segment] = 'speech'

    for gap in annotation.get_timeline().gaps(extent):
        annotation[gap] = 'non_speech'

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(annotation, fp)
def regression(self, reference, before, after, uem=None, uemified=False):

    _, before, errors_before = self.difference(
        reference, before, uem=uem, uemified=True)

    reference, after, errors_after = self.difference(
        reference, after, uem=uem, uemified=True)

    behaviors = Annotation(uri=reference.uri, modality=reference.modality)

    # common (up-sampled) timeline
    common_timeline = errors_after.get_timeline().union(
        errors_before.get_timeline())
    common_timeline = common_timeline.segmentation()

    # align 'before' errors on common timeline
    B = self._tagger(errors_before, common_timeline)

    # align 'after' errors on common timeline
    A = self._tagger(errors_after, common_timeline)

    for segment in common_timeline:

        old_errors = B.get_labels(segment, unique=False)
        new_errors = A.get_labels(segment, unique=False)

        n1 = len(old_errors)
        n2 = len(new_errors)
        n = max(n1, n2)
        match = np.zeros((n, n), dtype=int)
        for i1, e1 in enumerate(old_errors):
            for i2, e2 in enumerate(new_errors):
                match[i1, i2] = self._match_errors(e1, e2)

        mapping = self.munkres.compute(2 - match)

        for i1, i2 in mapping:

            if i1 >= n1:
                track = behaviors.new_track(segment,
                                            candidate=REGRESSION,
                                            prefix=REGRESSION)
                behaviors[segment, track] = (REGRESSION, None, new_errors[i2])

            elif i2 >= n2:
                track = behaviors.new_track(segment,
                                            candidate=IMPROVEMENT,
                                            prefix=IMPROVEMENT)
                behaviors[segment, track] = (IMPROVEMENT, old_errors[i1], None)

            elif old_errors[i1][0] == MATCH_CORRECT:

                if new_errors[i2][0] == MATCH_CORRECT:
                    track = behaviors.new_track(segment,
                                                candidate=BOTH_CORRECT,
                                                prefix=BOTH_CORRECT)
                    behaviors[segment, track] = (BOTH_CORRECT,
                                                 old_errors[i1],
                                                 new_errors[i2])
                else:
                    track = behaviors.new_track(segment,
                                                candidate=REGRESSION,
                                                prefix=REGRESSION)
                    behaviors[segment, track] = (REGRESSION,
                                                 old_errors[i1],
                                                 new_errors[i2])

            else:

                if new_errors[i2][0] == MATCH_CORRECT:
                    track = behaviors.new_track(segment,
                                                candidate=IMPROVEMENT,
                                                prefix=IMPROVEMENT)
                    behaviors[segment, track] = (IMPROVEMENT,
                                                 old_errors[i1],
                                                 new_errors[i2])
                else:
                    track = behaviors.new_track(segment,
                                                candidate=BOTH_INCORRECT,
                                                prefix=BOTH_INCORRECT)
                    behaviors[segment, track] = (BOTH_INCORRECT,
                                                 old_errors[i1],
                                                 new_errors[i2])

    behaviors = behaviors.smooth()

    if uemified:
        return reference, before, after, behaviors
    else:
        return behaviors
names = ['time', 'track_id', 'left', 'top', 'right', 'bottom']
face_tracking = pd.read_table(path, delim_whitespace=True,
                              header=None, names=names)

pyannote_face = Annotation(uri=uri)
for track_id, track in face_tracking.groupby('track_id'):
    start = track['time'].min()
    end = track['time'].max()
    label = mapping.get(track_id, None)
    if label is None:
        SKIP = 'Skipping track #{track_id} ({duration:d} ms) in {video_id}'
        print(SKIP.format(track_id=track_id,
                          duration=int(1000 * (end - start)),
                          video_id=video_id))
    pyannote_face[Segment(start, end), track_id] = label

# load names as pyannote.Annotation
path = OCR.format(repository=REPOSITORY, uri=uri)
names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence']
pyannote_ocr = Annotation(uri=uri)
try:
    ocr = pd.read_table(path, delim_whitespace=True,
                        header=None, names=names)
    for _, (start, end, _, _, name, _) in ocr.iterrows():
        pyannote_ocr[Segment(start, end)] = name
except pandas.parser.CParserError as e:
    pass

# name each person by most co-occurring OCR name
if not pyannote_ocr:
    named_face = Annotation(uri=uri)
else:
    named_face = argmax_tagger(pyannote_ocr, pyannote_face)
    named_face = named_face.subset(pyannote_ocr.labels())

path = FUSION.format(repository=REPOSITORY, uri=uri)
def read(self, path, uri=None, modality=None, **kwargs):
    """
    Parameters
    ----------
    path : str
    modality : str, optional
        Force all entries to be considered as coming from this modality.
        Only taken into account when file format does not provide
        any field related to modality (e.g. .seg files)
    """
    # load whole file
    df = pandas.read_table(path,
                           delim_whitespace=True,
                           header=None,
                           names=self.fields(),
                           comment=self.comment(),
                           converters=self.converters(),
                           dtype={PYANNOTE_LABEL: object})

    # remove comment lines
    # (i.e. lines for which all fields are either None or NaN)
    keep = [not all(pandas.isnull(item) for item in row[1:])
            for row in df.itertuples()]
    df = df[keep]

    # add 'segment' column built from start time & duration
    df[PYANNOTE_SEGMENT] = [self.get_segment(row)
                            for row in df.itertuples()]

    # add unique track numbers if they are not read from file
    if PYANNOTE_TRACK not in self.fields():
        df[PYANNOTE_TRACK] = range(df.shape[0])

    # add uri column in case it does not exist
    if PYANNOTE_URI not in df:
        if uri is None:
            raise ValueError('missing uri -- use uri=')
        df[PYANNOTE_URI] = uri

    # obtain list of resources
    uris = list(df[PYANNOTE_URI].unique())

    # add modality column in case it does not exist
    if PYANNOTE_MODALITY not in df:
        if modality is None:
            raise ValueError('missing modality -- use modality=')
        df[PYANNOTE_MODALITY] = modality if modality is not None else ""

    # obtain list of modalities
    modalities = list(df[PYANNOTE_MODALITY].unique())

    self._loaded = {}

    # loop on resources
    for uri in uris:

        # filter based on resource
        df_ = df[df[PYANNOTE_URI] == uri]

        # loop on modalities
        for modality in modalities:

            # filter based on modality
            modality = modality if modality is not None else ""
            df__ = df_[df_[PYANNOTE_MODALITY] == modality]

            a = Annotation.from_df(df__, modality=modality, uri=uri)
            self._loaded[uri, modality] = a

    return self
def __call__(self, reference, hypothesis):

    if isinstance(reference, Annotation):
        reference = reference.get_timeline()

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # over-segmentation
    over = Timeline(uri=reference.uri)
    prev_r = reference[0]
    intersection = []
    for r, h in reference.co_iter(hypothesis):

        if r != prev_r:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                over.add(segment)
            intersection = []
            prev_r = r

        segment = r & h
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        over.add(segment)

    # under-segmentation
    under = Timeline(uri=reference.uri)
    prev_h = hypothesis[0]
    intersection = []
    for h, r in hypothesis.co_iter(reference):

        if h != prev_h:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                under.add(segment)
            intersection = []
            prev_h = h

        segment = h & r
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        under.add(segment)

    # extent
    extent = reference.extent()

    # correct (neither under- nor over-segmented)
    correct = under.union(over).gaps(focus=extent)

    # frontier error (both under- and over-segmented)
    frontier = under.crop(over)

    # under-segmented
    not_over = over.gaps(focus=extent)
    only_under = under.crop(not_over)

    # over-segmented
    not_under = under.gaps(focus=extent)
    only_over = over.crop(not_under)

    status = Annotation(uri=reference.uri)
    for segment in correct:
        status[segment, '_'] = 'correct'
    for segment in frontier:
        status[segment, '_'] = 'frontier'
    for segment in only_over:
        status[segment, '_'] = 'over'
    for segment in only_under:
        status[segment, '_'] = 'under'

    return status.smooth()
dic_trackID_st_to_speakingFace = {}
for s, t, st in sd.itertracks(label=True):
    dic_trackID_st_to_speakingFace[t] = ['', thr_propagation]

for line in open(args['<mat_speaking_face>'] + '/' + videoID + '.mat').read().splitlines():
    TrackID_st, TrackID_Face, proba = line.split(' ')
    if float(proba) > dic_trackID_st_to_speakingFace[int(TrackID_st)][1]:
        dic_trackID_st_to_speakingFace[int(TrackID_st)] = [int(TrackID_Face), float(proba)]

trackID_face_to_name = {}
for s, t, name in NamedSpk.itertracks(label=True):
    if dic_trackID_st_to_speakingFace[t][0] != '':
        trackID_face_to_name[dic_trackID_st_to_speakingFace[t][0]] = name

namedFaces = Annotation(uri=videoID)
for s, t, faceID in faces.itertracks(label=True):
    if t in trackID_face_to_name:
        namedFaces[s, t] = trackID_face_to_name[t]

# write person visible and speaking in a shot:
for sshot, tshot, shot in shots.itertracks(label=True):
    NamedSpkShot = NamedSpk.crop(sshot)
    NamedFaceShot = namedFaces.crop(sshot)
    PersonShot = set(NamedSpkShot.labels()) & set(NamedFaceShot.labels())
    for p in (PersonShot & set(evidences.keys())):
        conf = 0.0
        for sSpk in NamedSpkShot.label_timeline(p):
            for sON, tON, name in ON.itertracks(label=True):
                if name == p:
uri = corpus_id + '/' + video_id

# load shots as pyannote.Annotation
path = SHOTS.format(repository=REPOSITORY, uri=uri)
names = ['corpus_id', 'video_id', 'shot_id', 'start', 'end']
dtype = {'shot_id': str}
shots = pd.read_table(path, delim_whitespace=True,
                      header=None, names=names, dtype=dtype)
pyannote_shots = Annotation(uri=uri)
for _, (_, _, shot_id, start, end) in shots.iterrows():
    pyannote_shots[Segment(start, end), shot_id] = shot_id

# load speaker diarization as pyannote.Annotation
path = SPEAKERS.format(repository=REPOSITORY, uri=uri)
names = ['corpus_id', 'video_id', 'start', 'end', 'label', 'gender']
speakers = pd.read_table(path, delim_whitespace=True,
                         header=None, names=names)
pyannote_speakers = Annotation(uri=uri)
for _, (_, _, start, end, label, _) in speakers.iterrows():
    pyannote_speakers[Segment(start, end)] = label
pyannote_speakers = pyannote_speakers.anonymize_labels(generator='int')

# load names as pyannote.Annotation
path = OCR.format(repository=REPOSITORY, uri=uri)
names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence']
pyannote_ocr = Annotation(uri=uri)
try:
    ocr = pd.read_table(path, delim_whitespace=True,
                        header=None, names=names)
    for _, (start, end, _, _, name, _) in ocr.iterrows():
        pyannote_ocr[Segment(start, end)] = name
except pandas.parser.CParserError as e:
    pass
def difference(self, reference, hypothesis, uem=None, uemified=False):
    """Get error analysis as `Annotation`

    Labels are (status, reference_label, hypothesis_label) tuples.
    `status` is either 'correct', 'confusion', 'missed detection' or
    'false alarm'.
    `reference_label` is None in case of 'false alarm'.
    `hypothesis_label` is None in case of 'missed detection'.

    Parameters
    ----------
    uemified : bool, optional
        Returns "uemified" version of reference and hypothesis.
        Defaults to False.

    Returns
    -------
    errors : `Annotation`
    """
    reference, hypothesis = self.uemify(
        reference, hypothesis, uem=uem, collar=self.collar)

    reference, hypothesis = self._handle_unknowns(reference, hypothesis)

    # common (up-sampled) timeline
    common_timeline = reference.get_timeline().union(
        hypothesis.get_timeline())
    common_timeline = common_timeline.segmentation()

    # align reference on common timeline
    R = self._tagger(reference, common_timeline)

    # translate and align hypothesis on common timeline
    H = self._tagger(hypothesis, common_timeline)

    errors = Annotation(uri=reference.uri, modality=reference.modality)

    # loop on all segments
    for segment in common_timeline:

        # list of labels in reference segment
        rlabels = R.get_labels(segment, unknown=self.unknown, unique=False)

        # list of labels in hypothesis segment
        hlabels = H.get_labels(segment, unknown=self.unknown, unique=False)

        _, details = self.matcher(rlabels, hlabels)

        for r, h in details[MATCH_CORRECT]:
            track = errors.new_track(segment, prefix=MATCH_CORRECT)
            errors[segment, track] = (MATCH_CORRECT, r, h)

        for r, h in details[MATCH_CONFUSION]:
            track = errors.new_track(segment, prefix=MATCH_CONFUSION)
            errors[segment, track] = (MATCH_CONFUSION, r, h)

        for r in details[MATCH_MISSED_DETECTION]:
            track = errors.new_track(segment, prefix=MATCH_MISSED_DETECTION)
            errors[segment, track] = (MATCH_MISSED_DETECTION, r, None)

        for h in details[MATCH_FALSE_ALARM]:
            track = errors.new_track(segment, prefix=MATCH_FALSE_ALARM)
            errors[segment, track] = (MATCH_FALSE_ALARM, None, h)

    if uemified:
        return reference, hypothesis, errors
    else:
        return errors