def get_annotated(current_file):

    # if protocol provides 'annotated' key, use it
    if 'annotated' in current_file:
        annotated = current_file['annotated']
        return annotated

    # if it does not, but does provide 'wav' key,
    # try and use wav duration
    if 'wav' in current_file:
        wav = current_file['wav']
        try:
            from pyannote.audio.features.utils import get_wav_duration
            duration = get_wav_duration(wav)
        except ImportError:
            pass
        else:
            warnings.warn('"annotated" was approximated by "wav" duration.')
            annotated = Timeline([Segment(0, duration)])
            return annotated

    warnings.warn('"annotated" was approximated by "annotation" extent.')
    extent = current_file['annotation'].get_timeline().extent()
    annotated = Timeline([extent])
    return annotated
def test_get_overlap():
    annotation = Annotation()
    annotation[Segment(0, 5)] = "A"
    annotation[Segment(10, 15)] = "A"
    annotation[Segment(20, 25)] = "A"
    annotation[Segment(0, 10)] = "B"
    annotation[Segment(15, 25)] = "B"
    annotation[Segment(5, 10)] = "C"
    annotation[Segment(20, 30)] = "C"

    assert (annotation.get_overlap()
            == Timeline([Segment(0, 10), Segment(20, 25)]))

    assert (annotation.get_overlap(["A", "B"])
            == Timeline([Segment(0, 5), Segment(20, 25)]))

    assert (annotation.get_overlap(["A", "C"])
            == Timeline([Segment(20, 25)]))

    assert (annotation.get_overlap(["B", "C"])
            == Timeline([Segment(5, 10), Segment(20, 25)]))
def apply(self, features, segmentation=None):
    """
    Parameters
    ----------
    features : Features
    segmentation : Timeline, optional
    """
    if segmentation is None:
        segmentation = Timeline(segments=[features.getExtent()])

    sliding_window = features.sliding_window
    min_samples = sliding_window.durationToSamples(self.min_duration)
    precision = sliding_window.durationToSamples(self.precision)

    segmenter = SKLearnBICSegmentation(
        penalty_coef=self.penalty_coef,
        covariance_type=self.covariance_type,
        min_samples=min_samples,
        precision=precision)

    result = Timeline()

    for long_segment in segmentation:

        X = features.crop(long_segment)
        boundaries = segmenter.apply(X)

        for t, T in pairwise(boundaries):
            segment = sliding_window.rangeToSegment(t, T - t)
            shifted_segment = Segment(long_segment.start + segment.start,
                                      long_segment.start + segment.end)
            result.add(shifted_segment)

    return result
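# A minimal sketch of the `pairwise` helper that `apply` above relies on.
# Its definition is not part of this file; the semantics are assumed from
# usage: yield consecutive boundary pairs (b0, b1), (b1, b2), ... so that
# each (t, T) pair delimits one BIC segment.
from itertools import tee


def pairwise(iterable):
    # duplicate the iterator and advance the copy by one step
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)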
def test_union_extent():
    first_timeline = Timeline([Segment(0, 1),
                               Segment(2, 3),
                               Segment(4, 5)])
    second_timeline = Timeline([Segment(1.5, 6)])

    union_timeline = first_timeline.union(second_timeline)
    assert union_timeline.extent() == Segment(0, 6)
def test_initialized_with_empty_segments():
    # The first timeline includes empty segments.
    first_timeline = Timeline([Segment(1, 5), Segment(6, 6),
                               Segment(7, 7), Segment(8, 10)])

    # The second has no empty segments.
    second_timeline = Timeline([Segment(1, 5), Segment(8, 10)])

    assert first_timeline == second_timeline
def test_crop_mapping():
    timeline = Timeline([Segment(0, 2), Segment(1, 2), Segment(3, 4)])
    cropped, mapping = timeline.crop(Segment(1, 2), returns_mapping=True)

    expected_cropped = Timeline([Segment(1, 2)])
    assert cropped == expected_cropped

    expected_mapping = {Segment(1, 2): [Segment(0, 2), Segment(1, 2)]}
    assert mapping == expected_mapping
def test_consistent_timelines_with_empty_segments():
    # The first timeline is initialized with Segments, some empty.
    first_timeline = Timeline([Segment(1, 5), Segment(6, 6),
                               Segment(7, 7), Segment(8, 10)])

    # The second timeline adds one Segment at a time, including empty ones.
    second_timeline = Timeline()
    second_timeline.add(Segment(1, 5))
    second_timeline.add(Segment(6, 6))
    second_timeline.add(Segment(7, 7))
    second_timeline.add(Segment(8, 10))

    assert first_timeline == second_timeline
def test_timeline_overlaps():
    overlapped_tl = Timeline(uri="La menuiserie mec")
    overlapped_tl.add(Segment(0, 10))
    overlapped_tl.add(Segment(5, 10))
    overlapped_tl.add(Segment(15, 20))
    overlapped_tl.add(Segment(18, 23))

    expected_overlap = Timeline()
    expected_overlap.add(Segment(5, 10))
    expected_overlap.add(Segment(18, 20))

    assert expected_overlap == overlapped_tl.get_overlap()
def test_added_empty_segments():
    # The first timeline includes empty segments.
    first_timeline = Timeline()
    first_timeline.add(Segment(1, 5))
    first_timeline.add(Segment(6, 6))
    first_timeline.add(Segment(7, 7))
    first_timeline.add(Segment(8, 10))

    # The second has no empty segments.
    second_timeline = Timeline()
    second_timeline.add(Segment(1, 5))
    second_timeline.add(Segment(8, 10))

    assert first_timeline == second_timeline
def tst_iter(self):
    # absolute path to 'data' directory where annotations are stored
    data_dir = Path(__file__).parent / 'data' / 'speaker_diarization'

    annotated = data_dir / 'fullset.uem'
    names = ['uri', 'NA0', 'start', 'end']
    annotated = read_table(annotated, delim_whitespace=True, names=names)
    annotated_segments = {}
    for segment in annotated.itertuples():
        annotated_segments[segment.uri] = Segment(start=segment.start,
                                                  end=segment.end)

    # iterate through the text annotation files
    for filename in os.listdir(data_dir):
        if filename.endswith(".txt"):
            uri, _ = os.path.splitext(os.path.basename(filename))
            annotation = Annotation(uri=uri)

            names = ['start', 'end', 'speaker', 'speakerID']
            parsed_file = read_table(os.path.join(data_dir, filename),
                                     delim_whitespace=True, names=names)
            for t, turn in enumerate(parsed_file.itertuples()):
                segment = Segment(start=turn.start, end=turn.end)
                annotation[segment, t] = turn.speakerID

            current_file = {
                'database': 'Odessa',
                'uri': uri,
                'annotated': Timeline(uri=uri,
                                      segments=[annotated_segments[uri]]),
                'annotation': annotation}

            yield current_file
def tst_enrol_iter(self):
    # load enrolments
    data_dir = Path(__file__).parent / 'data' / 'speaker_spotting'
    enrolments = data_dir / 'tst.enrol.txt'
    names = ['uri', 'NA0', 'start', 'duration',
             'NA1', 'NA2', 'NA3', 'model_id']
    enrolments = read_table(enrolments, delim_whitespace=True, names=names)

    for model_id, turns in enrolments.groupby(by=['uri', 'model_id']):

        # gather enrolment data
        segments = []
        uri = ''
        for t, turn in enumerate(turns.itertuples()):
            if t == 0:
                uri = turn.uri
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            if segment:
                segments.append(segment)
        enrol_with = Timeline(segments=segments, uri=uri)

        current_enrolment = {
            'database': 'Odessa',
            'uri': uri,
            'model_id': model_id[1],  # model_id
            'enrol_with': enrol_with,
        }

        yield current_enrolment
def _subset_enrollment(self, protocol, subset):
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    enrolments = op.join(
        data_dir,
        '{protocol}.{subset}.txt'.format(subset=subset, protocol=protocol))
    names = ['uri', 'NA0', 'start', 'duration',
             'NA1', 'NA2', 'NA3', 'model_id']
    enrolments = read_table(enrolments, delim_whitespace=True, names=names)

    for model_id, turns in enrolments.groupby(by='model_id'):

        # gather enrolment data
        segments = []
        for t, turn in enumerate(turns.itertuples()):
            if t == 0:
                raw_uri = turn.uri
                uri = f'{raw_uri}'
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            if segment:
                segments.append(segment)
        enrol_with = Timeline(segments=segments, uri=uri)

        current_enrolment = {
            'database': 'RTVE2018',
            'uri': uri,
            'model_id': model_id,
            'enrol_with': enrol_with,
        }

        yield current_enrolment
def _subset(self, protocol, subset):
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

    # load annotations
    path = op.join(
        data_dir,
        'librispeech-{protocol}.{subset}.mdtm'.format(subset=subset,
                                                      protocol=protocol))
    mdtms = self.mdtm_parser_.read(path)

    for uri in sorted(mdtms.uris):
        annotation = mdtms(uri)
        current_file = {
            'database': 'LibriSpeech',
            'uri': uri,
            'annotation': annotation,
            # annotated part as pyannote.core.Timeline instance
            'annotated': Timeline(
                uri=uri,
                segments=[annotation.get_timeline().extent()])
        }
        yield current_file
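# Hedged usage sketch: generators like `_subset` above follow the
# pyannote.database convention of yielding one dict per file. The helper
# below is hypothetical (not part of any protocol API) and only shows how
# such dictionaries are typically consumed.
from pyannote.core import Annotation, Timeline


def total_speech_duration(protocol_files):
    """Sum annotated speech duration over all yielded files."""
    total = 0.0
    for current_file in protocol_files:
        annotation: Annotation = current_file['annotation']
        annotated: Timeline = current_file['annotated']
        # keep only speech that falls within the annotated regions
        total += annotation.get_timeline().crop(annotated).duration()
    return total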
def __call__(self, sequence=Stream.NoNewData):
    if isinstance(sequence, More):
        sequence = sequence.output

    if sequence in [Stream.EndOfStream, Stream.NoNewData]:
        return sequence

    data = sequence.data
    active = data[0]

    sw = sequence.sliding_window
    start = sw[0].middle

    timeline = Timeline()
    timeline.start = start

    for i, y in enumerate(data):
        if active and not y:
            segment = Segment(start, sw[i].middle)
            timeline.add(segment)
            active = False
        elif not active and y:
            active = True
            start = sw[i].middle

    if active:
        segment = Segment(start, sw[i].middle)
        timeline.add(segment)

    timeline.end = sw[i].middle

    return timeline
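# The same binary-to-segments logic as `__call__` above, stripped of the
# streaming machinery; a sketch over plain (timestamp, activation) pairs.
# The helper name is hypothetical.
from pyannote.core import Segment, Timeline


def binary_to_timeline(timestamps, activations):
    timeline = Timeline()
    active, start = False, None
    for t, y in zip(timestamps, activations):
        if y and not active:
            active, start = True, t            # active region opens
        elif active and not y:
            timeline.add(Segment(start, t))    # active region closes
            active = False
    if active:
        # close the last region at the final timestamp
        timeline.add(Segment(start, timestamps[-1]))
    return timeline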
def predict(audio, algorithm='SpectralClustering'):
    # Speech Activation Detection
    sad_scores = sad(audio)
    binarize_sad = Binarize(offset=0.52, onset=0.52, log_scale=True,
                            min_duration_off=0.1, min_duration_on=0.1)
    speech = binarize_sad.apply(sad_scores, dimension=1)

    # Speaker Change Detection
    scd_scores = scd(audio)
    peak = Peak(alpha=0.10, min_duration=0.10, log_scale=True)
    partition = peak.apply(scd_scores, dimension=1)

    # Overlapped Speech Detection
    # ovl_scores = ovl(audio)
    # binarize_ovl = Binarize(offset=0.55, onset=0.55, log_scale=True,
    #                         min_duration_off=0.1, min_duration_on=0.1)
    # overlap = binarize_ovl.apply(ovl_scores, dimension=1)

    # Speaker Embedding
    speech_turns = partition.crop(speech)
    embeddings = emb(audio)

    long_turns = Timeline(
        segments=[s for s in speech_turns if s.duration > .5])

    return long_turns, sad_scores, scd_scores, embeddings
def _preprocess(self, reference, hypothesis):
    if not isinstance(reference, Annotation):
        raise TypeError('reference must be an instance of `Annotation`')

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # reference where short intra-label gaps are removed
    filled = Timeline()
    for label in reference.labels():
        label_timeline = reference.label_timeline(label)
        for gap in label_timeline.gaps():
            if gap.duration < self.tolerance:
                label_timeline.add(gap)

        for segment in label_timeline.support():
            filled.add(segment)

    # reference coverage after filling gaps
    coverage = filled.support()

    reference_partition = self._partition(filled, coverage)
    hypothesis_partition = self._partition(hypothesis, coverage)

    return reference_partition, hypothesis_partition
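# Self-contained illustration of the gap-filling step in `_preprocess`,
# using pyannote.core only (the 0.5 s tolerance is an arbitrary example).
from pyannote.core import Segment, Timeline

timeline = Timeline([Segment(0, 4), Segment(4.2, 8)])  # 0.2 s gap

tolerance = 0.5
filled = timeline.copy()
for gap in timeline.gaps():
    if gap.duration < tolerance:
        filled.add(gap)

# support() merges the now-contiguous segments
assert filled.support() == Timeline([Segment(0, 8)])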
def test_union():
    first_timeline = Timeline([Segment(0, 1), Segment(2, 3), Segment(4, 5)])
    second_timeline = Timeline([Segment(1.5, 4.5)])

    assert first_timeline.union(second_timeline) == Timeline(
        [Segment(0, 1), Segment(1.5, 4.5), Segment(2, 3), Segment(4, 5)])

    assert second_timeline.crop(first_timeline) == Timeline(
        [Segment(2, 3), Segment(4, 4.5)])

    assert list(first_timeline.co_iter(second_timeline)) == [
        (Segment(2, 3), Segment(1.5, 4.5)),
        (Segment(4, 5), Segment(1.5, 4.5))]
def _xxx_enrol_iter(self, subset):
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])

    trial_csv = op.join(
        data_dir,
        'voxceleb1.verification.{subset}.csv'.format(subset=subset))
    trials = pd.read_csv(trial_csv)

    for model_id in trials['enrolment'].unique():
        try:
            # .loc replaces the long-deprecated (and removed) DataFrame.ix
            row = data.loc[model_id]
        except KeyError:
            # file_id = model_id.split('/')[1][:-8]
            # msg = '{file_id} marked as duplicate in VoxCeleb 1.1'
            # warnings.warn(msg.format(file_id=file_id))
            continue

        uri = model_id
        segment = Segment(0., row.end - row.start)
        current_enrolment = {
            'database': 'VoxCeleb',
            'uri': uri,
            'model_id': model_id,
            'enrol_with': Timeline(uri=uri, segments=[segment]),
        }
        yield current_enrolment
def load_uem(file_uem):
    """Load UEM file

    Parameters
    ----------
    file_uem : `str`
        Path to UEM file.

    Returns
    -------
    timelines : `dict`
        Evaluation map as a {uri: pyannote.core.Timeline} dictionary.
    """
    names = ['uri', 'NA1', 'start', 'end']
    dtype = {'uri': str, 'start': float, 'end': float}
    data = pd.read_csv(file_uem, names=names, dtype=dtype,
                       delim_whitespace=True)

    timelines = dict()
    for uri, parts in data.groupby('uri'):
        segments = [Segment(part.start, part.end)
                    for i, part in parts.iterrows()]
        timelines[uri] = Timeline(segments=segments, uri=uri)

    return timelines
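# Quick round-trip check for `load_uem`; the file content follows the
# whitespace-delimited `uri channel start end` layout implied by the
# column names above. Reuses `load_uem` and pyannote.core.Segment.
import tempfile

from pyannote.core import Segment

uem_content = 'fileA 1 0.00 12.50\nfileA 1 20.00 30.00\nfileB 1 0.00 42.00\n'
with tempfile.NamedTemporaryFile('w', suffix='.uem', delete=False) as f:
    f.write(uem_content)
    path = f.name

timelines = load_uem(path)
assert list(timelines['fileA']) == [Segment(0.0, 12.5), Segment(20.0, 30.0)]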
def _xxx_iter(self, subset):
    data = self._load_data(subset)

    AnnotatedGroups = data['annotated'].groupby(by='uri')
    AnnotationGroups = data['annotation'].groupby(by='uri')

    for raw_uri, annotated in AnnotatedGroups:

        uri = f'{raw_uri}.Mix-Headset'

        segments = []
        for segment in annotated.itertuples():
            segments.append(Segment(start=segment.start, end=segment.end))

        annotation = Annotation(uri=uri)
        for t, turn in enumerate(
                AnnotationGroups.get_group(raw_uri).itertuples()):
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            annotation[segment, t] = turn.speaker

        current_file = {
            'database': 'Test',
            'uri': uri,
            'annotated': Timeline(uri=uri, segments=segments),
            'annotation': annotation}

        yield current_file
def common_enrol_iter(self):
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('identification').get_group('trn')

    for model_id, model_rows in data.groupby('speaker'):
        uris = []
        enrol_with = []
        for uri, rows in model_rows.groupby('uri'):
            uris.append(uri)
            segments = []
            for row in rows.itertuples():
                segments.append(Segment(row.start, row.end))
            enrol_with.append(Timeline(uri=uri, segments=segments))

        current_enrolment = {
            'database': 'VoxCeleb',
            'model_id': model_id,
            'uri': uris,
            'enrol_with': enrol_with}

        yield current_enrolment
def _xxx_enrol_iter(self, subset):
    # load enrolments
    data_dir = Path(__file__).parent / 'data' / 'speaker_spotting'
    enrolments = data_dir / f'{subset}.enrol.txt'
    names = ['uri', 'NA0', 'start', 'duration',
             'NA1', 'NA2', 'NA3', 'model_id']
    enrolments = read_table(enrolments, delim_whitespace=True, names=names)

    for model_id, turns in enrolments.groupby(by='model_id'):

        # gather enrolment data
        segments = []
        for t, turn in enumerate(turns.itertuples()):
            if t == 0:
                raw_uri = turn.uri
                uri = f'{raw_uri}.Mix-Headset'
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            if segment:
                segments.append(segment)
        enrol_with = Timeline(segments=segments, uri=uri)

        current_enrolment = {
            'database': 'Test',
            'uri': uri,
            'model_id': model_id,
            'enrol_with': enrol_with,
        }

        yield current_enrolment
def write_test_file(data_dir, output_file, trial_length):
    annotations, max_length, speakers = read_annotaitons(data_dir)

    # create artificial non-overlapping segments, each of trial_length size
    trial_segments = Timeline()
    for i in range(0, int(max_length) // trial_length):
        trial_segments.add(Segment(start=i * trial_length,
                                   end=(i + 1) * trial_length))

    with open(output_file, 'w') as f:
        for label in speakers.keys():
            for annotation in annotations:
                # make sure our trial segments do not extend beyond the
                # total length of the speech data
                support = annotation.get_timeline().extent()
                # consider a smaller segment here, to make sure an
                # embedding of 3 seconds can be computed
                adjusted_trial_segments = trial_segments.crop(
                    Segment(start=support.start, end=support.end - 3.),
                    mode='loose')
                uri = annotation.uri
                cur_timeline = annotation.label_timeline(label, copy=False)
                for trial_segment in adjusted_trial_segments:
                    cropped_speaker = cur_timeline.crop(trial_segment,
                                                        mode='intersection')
                    if not cropped_speaker:
                        f.write('{0} {1} {2:0>7.2f} {3:0>7.2f} nontarget - -\n'.format(
                            label, uri,
                            trial_segment.start, trial_segment.end))
                    else:
                        f.write('{0} {1} {2:0>7.2f} {3:0>7.2f} target {4:0>7.2f} {5:0>7.2f}\n'.format(
                            label, uri,
                            trial_segment.start, trial_segment.end,
                            cropped_speaker[0].start,
                            cropped_speaker[0].duration))
def test_extrude():
    annotation = Annotation()
    annotation[Segment(0, 10)] = "A"
    annotation[Segment(15, 20)] = "A"
    annotation[Segment(20, 35)] = "B"
    annotation[Segment(15, 25)] = "C"
    annotation[Segment(30, 35)] = "C"

    extrusion_tl = Timeline([Segment(5, 12), Segment(14, 25)])

    intersection_expected = Annotation()
    intersection_expected[Segment(0, 5)] = "A"
    intersection_expected[Segment(25, 35)] = "B"
    intersection_expected[Segment(30, 35)] = "C"

    assert (annotation.extrude(extrusion_tl, mode="intersection")
            == intersection_expected)

    loose_expected = Annotation()
    loose_expected[Segment(30, 35)] = "C"

    assert (annotation.extrude(extrusion_tl, mode="loose")
            == loose_expected)

    strict_expected = Annotation()
    strict_expected[Segment(0, 10)] = "A"
    strict_expected[Segment(20, 35)] = "B"
    strict_expected[Segment(30, 35)] = "C"

    assert (annotation.extrude(extrusion_tl, mode="strict")
            == strict_expected)
def DER(outfile, AudioDataSet, annotationlist, audioLength):
    reference = Annotation()

    if AudioDataSet != 'DiaExample':
        # one annotation XML file per speaker A, B, C, D
        for annotation_file, speaker in zip(annotationlist, 'ABCD'):
            tree = ET.parse(annotation_file)
            root = tree.getroot()
            for child in root.findall('segment'):
                start = float(child.get('transcriber_start'))
                end = float(child.get('transcriber_end'))
                reference[Segment(start, end)] = speaker
    else:
        reference[Segment(0.15, 3.41)] = 'A'
        reference[Segment(3.83, 5.82)] = 'A'
        reference[Segment(6.75, 11.10)] = 'B'
        reference[Segment(11.32, 15.8)] = 'C'
        reference[Segment(15.9, 18.8)] = 'B'
        reference[Segment(18.8, 27.8)] = 'C'
        reference[Segment(27.8, 34.4)] = 'B'
        reference[Segment(34.4, 42)] = 'D'

    hypothesis = Annotation()
    with open(outfile, 'r') as f:
        for line in f:
            fields = line.split(' ')
            start = float(fields[3])
            end = start + float(fields[4])
            hypothesis[Segment(start, end)] = fields[5].rstrip()

    metric = DiarizationErrorRate()
    metricPurity = DiarizationPurity()
    uem = Timeline([Segment(0, audioLength)])

    print('DER: %.2f %%' % (metric(reference, hypothesis, uem=uem) * 100))
    print('Cluster Purity: %.2f %%'
          % (metricPurity(reference, hypothesis, uem=uem) * 100))

    return metric, reference, hypothesis
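# The final metric calls above follow the standard pyannote.metrics
# pattern; a minimal self-contained sketch on toy data, with values chosen
# so that the only error is 2 s of speaker confusion (DER = 2/20 = 10 %).
from pyannote.core import Annotation, Segment, Timeline
from pyannote.metrics.diarization import DiarizationErrorRate

reference = Annotation()
reference[Segment(0, 10)] = 'A'
reference[Segment(10, 20)] = 'B'

hypothesis = Annotation()
hypothesis[Segment(0, 12)] = 'spk1'    # optimal mapping: spk1 -> A
hypothesis[Segment(12, 20)] = 'spk2'   # optimal mapping: spk2 -> B

metric = DiarizationErrorRate()
uem = Timeline([Segment(0, 20)])
print('DER: %.2f %%' % (metric(reference, hypothesis, uem=uem) * 100))  # 10.00 %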
def test_remove_and_extent():
    t = Timeline(uri='MyAudioFile')
    t.add(Segment(6, 8))
    t.add(Segment(7, 9))
    t.add(Segment(6, 9))

    t.remove(Segment(6, 9))

    # Segment(6, 8) and Segment(7, 9) remain, so the extent is unchanged
    assert t.extent() == Segment(6, 9)
def load_sad_manual(dataset: Text, path: Text) -> Dict:
    """Load accepted pyannote.sad.manual examples

    Parameters
    ----------
    dataset : str
        Dataset containing annotations.
    path : str
        Path to annotated file

    Returns
    -------
    file : dict
        Dictionary containing the following keys:
        "audio" (Path) : path to audio file
        "annotated" (Timeline) : part of the audio annotated and accepted
        "speech" (Timeline) : part of the audio accepted as speech
    """
    db = connect()

    examples = [
        eg
        for eg in db.get_dataset(dataset)
        if eg["recipe"] == "pyannote.sad.manual"
        and eg["path"] == path
        and eg["answer"] == "accept"
    ]

    speech = Timeline(
        segments=[
            Segment(span["start"], span["end"])
            for eg in examples
            for span in eg["audio_spans"]
        ],
    ).support()

    annotated = Timeline(
        segments=[Segment(**eg["chunk"]) for eg in examples]).support()

    prodigy.log(f"RECIPE: {path}: loaded speech regions")

    return {
        "audio": Path(path),
        "speech": speech,
        "annotated": annotated,
    }
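# The `.support()` calls above merge overlapping accepted spans into
# disjoint regions; a small pyannote.core-only illustration.
from pyannote.core import Segment, Timeline

spans = Timeline([Segment(1, 4), Segment(3, 6), Segment(8, 9)])
assert spans.support() == Timeline([Segment(1, 6), Segment(8, 9)])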
def apply(self, predictions, dimension=0):
    """Peak detection

    Parameters
    ----------
    predictions : SlidingWindowFeature
        Predictions returned by segmentation approaches.

    Returns
    -------
    segmentation : Timeline
        Partition.
    """
    if len(predictions.data.shape) == 1:
        y = predictions.data
    elif predictions.data.shape[1] == 1:
        y = predictions.data[:, 0]
    else:
        y = predictions.data[:, dimension]

    if self.log_scale:
        y = np.exp(y)

    sw = predictions.sliding_window

    precision = sw.step
    order = max(1, int(np.rint(self.min_duration / precision)))
    indices = scipy.signal.argrelmax(y, order=order)[0]

    if self.scale == 'absolute':
        mini = 0
        maxi = 1
    elif self.scale == 'relative':
        mini = np.nanmin(y)
        maxi = np.nanmax(y)
    elif self.scale == 'percentile':
        mini = np.nanpercentile(y, 1)
        maxi = np.nanpercentile(y, 99)
    threshold = mini + self.alpha * (maxi - mini)

    peak_time = np.array(
        [sw[i].middle for i in indices if y[i] > threshold])

    n_windows = len(y)
    start_time = sw[0].start
    end_time = sw[n_windows].end

    boundaries = np.hstack([[start_time], peak_time, [end_time]])
    segmentation = Timeline()
    for i, (start, end) in enumerate(pairwise(boundaries)):
        segment = Segment(start, end)
        segmentation.add(segment)

    return segmentation
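# The core of `apply` is scipy.signal.argrelmax plus a threshold; a
# standalone sketch of that step on synthetic scores (alpha = 0.5 and the
# 'relative' scale, both arbitrary choices for this example).
import numpy as np
import scipy.signal

y = np.array([0.1, 0.2, 0.9, 0.3, 0.2, 0.8, 0.1, 0.1])
order = 1  # ~ min_duration / sliding window step
indices = scipy.signal.argrelmax(y, order=order)[0]

mini, maxi = np.nanmin(y), np.nanmax(y)
threshold = mini + 0.5 * (maxi - mini)
peaks = [i for i in indices if y[i] > threshold]
assert peaks == [2, 5]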
def get_annotated(current_file): """Get part of the file that is annotated. Parameters ---------- current_file : `dict` File generated by a ` pyannote.database` protocol. Returns ------- annotated : `pyannote.core.Timeline` Part of the file that is annotated. Defaults to `current_file["annotated"]`. When it does not exist, try to use the full audio extent. When that fails, use "annotation" extent. """ # if protocol provides 'annotated' key, use it if "annotated" in current_file: annotated = current_file["annotated"] return annotated # if it does not, but does provide 'audio' key # try and use wav duration if "duration" in current_file: try: duration = current_file["duration"] except ImportError: pass else: annotated = Timeline([Segment(0, duration)]) msg = '"annotated" was approximated by [0, audio duration].' warnings.warn(msg) return annotated extent = current_file["annotation"].get_timeline().extent() annotated = Timeline([extent]) msg = ('"annotated" was approximated by "annotation" extent. ' 'Please provide "annotated" directly, or at the very ' 'least, use a "duration" preprocessor.') warnings.warn(msg) return annotated
def extrude(self, uem, reference, collar=0.0, skip_overlap=False):
    """Extrude reference boundary collars from uem

    reference  |----|     |--------------|       |-------------|
    uem        |---------------------|    |-------------------------------|
    extruded   |--| |--| |---| |-----|    |-| |-----| |-----------| |-----|

    Parameters
    ----------
    uem : Timeline
        Evaluation map.
    reference : Annotation
        Reference annotation.
    collar : float, optional
        When provided, set the duration of collars centered around
        reference segment boundaries that are extruded from both reference
        and hypothesis. Defaults to 0. (i.e. no collar).
    skip_overlap : bool, optional
        Set to True to not evaluate overlap regions.
        Defaults to False (i.e. keep overlap regions).

    Returns
    -------
    extruded_uem : Timeline
    """
    if collar == 0. and not skip_overlap:
        return uem

    collars, overlap_regions = [], []

    # build list of collars if needed
    if collar > 0.:
        # iterate over all segments in reference
        for segment in reference.itersegments():

            # add collar centered on start time
            t = segment.start
            collars.append(Segment(t - .5 * collar, t + .5 * collar))

            # add collar centered on end time
            t = segment.end
            collars.append(Segment(t - .5 * collar, t + .5 * collar))

    # build list of overlap regions if needed
    if skip_overlap:
        # iterate over pairs of intersecting segments
        for (segment1, track1), (segment2, track2) \
                in reference.co_iter(reference):
            if segment1 == segment2 and track1 == track2:
                continue
            # add their intersection
            overlap_regions.append(segment1 & segment2)

    segments = collars + overlap_regions

    return Timeline(segments=segments).support().gaps(support=uem)
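# Worked example of the collar logic in `extrude`, computed directly with
# pyannote.core (collar = 1.0, skip_overlap = False): a 0.5 s collar on
# each side of every boundary of `reference` is removed from the uem.
from pyannote.core import Annotation, Segment, Timeline

reference = Annotation()
reference[Segment(2, 6)] = 'A'
uem = Timeline([Segment(0, 10)])

collar = 1.0
collars = Timeline([Segment(2 - .5 * collar, 2 + .5 * collar),   # start boundary
                    Segment(6 - .5 * collar, 6 + .5 * collar)])  # end boundary

extruded = collars.support().gaps(support=uem)
assert extruded == Timeline([Segment(0, 1.5), Segment(2.5, 5.5), Segment(6.5, 10)])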