def _turn_level(self, current_file: dict, speech_turns: Annotation) -> Annotation:
    """Apply clustering at speech turn level

    Each label of `speech_turns` is represented by the average (along the
    first axis) of the embeddings cropped over its timeline. These
    per-label averages are stacked and passed to `self.clustering`.

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_turns : `Annotation`
        Speech turns. Should only contain `str` labels.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Clustering result, i.e. `speech_turns` with renamed labels. Labels
        for which at least one embedding was found are renamed to the
        value returned for them by `self.clustering`. Labels for which no
        embedding could be extracted are each renamed to their own
        negative integer (-1, -2, ...).
    """

    assert_string_labels(speech_turns, "speech_turns")

    embedding = self._embedding(current_file)

    X, clustered_labels, skipped_labels = [], [], []
    for label in speech_turns.labels():

        timeline = speech_turns.label_timeline(label, copy=False)

        # be more and more permissive until we have
        # at least one embedding for current speech turn
        for mode in ("strict", "center", "loose"):
            x = embedding.crop(timeline, mode=mode)
            if len(x) > 0:
                break

        # skip labels so small we don't have any embedding for it
        if len(x) < 1:
            skipped_labels.append(label)
            continue

        clustered_labels.append(label)
        X.append(np.mean(x, axis=0))

    # apply clustering of label embeddings. np.vstack raises ValueError on
    # an empty sequence, so clustering is bypassed when no label could be
    # embedded (no speech turns at all, or only very short ones).
    clusters = self.clustering(np.vstack(X)) if X else []

    # map each clustered label to its cluster (between 1 and N_CLUSTERS)
    mapping = dict(zip(clustered_labels, clusters))

    # map each skipped label to its own cluster
    # (between -1 and -N_SKIPPED_LABELS)
    for rank, label in enumerate(skipped_labels, start=1):
        mapping[label] = -rank

    # do the actual mapping
    return speech_turns.rename_labels(mapping=mapping)
def __call__(
    self, current_file: dict, speech_turns: Annotation, targets: Annotation
) -> Annotation:
    """Assign each speech turn to closest target (if close enough)

    Every target label and every speech turn label is represented by the
    average (along the first axis) of the embeddings cropped over its
    timeline. `self.closest_assignment` then returns, for each embedded
    speech turn label, the index of the target it is assigned to, or a
    negative value when it is assigned to none.

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_turns : `Annotation`
        Speech turns. Should only contain `int` labels.
    targets : `Annotation`
        Targets. Should only contain `str` labels.

    Returns
    -------
    assigned : `Annotation`
        Assigned speech turns, i.e. `speech_turns` where each assigned
        label is renamed to its target label. Labels that are not assigned
        (negative assignment, or no embedding available) are not part of
        the renaming. When no target or no speech turn could be embedded,
        the renaming is empty.
    """

    assert_string_labels(targets, "targets")
    assert_int_labels(speech_turns, "speech_turns")

    embedding = self._embedding(current_file)

    # gather targets embedding
    X_targets, targets_labels = [], []
    for label in targets.labels():

        timeline = targets.label_timeline(label, copy=False)

        # be more and more permissive until we have
        # at least one embedding for current target
        for mode in ("center", "loose"):
            x = embedding.crop(timeline, mode=mode)
            if len(x) > 0:
                break

        # skip labels so small we don't have any embedding for it
        if len(x) < 1:
            continue

        targets_labels.append(label)
        X_targets.append(np.mean(x, axis=0))

    # gather speech turns embedding
    X, assigned_labels = [], []
    for label in speech_turns.labels():

        timeline = speech_turns.label_timeline(label, copy=False)

        # be more and more permissive until we have
        # at least one embedding for current speech turn
        for mode in ("center", "loose"):
            x = embedding.crop(timeline, mode=mode)
            if len(x) > 0:
                break

        # skip labels so small we don't have any embedding for it
        if len(x) < 1:
            continue

        assigned_labels.append(label)
        X.append(np.mean(x, axis=0))

    if X_targets and X:
        # assign speech turns to closest class
        assignments = self.closest_assignment(np.vstack(X_targets), np.vstack(X))
        mapping = {
            label: targets_labels[k]
            for label, k in zip(assigned_labels, assignments)
            if k >= 0
        }
    else:
        # np.vstack raises ValueError on an empty sequence: with nothing to
        # compare on one side or the other, no speech turn can be assigned.
        mapping = {}

    return speech_turns.rename_labels(mapping=mapping)