def _decode(
    self,
    current_file: ProtocolFile,
    hypothesis: Annotation,
    scores: SlidingWindowFeature,
    labels: Iterable,
) -> Annotation:

    N, K = scores.data.shape

    if self.allow_overlap:
        active_speakers = scores.data > 0.5

    else:
        if self.lock_speech:
            active_speakers = np.argmax(scores.data, axis=1) + 1

        else:
            active_speakers = np.argmax(scores.data, axis=1)

    # reconstruct annotation
    new_hypothesis = one_hot_decoding(active_speakers, scores.sliding_window, labels=labels)
    new_hypothesis.uri = hypothesis.uri

    if self.lock_speech:
        speech = hypothesis.get_timeline().support()
        new_hypothesis = new_hypothesis.crop(speech)

    return new_hypothesis
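# A minimal, self-contained toy illustration (not part of the original module)
# of the three decoding modes above, using synthetic scores: with overlap
# allowed, every class whose score exceeds 0.5 is active (possibly several per
# frame); otherwise a single argmax label is kept per frame, shifted by +1 when
# speech regions are locked (presumably so that label 0 stays reserved for
# non-speech in one_hot_decoding).
import numpy as np

toy_scores = np.array([[0.7, 0.6, 0.1],
                       [0.2, 0.9, 0.4]])           # (N=2 frames, K=3 classes)

multi_label = toy_scores > 0.5                     # allow_overlap=True
single_label = np.argmax(toy_scores, axis=1)       # lock_speech=False
shifted_label = np.argmax(toy_scores, axis=1) + 1  # lock_speech=True

print(multi_label.astype(int))   # [[1 1 0]
                                 #  [0 1 0]]
print(single_label)              # [0 1]
print(shifted_label)             # [1 2]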
def _window_level(self, current_file: dict, speech_regions: Timeline) -> Annotation:
    """Apply clustering at window level

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_regions : `Timeline`
        Speech regions.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Clustering result.
    """

    # load embeddings
    embedding = self._embedding(current_file)
    window = embedding.sliding_window

    # extract and stack embeddings of speech regions
    X = np.vstack([
        embedding.crop(segment, mode="center", fixed=segment.duration)
        for segment in speech_regions
    ])

    # apply clustering
    y_pred = self.clustering(X)

    # reconstruct
    y = np.zeros(len(embedding), dtype=np.int8)

    # n = total number of "speech" embeddings
    # s_pred = current position in y_pred
    s_pred, n = 0, len(y_pred)

    for segment in speech_regions:

        # get indices of current speech segment
        ((s, e), ) = window.crop(segment, mode="center",
                                 fixed=segment.duration,
                                 return_ranges=True)

        # hack for the very last segment that might overflow by 1
        e_pred = min(s_pred + e - s, n - 1)
        e = s + (e_pred - s_pred)

        # assign y_pred to the corresponding speech regions
        y[s:e] = y_pred[s_pred:e_pred]

        # increment current position in y_pred
        s_pred += e - s

    # reconstruct hypothesis
    return one_hot_decoding(y, window)
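# Hypothetical sketch (not from the original code) of a callable that could
# play the role of `self.clustering` above: it takes the (num_embeddings,
# dimension) array X and returns one integer cluster label per embedding.
# The choice of scikit-learn's AgglomerativeClustering and the distance
# threshold value are assumptions made for illustration only.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

def cluster_embeddings(X: np.ndarray, distance_threshold: float = 1.0) -> np.ndarray:
    """Return one cluster label per row of X."""
    clustering = AgglomerativeClustering(n_clusters=None,
                                         distance_threshold=distance_threshold)
    return clustering.fit_predict(X)

# e.g. pipeline.clustering = cluster_embeddings  (attribute name as used above)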
def apply(self, annotation, features):
    """
    Parameters
    ----------
    annotation : `pyannote.core.Annotation`
        Original annotation to be resegmented.
    features : `SlidingWindowFeature`
        Features.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Resegmented annotation.
    """

    sliding_window = features.sliding_window
    window = np.ones((1, sliding_window.samples(self.window)))

    log_probs = []
    labels = annotation.labels()

    # FIXME: embarrassingly parallel
    for label in labels:

        # gather all features for current label
        span = annotation.label_timeline(label)
        data = features.crop(span, mode='center')

        # train a GMM
        gmm = GaussianMixture(n_components=self.n_components,
                              covariance_type='diag',
                              tol=0.001,
                              reg_covar=1e-06,
                              max_iter=self.n_iter,
                              n_init=1,
                              init_params='kmeans',
                              weights_init=None,
                              means_init=None,
                              precisions_init=None,
                              random_state=None,
                              warm_start=False,
                              verbose=0,
                              verbose_interval=10).fit(data)

        # compute log-probability across the whole file
        log_prob = gmm.score_samples(features.data)
        log_probs.append(log_prob)

    # smooth log-probability using a sliding window
    log_probs = scipy.signal.convolve(np.vstack(log_probs), window, mode='same')

    # assign each frame to the most likely label
    y = np.argmax(log_probs, axis=0)

    # reconstruct the annotation
    hypothesis = one_hot_decoding(y, sliding_window, labels=labels)

    # remove original non-speech regions
    return hypothesis.crop(annotation.get_timeline().support())
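# Self-contained illustration (synthetic data, not part of the original code)
# of the smoothing step above: convolving the (num_labels, num_frames)
# log-probability matrix with a (1, window) kernel of ones sums scores over
# `window` consecutive frames before the per-frame argmax, which discourages
# spurious label switches between neighbouring frames.
import numpy as np
import scipy.signal

num_labels, num_frames, window_samples = 3, 100, 11
rng = np.random.default_rng(0)
raw_log_probs = rng.normal(size=(num_labels, num_frames))

kernel = np.ones((1, window_samples))
smoothed = scipy.signal.convolve(raw_log_probs, kernel, mode='same')

y = np.argmax(smoothed, axis=0)   # most likely label per frame
print(y.shape)                    # (100,)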
def _decode(
    self,
    current_file: ProtocolFile,
    hypothesis: Annotation,
    scores: SlidingWindowFeature,
    labels: Iterable,
) -> Annotation:

    # obtain overlapped speech regions
    overlap = self.binarizer_.apply(current_file["overlap"], dimension=1)

    frames = scores.sliding_window
    N, K = scores.data.shape

    if self.lock_speech:

        # K = 1 <~~> only non-speech
        # K = 2 <~~> just one speaker
        if K < 3:
            return hypothesis

        # sequence of two most likely speaker indices
        # (even when non-speech is in fact the most likely class)
        best_speakers_indices = np.argsort(-scores.data[:, 1:], axis=1)[:, :2]

        active_speakers = np.zeros((N, K - 1), dtype=np.int64)

        # start by assigning the most likely speaker...
        for t, k in enumerate(best_speakers_indices[:, 0]):
            active_speakers[t, k] = 1

        # ... then add the second most likely speaker in overlap regions
        T = frames.crop(overlap, mode="strict")

        # because overlap may use a different feature extraction step,
        # T may contain indices slightly larger than the actual number
        # of frames. the line below removes any such indices.
        T = T[T < N]

        # mark second most likely speaker as active
        active_speakers[T, best_speakers_indices[T, 1]] = 1

        # reconstruct annotation
        new_hypothesis = one_hot_decoding(active_speakers, frames, labels=labels)

        # revert non-speech regions back to original
        speech = hypothesis.get_timeline().support()
        new_hypothesis = new_hypothesis.crop(speech)

    else:

        # K = 1 <~~> only non-speech
        if K < 2:
            return hypothesis

        # sequence of two most likely class indices
        # (including 0=non-speech)
        best_speakers_indices = np.argsort(-scores.data, axis=1)[:, :2]

        active_speakers = np.zeros((N, K - 1), dtype=np.int64)

        # start by assigning the most likely speaker...
        for t, k in enumerate(best_speakers_indices[:, 0]):
            # k = 0 is for non-speech
            if k > 0:
                active_speakers[t, k - 1] = 1

        # ... then add the second most likely speaker in overlap regions
        T = frames.crop(overlap, mode="strict")

        # because overlap may use a different feature extraction step,
        # T may contain indices slightly larger than the actual number
        # of frames. the line below removes any such indices.
        T = T[T < N]

        # remove timesteps where the second most likely class is non-speech
        T = T[best_speakers_indices[T, 1] > 0]

        # mark second most likely speaker as active
        active_speakers[T, best_speakers_indices[T, 1] - 1] = 1

        # reconstruct annotation
        new_hypothesis = one_hot_decoding(active_speakers, frames, labels=labels)

    new_hypothesis.uri = hypothesis.uri

    return new_hypothesis
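# Self-contained toy illustration (not part of the original code) of the
# overlap-aware decoding above: keep the most likely speaker everywhere,
# then add the second most likely one on frames flagged as overlapped.
import numpy as np

toy_scores = np.array([[0.9, 0.1, 0.3],
                       [0.2, 0.8, 0.7],
                       [0.1, 0.6, 0.5]])      # (N=3 frames, K=3 speakers)
overlap_frames = np.array([1, 2])             # frame indices with overlap

best = np.argsort(-toy_scores, axis=1)[:, :2]           # two most likely speakers
active = np.zeros_like(toy_scores, dtype=np.int64)
active[np.arange(len(toy_scores)), best[:, 0]] = 1      # most likely speaker
active[overlap_frames, best[overlap_frames, 1]] = 1     # runner-up on overlap

print(active)
# [[1 0 0]
#  [0 1 1]
#  [0 1 1]]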