def crop(self, current_file, segment, mode="center", fixed=None):
    """Crop precomputed features of a file through a memory map

    Fast version of self(current_file).crop(segment, mode=mode, fixed=fixed):
    the feature file is memory-mapped instead of being loaded.

    Parameters
    ----------
    current_file : dict
        `pyannote.database` file.
    segment : `pyannote.core.Segment`
        Segment from which to extract features.
    mode : str, optional
        Forwarded to `SlidingWindowFeature.crop`. Defaults to "center".
    fixed : float, optional
        Forwarded to `SlidingWindowFeature.crop`. When `mode` is "center"
        and `fixed` is None, `segment.duration` is used instead.

    Returns
    -------
    features
        Result of `SlidingWindowFeature.crop` for the requested segment.

    See also
    --------
    `pyannote.core.SlidingWindowFeature.crop`
    """

    # match default FeatureExtraction.crop behavior
    if mode == "center" and fixed is None:
        fixed = segment.duration

    features_on_disk = open_memmap(self.get_path(current_file), mode="r")
    window_features = SlidingWindowFeature(features_on_disk,
                                           self.sliding_window_)
    cropped = window_features.crop(segment, mode=mode, fixed=fixed)

    # drop the local reference to the memory map before returning
    del features_on_disk

    return cropped
def crop(self, item, focus, mode='loose', fixed=None, return_data=True):
    """Faster version of precomputed(item).crop(...)

    The feature file returned by `self.get_path(item)` is memory-mapped
    and wrapped in a `SlidingWindowFeature` before being cropped.

    Parameters
    ----------
    item : dict
        File whose features should be cropped.
    focus :
        Region to crop, forwarded to `SlidingWindowFeature.crop`.
    mode : str, optional
        Forwarded to `SlidingWindowFeature.crop`. Defaults to 'loose'.
    fixed : float, optional
        Forwarded to `SlidingWindowFeature.crop`.
    return_data : bool, optional
        Forwarded to `SlidingWindowFeature.crop`. Defaults to True.

    Returns
    -------
    cropped
        Whatever `SlidingWindowFeature.crop` returns for these arguments.
    """

    mapped = open_memmap(self.get_path(item), mode='r')
    cropped = SlidingWindowFeature(mapped, self.sliding_window_).crop(
        focus, mode=mode, fixed=fixed, return_data=return_data)

    # drop the local reference to the memory map before returning
    del mapped

    return cropped
def __call__(self, current_file, return_sr=False):
    """Obtain waveform

    Parameters
    ----------
    current_file : dict
        `pyannote.database` files. Either provides a precomputed
        "waveform" or an "audio" entry readable by `sf.read`. An optional
        "channel" entry (1-based) selects a single channel.
    return_sr : `bool`, optional
        Return sample rate. Defaults to False

    Returns
    -------
    waveform : `pyannote.core.SlidingWindowFeature`
        Waveform
    sample_rate : `int`
        Only when `return_sr` is set to True

    Raises
    ------
    ValueError
        When a precomputed waveform is provided but `self.sample_rate` is
        None, or when the precomputed waveform is not 2-dimensional.
    """

    if "waveform" not in current_file:
        y, sample_rate = sf.read(current_file["audio"],
                                 dtype="float32",
                                 always_2d=True)
    else:
        # a precomputed waveform carries no sample rate of its own
        if self.sample_rate is None:
            msg = ("`RawAudio` needs to be instantiated with an actual "
                   "`sample_rate` if one wants to use precomputed "
                   "waveform.")
            raise ValueError(msg)
        sample_rate = self.sample_rate

        y = current_file["waveform"]
        if len(y.shape) != 2:
            msg = (f"Precomputed waveform should be provided as a "
                   f"(n_samples, n_channels) `np.ndarray`.")
            raise ValueError(msg)

    # extract specific channel if requested
    # (slice rather than index so that the channel axis is kept)
    channel = current_file.get("channel")
    if channel is not None:
        y = y[:, channel - 1:channel]

    y = self.get_features(y, sample_rate)

    # NOTE(review): the window below is built from the input sample rate,
    # while `return_sr` reports `self.sample_rate` when it is set. If
    # `get_features` resamples, the two may disagree -- TODO confirm.
    sample_period = 1.0 / sample_rate
    sliding_window = SlidingWindow(start=-0.5 / sample_rate,
                                   duration=sample_period,
                                   step=sample_period)
    waveform = SlidingWindowFeature(y, sliding_window)

    if not return_sr:
        return waveform

    if self.sample_rate is not None:
        reported_rate = self.sample_rate
    else:
        reported_rate = sample_rate
    return waveform, reported_rate
def apply(self, current_file):
    """Compute predictions on a sliding window

    Predictions are computed on overlapping sub-sequences and averaged
    frame by frame over all sub-sequences that overlap a given frame.

    Parameter
    ---------
    current_file : dict

    Returns
    -------
    predictions : SlidingWindowFeature
        Frame-level predictions. Empty (0 frames) when the file yields no
        batch at all.
    """

    # frame-level sliding window
    frames = self.feature_extraction.sliding_window()

    batches = list(self.from_file(current_file, incomplete=True))
    if not batches:
        data = np.zeros((0, self.dimension), dtype=np.float32)
        return SlidingWindowFeature(data, frames)

    fX = np.vstack(batches)

    # sub-sequence sliding window
    subsequences = SlidingWindow(duration=self.duration, step=self.step)

    # get total number of frames
    if isinstance(self.feature_extraction, Precomputed):
        n_frames, _ = self.feature_extraction.shape(current_file)
    else:
        uri = get_unique_identifier(current_file)
        # unpack the *shape* of the data, not the data itself
        n_frames, _ = self.preprocessed_[uri].data.shape

    # data[i] is the sum of all predictions for frame #i
    data = np.zeros((n_frames, self.dimension), dtype=np.float32)

    # k[i] is the number of sequences that overlap with frame #i
    # (float32 rather than int8: int8 wraps around past 127 overlapping
    # sequences, and float32 keeps the final division in float32)
    k = np.zeros((n_frames, 1), dtype=np.float32)

    for subsequence, fX_ in zip(subsequences, fX):

        # indices of frames overlapped by subsequence
        indices = frames.crop(subsequence,
                              mode='center',
                              fixed=self.duration)

        # accumulate the outputs
        data[indices] += fX_

        # keep track of the number of overlapping sequence
        # TODO - use smarter weights (e.g. Hamming window)
        k[indices] += 1

    # compute average prediction of each frame
    # (frames that were never covered keep their zero value)
    data = data / np.maximum(k, 1)

    return SlidingWindowFeature(data, frames)
def __call__(self, sequence=Stream.NoNewData):
    """Aggregate overlapping feature sequences as they stream in

    Incoming sequences are stacked in `self.buffer_` (one row per buffered
    sequence, frames along the second axis, missing values set to NaN).
    Each call outputs the aggregation of the frames located before the
    start of the new sequence, then appends the new sequence to the buffer.

    Parameters
    ----------
    sequence : SlidingWindowFeature, More, or Stream sentinel
        New sequence. A `More` instance is unwrapped to its `output`.

    Returns
    -------
    output : SlidingWindowFeature or Stream sentinel
        `Stream.NoNewData` when there is no input, the result of
        `self.initialize` on the first sequence, the aggregated remaining
        buffer on `Stream.EndOfStream` (or `Stream.EndOfStream` itself if
        nothing was ever buffered), and the aggregated "ready" frames
        otherwise.
    """

    if isinstance(sequence, More):
        sequence = sequence.output

    # no input ==> no output
    if sequence is Stream.NoNewData:
        return Stream.NoNewData

    if sequence is Stream.EndOfStream:
        if not self.initialized_:
            return Stream.EndOfStream

        # flush whatever is left in the buffer
        self.initialized_ = False
        data = self.agg_func(self.buffer_, axis=0)
        return SlidingWindowFeature(data, self.frames_)

    if not self.initialized_:
        return self.initialize(sequence)

    # check that feature sequence uses the common time base
    sw = sequence.sliding_window
    assert sw.duration == self.frames_.duration
    assert sw.step == self.frames_.step
    assert sw.start > self.frames_.start

    # frames located before the start of the new sequence will not
    # receive any additional contribution: they are ready to be output
    delta_start = sw.start - self.frames_.start
    ready = self.frames_.samples(delta_start, mode='center')
    data = self.agg_func(self.buffer_[:, :ready], axis=0)
    output = SlidingWindowFeature(data, self.frames_)

    # the buffer now starts where the new sequence starts
    self.buffer_ = self.buffer_[:, ready:]
    self.frames_ = SlidingWindow(start=sw.start,
                                 duration=sw.duration,
                                 step=sw.step)

    # remove empty (all NaN) buffers
    # (stops at the first row holding at least one actual value; if there
    # is none, only the last row is kept)
    n_buffers = self.buffer_.shape[0]
    for i in range(n_buffers):
        if np.any(~np.isnan(self.buffer_[i])):
            break
    self.buffer_ = self.buffer_[i:]

    # make room for one more row, and for more frames if the new sequence
    # is longer than what is left in the buffer
    n_samples = self.buffer_.shape[1]
    n_new_samples = sequence.data.shape[0]
    pad_width = ((0, 1), (0, max(0, n_new_samples - n_samples)))
    for _ in sequence.data.shape[1:]:
        pad_width += ((0, 0), )

    # `np.nan` is the canonical spelling and works across NumPy versions
    # (upper-case aliases are not available in recent releases)
    self.buffer_ = np.pad(self.buffer_, pad_width, 'constant',
                          constant_values=np.nan)
    self.buffer_[-1] = sequence.data

    return output
def apply(self, current_file):
    """Segment a file into speech turns

    Speech regions are obtained by binarizing speech activity scores, and
    are then split at the peaks detected in speaker change scores.

    Parameters
    ----------
    current_file : dict
        File to process. Only its annotated part (as returned by
        `get_annotated`) is kept in the output.

    Returns
    -------
    speech_turns
        Result of cropping detected change points by speech regions, then
        by the annotated part of the file.
    """

    # == Speech Activity Detection ==

    # raw SAD scores
    soft_sad = self.sad_(current_file)

    # decide once and for all whether SAD scores are log-scaled
    # (heuristic: log-scaled scores have a negative mean)
    if not hasattr(self, 'sad_log_scale_'):
        self.sad_log_scale_ = bool(np.nanmean(soft_sad.data) < 0)

    sad_scores = soft_sad.data
    if self.sad_log_scale_:
        sad_scores = np.exp(sad_scores)

    # support both non-speech/speech & non-speech/single/overlap:
    # speech probability is one minus the score of the first class
    prob_sad = SlidingWindowFeature(1. - sad_scores[:, 0],
                                    soft_sad.sliding_window)

    # binarization
    hard_sad = self.sad_binarize_.apply(prob_sad)

    # == Speaker Change Detection ==

    # raw SCD scores
    soft_scd = self.scd_(current_file)

    # decide once and for all whether SCD scores are log-scaled
    if not hasattr(self, 'scd_log_scale_'):
        self.scd_log_scale_ = bool(np.nanmean(soft_scd.data) < 0)

    scd_scores = soft_scd.data
    if self.scd_log_scale_:
        scd_scores = np.exp(scd_scores)

    # take the final dimension
    # (in order to support both classification and regression scores)
    prob_scd = SlidingWindowFeature(scd_scores[:, -1],
                                    soft_scd.sliding_window)

    # peak detection
    hard_scd = self.scd_peak_.apply(prob_scd)

    # only keep speech regions, and only within the annotated part
    within_speech = hard_scd.crop(hard_sad)
    return within_speech.crop(get_annotated(current_file))
def apply(self, current_file):
    """Compute embeddings on a sliding window

    Parameter
    ---------
    current_file : dict

    Returns
    -------
    embedding : SlidingWindowFeature
        One embedding per sub-sequence when `self.internal` is falsy.
        Otherwise, one embedding per frame, obtained by averaging over all
        sub-sequences that overlap the frame.
    """

    # compute embedding on sliding window
    # over the whole duration of the file
    batches = list(self.from_file(current_file, incomplete=True))
    if batches:
        fX = np.vstack(batches)
    else:
        # np.vstack raises on an empty list: use an empty
        # (0, dimension) array when the file yields no batch
        fX = np.zeros((0, self.dimension), dtype=np.float32)

    subsequences = SlidingWindow(duration=self.duration, step=self.step)

    if not self.internal:
        return SlidingWindowFeature(fX, subsequences)

    # get total number of frames
    identifier = get_unique_identifier(current_file)
    n_frames = self.preprocessed_['X'][identifier].data.shape[0]

    # data[i] is the sum of all embeddings for frame #i
    data = np.zeros((n_frames, self.dimension), dtype=np.float32)

    # k[i] is the number of sequences that overlap with frame #i
    # (float32 rather than int8: int8 wraps around past 127 overlapping
    # sequences, and float32 keeps the final division in float32)
    k = np.zeros((n_frames, 1), dtype=np.float32)

    # frame-level sliding window
    frames = self.feature_extractor.sliding_window()

    for subsequence, fX_ in zip(subsequences, fX):

        # indices of frames overlapped by subsequence
        indices = frames.crop(subsequence,
                              mode='center',
                              fixed=self.duration)

        # accumulate their embedding
        data[indices] += fX_

        # keep track of the number of overlapping sequence
        k[indices] += 1

    # compute average embedding of each frame
    # (frames that were never covered keep their zero value)
    data = data / np.maximum(k, 1)

    return SlidingWindowFeature(data, frames)
def __call__(self, features, sliding_window=None):
    """Apply short-term standardization

    Each frame is standardized using the mean and (unbiased) standard
    deviation computed over a centered window of `self.duration`.

    Parameters
    ----------
    features : `SlidingWindowFeature` or (n_samples, n_features ) `numpy.ndarray`
        Features.
    sliding_window : `SlidingWindow`, optional
        Sliding window when `features` is a `numpy.ndarray`.
        Not used when `features` is a `SlidingWindowFeature` instance.

    Returns
    -------
    normalized : `SlidingWindowFeature` or (n_samples, n_features ) `numpy.ndarray`
        Standardized features, of the same kind as the input.
    """

    wrapped_input = isinstance(features, SlidingWindowFeature)
    if wrapped_input:
        features_ = features
    else:
        features_ = SlidingWindowFeature(features, sliding_window)
    values = features_.data

    # number of frames in the window, forced to be odd so that the
    # window can be centered on a frame
    window = features_.sliding_window.samples(self.duration, mode="center")
    if window % 2 == 0:
        window += 1
    half = window // 2

    # statistics of frames for which a complete window is available
    rolling = pd.DataFrame(values).rolling(window=window,
                                           center=True,
                                           min_periods=window)
    mu = np.array(rolling.mean())
    sigma = np.array(rolling.std(ddof=1))

    # frames close to either end only have a truncated window:
    # compute their statistics from the frames that are available
    for i in range(half):

        head = values[:i + half + 1, :]
        mu[i] = np.mean(head, axis=0)
        sigma[i] = np.std(head, axis=0, ddof=1)

        tail = values[-i - half - 1:, :]
        mu[-i - 1] = np.mean(tail, axis=0)
        sigma[-i - 1] = np.std(tail, axis=0, ddof=1)

    # avoid division by zero on constant features
    sigma[sigma == 0.0] = 1e-6

    normalized_ = (values - mu) / sigma

    if not wrapped_input:
        return normalized_
    return SlidingWindowFeature(normalized_, features.sliding_window)
def __call__(self, current_file):
    """Obtain features for file

    Parameters
    ----------
    current_file : dict
        `pyannote.database` files.

    Returns
    -------
    features : `pyannote.core.SlidingWindowFeature`
        Features

    Raises
    ------
    PyannoteFeatureExtractionError
        When the feature file expected for `current_file` does not exist.
    """

    path = Path(self.get_path(current_file))

    if path.exists():
        # memory-map the file instead of loading it when requested
        # (mmap_mode=None is the regular, fully loaded, behavior)
        mmap_mode = "r" if self.use_memmap else None
        data = np.load(str(path), mmap_mode=mmap_mode)
        return SlidingWindowFeature(data, self.sliding_window_)

    uri = current_file["uri"]
    database = current_file["database"]
    msg = (
        f"Directory {self.root_dir} does not contain "
        f'precomputed features for file "{uri}" of '
        f'"{database}" database.'
    )
    raise PyannoteFeatureExtractionError(msg)
def apply(self, current_file):
    """Computes BIC distance between sliding windows

    One Gaussian is fitted on each side of every pair of adjacent windows
    and the two are compared with `Gaussian.bic` (no penalty).

    Parameter
    ---------
    current_file : dict

    Returns
    -------
    predictions : SlidingWindowFeature
        One value per pair of windows.
    """

    from pyannote.algorithms.stats.gaussian import Gaussian

    # only the first item yielded by the generator is used
    _, left, right = next(self.from_file(current_file))

    def fit(x):
        return Gaussian(covariance_type=self.covariance_type).fit(x)

    distances = [
        fit(xL).bic(fit(xR), penalty_coef=0)[0]
        for xL, xR in zip(left, right)
    ]
    y = np.array(distances)

    # each value covers a left and a right window, hence twice the duration
    window = SlidingWindow(duration=2 * self.duration,
                           step=self.step,
                           start=0.)
    return SlidingWindowFeature(y, window)
def apply(self, wav):
    """Computes distance between sliding windows embeddings

    One diagonal-covariance Gaussian is fitted on each side of every pair
    of adjacent windows and the two are compared with
    `Gaussian.divergence`.

    Parameter
    ---------
    wav : str
        Path to wav audio file

    Returns
    -------
    predictions : SlidingWindowFeature
        One value per pair of windows.
    """

    from pyannote.algorithms.stats.gaussian import Gaussian

    current_file = {'uri': wav, 'medium': {'wav': wav}}

    # only the first item yielded by the generator is used
    _, left, right = next(self.from_file(current_file))

    def fit(x):
        return Gaussian(covariance_type='diag').fit(x)

    divergences = [fit(xL).divergence(fit(xR))
                   for xL, xR in zip(left, right)]
    y = np.array(divergences)

    # each value covers a left and a right window, hence twice the duration
    window = SlidingWindow(duration=2 * self.duration,
                           step=self.step,
                           start=0.)
    return SlidingWindowFeature(y, window)
def __call__(self, sequence=Stream.NoNewData):
    """Accumulate incoming feature sequences into one growing sequence

    Parameters
    ----------
    sequence : SlidingWindowFeature, More, or Stream sentinel
        New sequence. A `More` instance is unwrapped to its `output`.
        It must start exactly where the accumulated buffer ends.

    Returns
    -------
    accumulated : SlidingWindowFeature or Stream sentinel
        Everything accumulated so far, or the sentinel itself when
        `sequence` is `Stream.EndOfStream` or `Stream.NoNewData`.
    """

    if isinstance(sequence, More):
        sequence = sequence.output

    # sentinels are passed through untouched
    if sequence in (Stream.EndOfStream, Stream.NoNewData):
        return sequence

    if not self.initialized_:
        # very first sequence: initialize buffer
        self.initialize(sequence)

    else:
        # check that feature sequence uses the common time base
        new_window = sequence.sliding_window
        assert new_window.duration == self.frames_.duration
        assert new_window.step == self.frames_.step

        # check that first frame is exactly the one that is expected
        next_frame = self.frames_[len(self.buffer_)]
        assert np.allclose(next_frame, new_window[0])

        # append the new samples at the end of buffer
        self.buffer_ = np.concatenate((self.buffer_, sequence.data),
                                      axis=0)

    return SlidingWindowFeature(self.buffer_, self.frames_)
def __call__(self, features, sliding_window=None):
    """Apply global standardization

    Every feature dimension is shifted by its mean and scaled by its
    (unbiased) standard deviation, both computed over all samples.

    Parameters
    ----------
    features : `SlidingWindowFeature` or (n_samples, n_features ) `numpy.ndarray`
        Features.
    sliding_window : `SlidingWindow`, optional
        Not used.

    Returns
    -------
    normalized : `SlidingWindowFeature` or (n_samples, n_features ) `numpy.ndarray`
        Standardized features, of the same kind as the input.
    """

    wrapped_input = isinstance(features, SlidingWindowFeature)
    data = features.data if wrapped_input else features

    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0, ddof=1)

    # avoid division by zero on constant features
    sigma[sigma == 0.0] = 1e-6

    normalized = (data - mu) / sigma

    if not wrapped_input:
        return normalized
    return SlidingWindowFeature(normalized, features.sliding_window)
def preprocess(self, current_file, identifier=None):
    """Pre-compute file-wise X and y

    Frame-level labels are stored in `self.preprocessed_['y'][identifier]`
    and are only computed once per identifier.

    Parameters
    ----------
    current_file : dict
        File providing 'annotated' and 'annotation' entries.
    identifier : optional
        Key under which pre-computed X and y are stored.

    Returns
    -------
    current_file : dict
        File as returned by `self.yaafe_preprocess`.
    """

    current_file = self.yaafe_preprocess(current_file,
                                         identifier=identifier)

    # nothing to do if labels were already computed for this file
    labels = self.preprocessed_.setdefault('y', {})
    if identifier in labels:
        return current_file

    X = self.preprocessed_['X'][identifier]
    sw = X.sliding_window
    n_samples = X.getNumber()

    # one extra row is allocated here and dropped below
    # (presumably to absorb an out-of-range index from loose cropping
    # -- TODO confirm)
    y = np.zeros((n_samples + 1, 2), dtype=np.int8)
    # [0,1] ==> speech / [1, 0] ==> non speech / [0, 0] ==> unknown

    annotated = current_file['annotated']
    annotation = current_file['annotation']
    coverage = annotation.get_timeline().coverage()

    # annotated regions that are not covered by the annotation
    for gap in coverage.gaps(annotated):
        y[sw.crop(gap, mode='loose'), 0] = 1

    # regions covered by the annotation
    for segment in coverage:
        y[sw.crop(segment, mode='loose'), 1] = 1

    labels[identifier] = SlidingWindowFeature(y[:-1], sw)

    return current_file
def apply(self, wav):
    """Apply sequence labeling on a sliding window and average predictions

    Parameter
    ---------
    wav : str
        Path to wav audio file

    Returns
    -------
    predictions : SlidingWindowFeature
        Frame-level predictions, averaged over all sequences that overlap
        each frame.
    """

    # apply sequence labeling to the whole file
    # (only the first item yielded by the generator is used)
    current_file = {'uri': wav, 'medium': {'wav': wav}}
    predictions = next(self.from_file(current_file))
    n_sequences, _, n_classes = predictions.shape

    # estimate total number of frames (over the duration of the whole file)
    # based on feature extractor internal sliding window and file duration
    samples_window = self.feature_extractor.sliding_window()
    n_samples = samples_window.samples(get_wav_duration(wav)) + 3
    # +3 is a hack to avoid later IndexError resulting from rounding error
    # when cropping samples_window

    # k[i] contains the number of sequences that overlap with frame #i
    # (float32 rather than int8: int8 wraps around past 127 overlapping
    # sequences, and float32 keeps the final division in float32)
    k = np.zeros((n_samples, ), dtype=np.float32)

    # y[i] contains the sum of predictions for frame #i
    # over all overlapping samples
    y = np.zeros((n_samples, n_classes), dtype=np.float32)

    # sequence sliding window
    sequence_window = SlidingWindow(duration=self.duration, step=self.step)

    # accumulate predictions over all sequences
    for i in range(n_sequences):

        # position of sequence #i
        window = sequence_window[i]

        # indices of frames overlapped by sequence #i
        indices = samples_window.crop(window,
                                      mode='center',
                                      fixed=self.duration)

        # accumulate predictions
        # TODO - use smarter weights (e.g. Hamming window)
        k[indices] += 1
        y[indices] += predictions[i, :, :]

    # average prediction
    # (frames that were never covered keep their zero value)
    y = y / np.maximum(k, 1)[:, np.newaxis]

    # returns the whole thing as SlidingWindowFeature
    return SlidingWindowFeature(y, samples_window)
def __call__(self, current_file):
    """Read the audio of a file as a sample-level `SlidingWindowFeature`

    Parameters
    ----------
    current_file : dict
        File passed to `read_audio`.

    Returns
    -------
    waveform : SlidingWindowFeature
        Samples returned by `read_audio`, with one window per sample
        (duration and step of one sample period, starting at 0).
    """

    y, sample_rate = read_audio(current_file,
                                sample_rate=self.sample_rate,
                                mono=self.mono)

    sample_period = 1. / sample_rate
    one_sample_per_window = SlidingWindow(start=0.,
                                          duration=sample_period,
                                          step=sample_period)

    return SlidingWindowFeature(y, one_sample_per_window)
def get_features(self, y, sample_rate) -> np.ndarray:
    """Extract features, then slide the model over them

    Parameters
    ----------
    y :
        Waveform, forwarded to the underlying feature extraction.
    sample_rate :
        Sample rate, forwarded to the underlying feature extraction.

    Returns
    -------
    output : np.ndarray
        `data` attribute of what `self.model_.slide` returns.
    """

    extraction = self.feature_extraction_

    # wrap raw features with the sliding window of the extraction
    raw_features = extraction.get_features(y, sample_rate)
    features = SlidingWindowFeature(raw_features,
                                    extraction.sliding_window)

    slid = self.model_.slide(
        features,
        self.chunks_,
        batch_size=self.batch_size,
        device=self.device,
        return_intermediate=self.return_intermediate,
        progress_hook=self.progress_hook)

    return slid.data
def __call__(self, current_file: dict) -> Annotation:
    """Apply overlap detection

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol. Scores are
        obtained through `self._scores(current_file)`. The 'uri' entry is
        optional.

    Returns
    -------
    overlap : `pyannote.core.Annotation`
        Overlap regions.
    """

    ovl_scores = self._scores(current_file)

    # if this check has not been done yet, do it once and for all
    if not hasattr(self, "log_scale_"):
        # heuristic to determine whether scores are log-scaled
        # (log-scaled scores have a negative mean)
        self.log_scale_ = bool(np.nanmean(ovl_scores.data) < 0)

    data = np.exp(ovl_scores.data) if self.log_scale_ \
        else ovl_scores.data

    # overlap vs. non-overlap
    if data.shape[1] > 1:
        # several classes: overlap probability is one minus the
        # score of the first class
        overlap_prob = SlidingWindowFeature(1. - data[:, 0],
                                            ovl_scores.sliding_window)
    else:
        # single score: use it as is
        overlap_prob = SlidingWindowFeature(data,
                                            ovl_scores.sliding_window)

    overlap = self._binarize.apply(overlap_prob)

    # files without a 'uri' are accepted (uri is then set to None)
    # instead of raising a KeyError
    overlap.uri = current_file.get('uri', None)
    return overlap.to_annotation(generator='string', modality='overlap')
def __call__(self, current_file: dict) -> Annotation:
    """Apply speech activity detection

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol. Scores are
        obtained through `self._scores(current_file)`. The 'uri' entry is
        optional.

    Returns
    -------
    speech : `pyannote.core.Annotation`
        Speech regions.
    """

    sad_scores = self._scores(current_file)
    frames = sad_scores.sliding_window

    # if this check has not been done yet, do it once and for all
    if not hasattr(self, "log_scale_"):
        # heuristic to determine whether scores are log-scaled
        # (log-scaled scores have a negative mean)
        self.log_scale_ = bool(np.nanmean(sad_scores.data) < 0)

    data = sad_scores.data
    if self.log_scale_:
        data = np.exp(data)

    # speech vs. non-speech
    # (with several classes, speech probability is one minus the score
    # of the first class; a single score is used as is)
    several_classes = data.shape[1] > 1
    probability = 1. - data[:, 0] if several_classes else data
    speech_prob = SlidingWindowFeature(probability, frames)

    speech = self._binarize.apply(speech_prob)

    speech.uri = current_file.get('uri')
    return speech.to_annotation(generator='string', modality='speech')
def apply(self, current_file):
    """Detect speech (and, optionally, overlapped speech) regions

    Parameters
    ----------
    current_file : dict
        File for which precomputed scores are available.

    Returns
    -------
    hypothesis
        Speech annotation (string labels) updated with overlap annotation
        (integer labels). Overlap is empty when `self.has_overlap_` is
        falsy.
    """

    # extract precomputed scores
    precomputed = self.precomputed_(current_file)
    frames = precomputed.sliding_window

    # if this check has not been done yet, do it once and for all
    if not hasattr(self, "log_scale_"):
        # heuristic to determine whether scores are log-scaled
        # (log-scaled scores have a negative mean)
        self.log_scale_ = bool(np.nanmean(precomputed.data) < 0)

    data = precomputed.data
    if self.log_scale_:
        data = np.exp(data)

    # speech vs. non-speech
    # (speech probability is one minus the score of the first class)
    speech_prob = SlidingWindowFeature(1. - data[:, 0], frames)
    speech = self.speech_binarize_.apply(speech_prob)

    if not self.has_overlap_:
        # empty timeline
        overlap = Timeline()

    else:
        # overlap vs. non-overlap (overlap score is read from column #2)
        overlap_prob = SlidingWindowFeature(data[:, 2], frames)
        overlap = self.overlap_binarize_.apply(overlap_prob)

        # overlap speech can only happen in speech regions
        overlap = overlap.crop(speech)

    speech_annotation = speech.to_annotation(generator='string')
    overlap_annotation = overlap.to_annotation(generator='int')

    return speech_annotation.update(overlap_annotation)
def apply(self, current_file, crop=None):
    """Extract embeddings

    Can process either pyannote.database protocol items (as dict) or
    batch of precomputed feature sequences (as numpy array).

    Parameters
    ----------
    current_file : dict or numpy array
        File (from pyannote.database protocol) or batch of precomputed
        feature sequences.
    crop : Segment or Timeline, optional
        When provided, only extract corresponding embeddings.

    Returns
    -------
    embedding : SlidingWindowFeature or numpy array
        Numpy array when `current_file` is a numpy array or when `crop`
        is provided, `SlidingWindowFeature` otherwise.
    """

    # if current_file is in fact a batch of feature sequences
    # use postprocess_ndarray directly.
    if isinstance(current_file, np.ndarray):
        return self.postprocess_ndarray(current_file)

    # HACK: change internal SlidingSegment's source to only extract
    # embeddings on provided "crop". keep track of original source
    # to set it back before the function returns
    source = self.generator.source
    if crop is not None:
        self.generator.source = crop

    try:
        # compute embedding on sliding window
        # over the whole duration of the source
        batches = list(self.from_file(current_file, incomplete=True))
    finally:
        # restore the original source even if extraction fails, so that
        # a failed call does not leave the generator stuck on "crop"
        self.generator.source = source

    if not batches:
        fX = np.zeros((0, self.dimension))
    else:
        fX = np.vstack(batches)

    # embeddings of a cropped source are returned as a raw array
    if crop is not None:
        return fX

    subsequences = SlidingWindow(duration=self.duration, step=self.step)
    return SlidingWindowFeature(fX, subsequences)
def __call__(self, sequence=Stream.NoNewData):
    """Apply the model on an incoming feature sequence

    Parameters
    ----------
    sequence : SlidingWindowFeature, More, or Stream sentinel
        New sequence. A `More` instance is unwrapped to its `output`.

    Returns
    -------
    predicted : SlidingWindowFeature or Stream sentinel
        Model output on the same sliding window as the input (restricted
        to `self.dimension` when it is not None), or the sentinel itself
        when `sequence` is `Stream.NoNewData` or `Stream.EndOfStream`.
    """

    if isinstance(sequence, More):
        sequence = sequence.output

    # sentinels are passed through untouched
    if sequence in (Stream.NoNewData, Stream.EndOfStream):
        return sequence

    # the model is fed a batch made of this one sequence
    batch = sequence.data[np.newaxis, :, :]
    batch_output = self.model.predict(batch, batch_size=1)
    predicted = batch_output[0, :, :]

    selected = self.dimension
    if selected is not None:
        predicted = predicted[:, selected]

    return SlidingWindowFeature(predicted, sequence.sliding_window)
def __call__(self, item):
    """Load precomputed features of a file

    Parameters
    ----------
    item : dict
        File whose features should be loaded.

    Returns
    -------
    features : SlidingWindowFeature
        Precomputed features (memory-mapped when `self.use_memmap` is set).

    Raises
    ------
    PyannoteFeatureExtractionError
        When the feature file expected for `item` does not exist.
    """

    path = Path(self.get_path(item))

    if path.exists():
        # memory-map the file instead of loading it when requested
        # (mmap_mode=None is the regular, fully loaded, behavior)
        mmap_mode = 'r' if self.use_memmap else None
        data = np.load(str(path), mmap_mode=mmap_mode)
        return SlidingWindowFeature(data, self.sliding_window_)

    uri = get_unique_identifier(item)
    msg = f'No precomputed features for "{uri}".'
    raise PyannoteFeatureExtractionError(msg)