def predict(audio, algorithm='SpectralClustering'):
    """Run speech activity, speaker change and embedding models on `audio`.

    Parameters
    ----------
    audio :
        Input handed unchanged to the module-level `sad`, `scd` and `emb`
        callables (presumably a pyannote file dict -- confirm with callers).
    algorithm : str, optional
        Not used by this function; kept so existing call sites keep working.

    Returns
    -------
    tuple
        ``(long_turns, sad_scores, scd_scores, embeddings)`` where
        `long_turns` is a `Timeline` holding only the speech turns whose
        `duration` is greater than 0.5.
    """
    # --- speech activity detection -------------------------------------
    sad_scores = sad(audio)
    speech_regions = Binarize(
        offset=0.52,
        onset=0.52,
        log_scale=True,
        min_duration_off=0.1,
        min_duration_on=0.1,
    ).apply(sad_scores, dimension=1)

    # --- speaker change detection --------------------------------------
    scd_scores = scd(audio)
    change_partition = Peak(
        alpha=0.10,
        min_duration=0.10,
        log_scale=True,
    ).apply(scd_scores, dimension=1)

    # NOTE: overlapped speech detection is intentionally not run here.

    # --- speech turns and embeddings -----------------------------------
    # Restrict the change-point partition to the detected speech regions.
    turns = change_partition.crop(speech_regions)
    embeddings = emb(audio)

    # Drop turns that are too short to be kept (duration <= 0.5).
    kept = []
    for turn in turns:
        if turn.duration > .5:
            kept.append(turn)
    long_turns = Timeline(segments=kept)

    return long_turns, sad_scores, scd_scores, embeddings
import glob
import os
import pickle

import torch
from pyannote.audio.utils.signal import Binarize
from pyannote.audio.utils.signal import Peak

# Pretrained speech activity detection (sad) and speaker change detection
# (scd) models, obtained through torch.hub.
sad = torch.hub.load('pyannote/pyannote-audio', 'sad_ami')
scd = torch.hub.load('pyannote/pyannote-audio', 'scd_ami')

# All input .wav files are read from, and all .pkl outputs written to,
# this directory.
os.chdir("/root/SpeechLab2020/Debates/demos/")

# The post-processing parameters are the same for every file, so both
# helpers are built once instead of once per loop iteration.
binarize = Binarize(offset=0.52, onset=0.52, log_scale=True,
                    scale='relative',
                    min_duration_off=0.1, min_duration_on=0.1)
peak = Peak(alpha=0.10, min_duration=0.10, scale='relative', log_scale=True)

for file in glob.glob("*.wav"):
    print(file)

    # NOTE(review): 'uri' is the literal string 'filename' for every file;
    # confirm whether the actual file name was intended.
    test_file = {'uri': 'filename', 'audio': file}

    # NOTE(review): everything after the first '.' is dropped, so
    # "a.b.wav" and "a.c.wav" would both map to "a" and overwrite each
    # other's outputs. Kept as-is so existing output names do not change.
    base_name = file.split('.')[0]

    # Speech activity detection -> "<base>_speech.pkl"
    sad_scores = sad(test_file)
    speech = binarize.apply(sad_scores, dimension=1)
    f_out_speech = base_name + "_speech.pkl"
    # Context manager guarantees the handle is flushed and closed.
    with open(f_out_speech, "wb") as speech_handle:
        pickle.dump(speech, speech_handle)

    # Speaker change detection -> "<base>_part.pkl" (pickle protocol 2)
    scd_scores = scd(test_file)
    partition = peak.apply(scd_scores, dimension=1)
    f_out_partition = base_name + "_part.pkl"
    with open(f_out_partition, "wb") as partition_handle:
        pickle.dump(partition, partition_handle, protocol=2)
def initialize(self):
    """Build the peak detector from the current hyper-parameter values.

    Reads `self.alpha` and `self.min_duration` and stores the resulting
    `Peak` instance on `self._peak`.
    """
    peak_settings = {
        'alpha': self.alpha,
        'min_duration': self.min_duration,
    }
    self._peak = Peak(**peak_settings)
syl_counts_prev = pd.concat([syl_counts_prev, rates.syl_counts_prev]) syl_counts_next = pd.concat([syl_counts_next, rates.syl_counts_next]) prev_stretch_duration = pd.concat([prev_stretch_duration, rates.prev_stretch_duration]) next_stretch_duration = pd.concat([next_stretch_duration, rates.next_stretch_duration]) end_time = timeit.default_timer() print("Finished calculating speaking rate for File %d/%d" % ((i+1), n_unique_files)) print("Time: ", end_time - start_time) speaking_rates_prev.to_csv(checkpoint_speaking_rate_prev) speaking_rates_next.to_csv(checkpoint_speaking_rate_next) syl_counts_prev.to_csv(checkpoint_syl_counts_prev) syl_counts_next.to_csv(checkpoint_syl_counts_next) prev_stretch_duration.to_csv(checkpoint_prev_stretch_duration) next_stretch_duration.to_csv(checkpoint_next_stretch_duration) with open(checkpoint_file, 'w') as file: writer = csv.writer(file) writer.writerow([i, source_file]) file.close() if __name__ == '__main__': #pip install git+https://github.com/pyannote/pyannote-audio.git@develop #pip install pyAudioAnalysis df_source = pd.read_pickle(DF_SOURCE_PATH) df_hom = pd.read_csv(DF_HOMOGRAPHS_PATH, index_col = "Unnamed: 0") celex_dict = read_celex_file() scd = torch.hub.load('pyannote/pyannote-audio', 'scd_ami') peak = Peak(alpha=0.2, min_duration=0.20, log_scale=True)
class SpeakerChangeDetection(Pipeline):
    """Speaker change detection pipeline

    Parameters
    ----------
    scores : Wrappable, optional
        Describes how raw speaker change detection scores should be obtained.
        See pyannote.audio.features.wrapper.Wrapper documentation for details.
        Defaults to "@scd_scores" that indicates that protocol files provide
        the scores in the "scd_scores" key.
    purity : `float`, optional
        Target segments purity. Defaults to 0.95.
    fscore : bool, optional
        Optimize (precision/recall) fscore. Defaults to optimizing coverage at
        given target `purity`.
    diarization : bool, optional
        Use diarization purity and coverage. Defaults to segmentation purity
        and coverage.

    Hyper-parameters
    ----------------
    alpha : `float`
        Peak detection threshold.
    min_duration : `float`
        Segment minimum duration.
    """

    def __init__(self, scores: Wrappable = None,
                 purity: Optional[float] = 0.95,
                 fscore: bool = False,
                 diarization: bool = False):
        super().__init__()

        if scores is None:
            scores = "@scd_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.purity = purity
        self.fscore = fscore
        self.diarization = diarization

        # hyper-parameters
        self.alpha = Uniform(0., 1.)
        self.min_duration = Uniform(0., 10.)

    def initialize(self):
        """Initialize pipeline with current set of parameters"""
        self._peak = Peak(alpha=self.alpha,
                          min_duration=self.min_duration)

    def __call__(self, current_file: dict) -> Annotation:
        """Apply change detection

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol. May contain a
            'scd_scores' key providing precomputed scores.

        Returns
        -------
        change : `pyannote.core.Annotation`
            Annotation built from the result of peak detection on the
            speaker change scores.
        """
        scd_scores = self._scores(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled:
            # a negative mean is taken to indicate log-scaled scores
            self.log_scale_ = bool(np.nanmean(scd_scores.data) < 0)

        # peak detection below runs on exponentiated scores when the
        # input was detected as log-scaled
        data = np.exp(scd_scores.data) if self.log_scale_ \
            else scd_scores.data

        # take the final dimension
        # (in order to support both classification, multi-class
        # classification, and regression scores)
        change_prob = SlidingWindowFeature(
            data[:, -1],
            scd_scores.sliding_window)

        # peak detection
        change = self._peak.apply(change_prob)
        change.uri = current_file.get('uri', None)

        return change.to_annotation(generator='string', modality='audio')

    def get_metric(self, parallel=False) -> Union[
            DiarizationPurityCoverageFMeasure,
            SegmentationPurityCoverageFMeasure]:
        """Return new instance of f-score metric

        Raises
        ------
        NotImplementedError
            When `fscore` is False.
        """
        if not self.fscore:
            raise NotImplementedError()

        if self.diarization:
            return DiarizationPurityCoverageFMeasure(parallel=parallel)

        return SegmentationPurityCoverageFMeasure(tolerance=0.5,
                                                  parallel=parallel)

    def loss(self, current_file: dict, hypothesis: Annotation) -> float:
        """Compute (1 - coverage) at target purity

        If purity does not exceed the target, return 1 + (1 - purity)

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        hypothesis : `pyannote.core.Annotation`
            Hypothesized segmentation, as returned by `__call__`.

        Returns
        -------
        error : `float`
            1 - coverage when purity is strictly above the target purity,
            1 + (1 - purity) otherwise.
        """
        # NOTE(review): segmentation purity/coverage is always used here,
        # even when `self.diarization` is True (get_metric honours that
        # flag). Confirm whether loss should follow it as well.
        metric = SegmentationPurityCoverageFMeasure(tolerance=0.500, beta=1)
        reference = current_file['annotation']
        uem = get_annotated(current_file)

        # The call accumulates statistics inside `metric`; its return value
        # is not needed since purity and coverage are read back below.
        metric(reference, hypothesis, uem=uem)
        purity, coverage, _ = metric.compute_metrics()

        if purity > self.purity:
            return 1. - coverage
        else:
            return 1. + (1. - purity)