def __init__(self, feature_extraction, sad__pre, scd__pre, emb__pre,
             sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
             scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
             emb__internal=False,
             cls__damping=0.8, cls__preference=-20, cls__metric='cosine'):
    """Record the hyper-parameters and instantiate every pipeline stage.

    Parameters
    ----------
    feature_extraction
        Object exposing ``sliding_window()``; stored as-is.
    sad__pre, scd__pre, emb__pre
        Arguments handed to ``Precomputed`` to build ``self.sad_``,
        ``self.scd_`` and ``self.emb_`` respectively.
    sad__onset, sad__offset
        Forwarded to ``Binarize`` as ``onset`` / ``offset``.
    sad__dimension, scd__dimension, emb__internal
        Only stored as attributes by this constructor.
    scd__alpha, scd__min_duration
        Forwarded to ``Peak`` as ``alpha`` / ``min_duration``.
    cls__damping, cls__preference, cls__metric
        Forwarded to ``my_cluster.ClusteringAP``.
    """
    super(SpeakerDiarizationPreStages, self).__init__()

    self.feature_extraction = feature_extraction

    # hyper-parameters, one group per stage
    # -- speech activity detection
    self.sad__onset, self.sad__offset = sad__onset, sad__offset
    self.sad__dimension = sad__dimension
    # -- speaker change detection
    self.scd__alpha, self.scd__min_duration = scd__alpha, scd__min_duration
    self.scd__dimension = scd__dimension
    # -- speech turn embedding
    self.emb__internal = emb__internal
    # -- clustering
    self.cls__damping = cls__damping
    self.cls__preference = cls__preference
    self.cls__metric = cls__metric

    # NOTE(review): this value is never read afterwards; the lookup is kept
    # because it still raises early when `feature_extraction` has no usable
    # `sliding_window()`.
    frame_step = self.feature_extraction.sliding_window().step

    # stage 1: speech activity detection (precomputed scores + binarizer)
    self.sad_ = Precomputed(sad__pre)
    self.sad_binarize_ = Binarize(onset=sad__onset, offset=sad__offset)

    # stage 2: speaker change detection (precomputed scores + peak picker)
    self.scd_ = Precomputed(scd__pre)
    peak_options = {
        'alpha': scd__alpha,
        'min_duration': scd__min_duration,
        'percentile': False,
    }
    self.scd_peak_ = Peak(**peak_options)

    # stage 3: speech turn embedding (precomputed)
    self.emb_ = Precomputed(emb__pre)

    # stage 4: affinity-propagation clustering
    clustering_options = {
        'metric': cls__metric,
        'damping': cls__damping,
        'preference': cls__preference,
    }
    self.cls_ = my_cluster.ClusteringAP(**clustering_options)
def __init__(self, feature_extraction, sad__pre, scd__pre, emb__pre,
             sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
             scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
             emb__internal=False,
             cls__method='average', cls__threshold=5, cls__metric='cosine'):
    """Record the hyper-parameters and instantiate every pipeline stage.

    Parameters
    ----------
    feature_extraction
        Object exposing ``sliding_window()``; stored as-is.
    sad__pre, scd__pre, emb__pre
        Arguments handed to ``Precomputed`` to build ``self.sad_``,
        ``self.scd_`` and ``self.emb_`` respectively.
    sad__onset, sad__offset
        Forwarded to ``Binarize`` as ``onset`` / ``offset``.
    sad__dimension, scd__dimension, emb__internal
        Only stored as attributes by this constructor.
    scd__alpha, scd__min_duration
        Forwarded to ``Peak`` as ``alpha`` / ``min_duration``.
    cls__method, cls__threshold, cls__metric
        Forwarded to ``my_cluster.ClusteringHAC``.
    """
    super(SpeakerDiarizationHACPre, self).__init__()

    self.feature_extraction = feature_extraction

    # hyper-parameters, one group per stage
    # -- speech activity detection
    self.sad__onset, self.sad__offset = sad__onset, sad__offset
    self.sad__dimension = sad__dimension
    # -- speaker change detection
    self.scd__alpha, self.scd__min_duration = scd__alpha, scd__min_duration
    self.scd__dimension = scd__dimension
    # -- speech turn embedding
    self.emb__internal = emb__internal
    # -- clustering
    self.cls__method = cls__method
    self.cls__threshold = cls__threshold
    self.cls__metric = cls__metric

    # NOTE(review): this value is never read afterwards; the lookup is kept
    # because it still raises early when `feature_extraction` has no usable
    # `sliding_window()`.
    frame_step = self.feature_extraction.sliding_window().step

    # stage 1: speech activity detection (precomputed scores + binarizer)
    self.sad_ = Precomputed(sad__pre)
    self.sad_binarize_ = Binarize(onset=sad__onset, offset=sad__offset)

    # stage 2: speaker change detection (precomputed scores + peak picker)
    self.scd_ = Precomputed(scd__pre)
    peak_options = {
        'alpha': scd__alpha,
        'min_duration': scd__min_duration,
        'percentile': False,
    }
    self.scd_peak_ = Peak(**peak_options)

    # stage 3: speech turn embedding (precomputed)
    self.emb_ = Precomputed(emb__pre)

    # stage 4: hierarchical agglomerative clustering
    clustering_options = {
        'metric': cls__metric,
        'method': cls__method,
        'threshold': cls__threshold,
    }
    self.cls_ = my_cluster.ClusteringHAC(**clustering_options)
def main():
    """Command-line entry point: binarize precomputed overlap scores.

    Expects exactly two positional arguments, ``database`` and
    ``raw_score_path``. For every file of the protocol's test subset (or
    development subset when ``--dev`` is given), the precomputed scores are
    binarized with the ``--onset`` / ``--offset`` thresholds and the
    resulting regions are passed to ``write_txt`` together with the handle
    of ``--outputfile``.
    """
    usage = "%prog [options] database, raw_score_path"
    desc = "Write the output of the binary overlap detector into test based on a threshold"
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    parser.add_option("-t", "--onset", action="store", type="float",
                      help="Onset Threshold", default=0.70)
    parser.add_option("-f", "--offset", action="store", type="float",
                      help="Offset Threshold", default=0.70)
    parser.add_option("-d", "--dev", action="store_true",
                      help="Print output based on development set",
                      default=False)
    parser.add_option("-o", "--outputfile", action="store", type="string",
                      help="Output file", default="./overlap.txt")
    (opt, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the usage message and exits
        parser.error("Incorrect number of arguments")
    database, raw_score_path = args

    # get the protocol whose files will be processed
    protocol = get_protocol(database)

    # load precomputed overlap scores as pyannote.core.SlidingWindowFeature
    precomputed = Precomputed(raw_score_path)

    # initialize binarizer
    # onset / offset are tunable parameters (and should be tuned for better
    # performance). we use log_scale=True because of the final log-softmax
    # in the StackedRNN model
    binarize = Binarize(onset=opt.onset, offset=opt.offset, log_scale=True)

    # the context manager guarantees the output file is closed even when
    # scoring one of the files raises
    with open(opt.outputfile, 'wt') as fw:
        # both subsets are processed identically; only the iterator differs
        files = protocol.development() if opt.dev else protocol.test()
        for test_file in files:
            ovl_scores = precomputed(test_file)

            # binarize overlap scores to obtain overlap regions as
            # pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']

            # write the output into text
            write_txt(fw, ovl_regions)
def with_params(self, **params):
    """(Re)build the binarizers from prefixed keyword arguments.

    Keyword arguments whose name starts with ``speech_`` are passed to
    ``Binarize`` (prefix removed) to build ``self.speech_binarize_``.
    When ``self.has_overlap_`` is true, those starting with ``overlap_``
    are used the same way to build ``self.overlap_binarize_``.
    Other keyword arguments are ignored.

    Returns
    -------
    self
    """

    def strip_prefix(prefix):
        # keep only the options carrying `prefix`, with the prefix removed
        selected = {}
        for name, value in params.items():
            if name.startswith(prefix):
                selected[name[len(prefix):]] = value
        return selected

    # speech/non-speech binarizer
    self.speech_binarize_ = Binarize(**strip_prefix('speech_'))

    # overlap binarizer
    if self.has_overlap_:
        self.overlap_binarize_ = Binarize(**strip_prefix('overlap_'))

    return self
def with_params(self, sad_onset=0.7, sad_offset=0.7,
                scd_alpha=0.5, scd_min_duration=1.):
    """(Re)build the SAD and SCD stages with the given hyper-parameters.

    Parameters
    ----------
    sad_onset, sad_offset : float, optional
        Stored on the instance and forwarded to ``Binarize``.
    scd_alpha, scd_min_duration : float, optional
        Stored on the instance and forwarded to ``Peak``.

    Returns
    -------
    self
    """
    # speech activity detection: score reader, then thresholds + binarizer
    self.sad_ = Precomputed(self.sad)
    self.sad_onset, self.sad_offset = sad_onset, sad_offset
    self.sad_binarize_ = Binarize(onset=self.sad_onset,
                                  offset=self.sad_offset)

    # speaker change detection: score reader, then settings + peak picker
    self.scd_ = Precomputed(self.scd)
    self.scd_alpha, self.scd_min_duration = scd_alpha, scd_min_duration
    self.scd_peak_ = Peak(alpha=self.scd_alpha,
                          min_duration=self.scd_min_duration)

    return self
def with_params(self, **params):
    """(Re)build the binarizers from prefixed keyword arguments.

    Keyword arguments whose name starts with ``speech_`` are passed to
    ``Binarize`` (prefix removed) to build ``self.speech_binarize_``.
    When ``self.has_overlap_`` is true, those starting with ``overlap_``
    are used the same way to build ``self.overlap_binarize_``.
    Other keyword arguments are ignored.

    Returns
    -------
    self
    """

    def strip_prefix(prefix):
        # keep only the options carrying `prefix`, with the prefix removed
        selected = {}
        for name, value in params.items():
            if name.startswith(prefix):
                selected[name[len(prefix):]] = value
        return selected

    # speech/non-speech binarizer
    self.speech_binarize_ = Binarize(**strip_prefix('speech_'))

    # overlap binarizer
    if self.has_overlap_:
        self.overlap_binarize_ = Binarize(**strip_prefix('overlap_'))

    return self
def fun(threshold):
    """Return the accumulated detection error rate for one threshold.

    ``threshold`` is used as both onset and offset of a ``Binarize``
    instance. ``protocol_name``, ``subset``, ``predictions`` and ``self``
    are taken from the enclosing scope.
    """
    binarizer = Binarize(onset=threshold, offset=threshold, log_scale=False)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    error_rate = DetectionErrorRate()

    # NOTE -- embarrassingly parallel
    # TODO -- parallelize this
    iter_subset = getattr(protocol, subset)
    for current_file in iter_subset():
        uri = get_unique_identifier(current_file)
        timeline = binarizer.apply(predictions[uri], dimension=0)
        hypothesis = timeline.to_annotation()
        reference = current_file['annotation']
        uem = get_annotated(current_file)
        # called for its accumulation side effect; per-file value unused
        error_rate(reference, hypothesis, uem=uem)

    # abs() of the metric object yields the value accumulated over files
    return abs(error_rate)
def tune_binarizer(app, epoch, protocol_name, subset='development'):
    """Tune binarizer

    Parameters
    ----------
    app : SpeechActivityDetection
    epoch : int
        Epoch number.
    protocol_name : str
        E.g. 'Etape.SpeakerDiarization.TV'
    subset : {'train', 'development', 'test'}, optional
        Defaults to 'development'.

    Returns
    -------
    params : dict
        See Binarize.tune
    metric : float
        Best achieved detection error rate
    """
    # protocol providing the files used for tuning
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=app.preprocessors_)

    # model saved for the requested epoch
    labeler = SequenceLabeling.from_disk(app.train_dir_, epoch)

    # sliding-window aggregation configured from the 'sequences' section
    sequences = app.config_['sequences']
    aggregation = SequenceLabelingAggregation(
        labeler, app.feature_extraction_,
        duration=sequences['duration'], step=sequences['step'])
    aggregation.cache_preprocessed_ = False

    # tune Binarize thresholds (onset & offset)
    # with respect to detection error rate
    files = getattr(protocol, subset)()
    best_params, best_metric = Binarize.tune(
        files, aggregation.apply,
        get_metric=DetectionErrorRate, dimension=1)

    return best_params, best_metric
def annotate_speakers(self, filename, gui, visualization=True):
    """Segment an audio file into speech turns and group them by speaker.

    Parameters
    ----------
    filename
        Stored under the 'audio' key of the file dict handed to
        ``self.sad``, ``self.scd`` and ``self.emb``.
    gui
        Object exposing ``append_line``; receives the speaker-count message.
    visualization : bool, optional
        When true, a scatter plot of sorted pairwise distances and a
        six-row SAD/SCD figure are shown.

    Returns
    -------
    The first element returned by ``self.min_spanning_tree``.
    """
    # NOTE(review): 'uri' is the literal string 'filename', not the argument
    # -- confirm this is intended
    test_file = {'uri': 'filename', 'audio': filename}

    # speech activity detection: raw scores, then binarization
    sad_scores = self.sad(test_file)
    binarize = Binarize(offset=0.5, onset=0.70, log_scale=True)
    speech = binarize.apply(sad_scores, dimension=1)

    # speaker change detection: raw scores, then peak detection
    scd_scores = self.scd(test_file)
    peak = Peak(alpha=0.1, min_duration=1, log_scale=True)
    partition = peak.apply(scd_scores, dimension=1)

    # restrict the SCD partition to the detected speech regions
    speech_turns = partition.crop(speech)

    embeddings = self.emb(test_file)

    # only keep speech turns longer than 1.1 (in the unit of `duration`)
    long_turns = Timeline(
        segments=[s for s in speech_turns if s.duration > 1.1])

    # one (segment, mean embedding, number of embeddings) tuple per turn
    res = []
    for segment in long_turns:
        x = embeddings.crop(segment, mode='strict')
        # skip turns for which no embedding was cropped
        if x.size == 0:
            continue
        n_sample = x.shape[0]
        x = np.mean(x, axis=0)
        # skip turns whose mean embedding contains NaN
        if np.any(np.isnan(x)):
            continue
        res.append((segment, x, n_sample))

    if visualization:
        # distances between every pair of mean embeddings, plotted sorted
        dist = []
        for i in range(len(res)):
            for j in range(i + 1, len(res)):
                dist.append(l2_dist(res[i][1], res[j][1]))
        fig, ax = plt.subplots()
        ax.scatter(np.arange(len(dist)), np.array(sorted(dist)))
        fig.show()

        # let's visualize SAD and SCD results using pyannote.core visualization API
        # helper function to make visualization prettier
        # (exponentiates the scores and drops their first column)
        plot_ready = lambda scores: SlidingWindowFeature(
            np.exp(scores.data[:, 1:]), scores.sliding_window)

        # create a figure with 6 rows with matplotlib
        nrows = 6
        fig, ax = plt.subplots(nrows=nrows, ncols=1)
        fig.set_figwidth(20)
        fig.set_figheight(nrows * 2)

        # 1st row: reference annotation
        # (disabled: `test_file` built above has no 'annotation' entry,
        # so ax[0] stays empty)
        # notebook.plot_annotation(test_file['annotation'], ax=ax[0])
        # ax[0].text(notebook.crop.start + 0.5, 0.1, 'reference', fontsize=14)

        # 2nd row: SAD raw scores
        notebook.plot_feature(plot_ready(sad_scores), ax=ax[1])
        ax[1].text(notebook.crop.start + 0.5, 0.6, 'SAD\nscores', fontsize=14)
        ax[1].set_ylim(-0.1, 1.1)

        # 3rd row: SAD result
        notebook.plot_timeline(speech, ax=ax[2])
        ax[2].text(notebook.crop.start + 0.5, 0.1, 'SAD', fontsize=14)

        # 4th row: SCD raw scores
        notebook.plot_feature(plot_ready(scd_scores), ax=ax[3])
        ax[3].text(notebook.crop.start + 0.5, 0.3, 'SCD\nscores', fontsize=14)
        ax[3].set_ylim(-0.1, 0.6)

        # 5th row: SCD result
        notebook.plot_timeline(partition, ax=ax[4])
        ax[4].text(notebook.crop.start + 0.5, 0.1, 'SCD', fontsize=14)

        # 6th row: combination of SAD and SCD
        notebook.plot_timeline(speech_turns, ax=ax[5])
        ax[5].text(notebook.crop.start + 0.5, 0.1, 'speech turns', fontsize=14)

        fig.show()

    # group the turns; the helper also reports how many speakers it found
    res, num_people = self.min_spanning_tree(res)
    gui.append_line('There are {} people in this audio'.format(num_people))

    return res
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    precomputed : str
        Path to precomputed SAD scores.
    **kwargs
        Forwarded to `with_params`.
    """

    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()

        self.precomputed = precomputed
        self.precomputed_ = Precomputed(self.precomputed)

        # overlap handling is enabled for 3-dimensional scores only
        dimension = self.precomputed_.dimension()
        self.has_overlap_ = dimension == 3

        self.with_params(**kwargs)

    def get_tune_space(self):
        """Return the search space: six `speech_*` entries, plus six
        `overlap_*` entries when overlap handling is enabled."""
        prefixes = ['speech']
        if self.has_overlap_:
            prefixes.append('overlap')

        space = {}
        for prefix in prefixes:
            space[prefix + '_onset'] = chocolate.uniform(0., 1.)
            space[prefix + '_offset'] = chocolate.uniform(0., 1.)
            space[prefix + '_min_duration_on'] = chocolate.uniform(0., 2.)
            space[prefix + '_min_duration_off'] = chocolate.uniform(0., 2.)
            space[prefix + '_pad_onset'] = chocolate.uniform(-1., 1.)
            space[prefix + '_pad_offset'] = chocolate.uniform(-1., 1.)
        return space

    def get_tune_metric(self):
        """Return a fresh `DetectionErrorRate` instance."""
        return DetectionErrorRate()

    def with_params(self, **params):
        """(Re)build the binarizers from `speech_*` / `overlap_*` keyword
        arguments (prefix removed before calling `Binarize`); returns self.
        """

        def strip_prefix(prefix):
            selected = {}
            for name, value in params.items():
                if name.startswith(prefix):
                    selected[name[len(prefix):]] = value
            return selected

        # speech/non-speech binarizer
        self.speech_binarize_ = Binarize(**strip_prefix('speech_'))

        # overlap binarizer
        if self.has_overlap_:
            self.overlap_binarize_ = Binarize(**strip_prefix('overlap_'))

        return self

    def apply(self, current_file):
        """Binarize the precomputed scores of `current_file`.

        Returns the result of updating the speech annotation with the
        overlap annotation.
        """
        scores = self.precomputed_(current_file)
        window = scores.sliding_window

        # decided on the first file only, then reused:
        # heuristic -- a negative mean is taken to mean log-scaled scores
        if not hasattr(self, "log_scale_"):
            self.log_scale_ = bool(np.nanmean(scores.data) < 0)

        if self.log_scale_:
            data = np.exp(scores.data)
        else:
            data = scores.data

        # speech vs. non-speech: one minus the first column
        speech = self.speech_binarize_.apply(
            SlidingWindowFeature(1. - data[:, 0], window))

        if not self.has_overlap_:
            # empty timeline
            overlap = Timeline()
        else:
            # overlap vs. non-overlap: third column
            overlap = self.overlap_binarize_.apply(
                SlidingWindowFeature(data[:, 2], window))
            # overlap speech can only happen in speech regions
            overlap = overlap.crop(speech)

        speech_annotation = speech.to_annotation(generator='string')
        overlap_annotation = overlap.to_annotation(generator='int')
        return speech_annotation.update(overlap_annotation)
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):
    """Measure overlap-detection recall at a target precision for one epoch.

    The model of `epoch` is applied to every file of `subset`; a dichotomic
    search (10 iterations over [0, 1]) then looks for the threshold that
    gives the best recall among those reaching `self.precision`.

    Parameters
    ----------
    epoch
        Passed to `self.load_model`.
    protocol_name
        Passed to `get_protocol`.
    subset : str, optional
        Name of the protocol method yielding the files.
    validation_data
        Not used by this implementation.

    Returns
    -------
    dict
        Two entries: recall at the target precision, and the threshold.
    """
    target_precision = self.precision

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # windows of `duration`, shifted by a quarter of that duration
    duration = self.task_.duration
    step = .25 * duration
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    predictions, references = {}, {}

    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file)

        # build overlap reference: intersections of distinct co-occurring tracks
        overlap_reference = Timeline(uri=uri)
        annotation = current_file['annotation']
        for track1, track2 in annotation.co_iter(annotation):
            if track1 != track2:
                overlap_reference.add(track1[0] & track2[0])
        references[uri] = overlap_reference.to_annotation()

        # extract overlap scores (third column), undoing the log if needed
        scores = sequence_labeling.apply(current_file)
        if model.logsoftmax:
            overlap_scores = np.exp(scores.data[:, 2])
        else:
            overlap_scores = scores.data[:, 2]
        predictions[uri] = SlidingWindowFeature(overlap_scores,
                                                scores.sliding_window)

    # dichotomic search to find threshold that maximizes recall
    # while having at least `target_precision`
    lower_alpha, upper_alpha = 0., 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_recall = 0.

    for _ in range(10):
        current_alpha = .5 * (lower_alpha + upper_alpha)
        binarizer = Binarize(onset=current_alpha, offset=current_alpha,
                             log_scale=False)

        precision = DetectionPrecision()
        recall = DetectionRecall()

        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            reference = references[uri]
            hypothesis = binarizer.apply(
                predictions[uri], dimension=0).to_annotation()
            uem = get_annotated(current_file)
            # called for accumulation only; per-file values are unused
            precision(reference, hypothesis, uem=uem)
            recall(reference, hypothesis, uem=uem)

        if abs(precision) < target_precision:
            # precision is not high enough: try higher thresholds
            lower_alpha = current_alpha
            continue

        # precision target reached: try lower thresholds next, and
        # remember this one if it improves recall
        upper_alpha = current_alpha
        current_recall = abs(recall)
        if current_recall > best_recall:
            best_recall = current_recall
            best_alpha = current_alpha

    task = 'overlap_speech_detection'
    metric_name = f'{task}/recall@{target_precision:.2f}precision'
    results = {}
    results[metric_name] = {'minimize': False, 'value': best_recall}
    results[f'{task}/threshold'] = {'minimize': 'NA', 'value': best_alpha}
    return results
class NeuralSegmentation(Pipeline):
    """Segmentation pipeline combining precomputed SAD and SCD scores.

    Parameters
    ----------
    sad, scd
        Paths; each is user-expanded and resolved, and must exist.
    **kwargs
        Forwarded to `with_params`.
    """

    def __init__(self, sad=None, scd=None, **kwargs):
        super().__init__()

        self.sad = Path(sad).expanduser().resolve(strict=True)
        self.scd = Path(scd).expanduser().resolve(strict=True)

        self.with_params(**kwargs)

    def get_tune_space(self):
        """Return the search space of the four hyper-parameters."""
        space = {}
        space['sad_onset'] = chocolate.uniform(0., 1.)
        space['sad_offset'] = chocolate.uniform(0., 1.)
        space['scd_alpha'] = chocolate.uniform(0., 1.)
        space['scd_min_duration'] = chocolate.uniform(0., 5.)
        return space

    def get_tune_metric(self):
        """Not available for this pipeline."""
        raise NotImplementedError()

    def with_params(self, sad_onset=0.7, sad_offset=0.7,
                    scd_alpha=0.5, scd_min_duration=1.):
        """(Re)build the SAD and SCD stages; returns self."""
        # speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset, self.sad_offset = sad_onset, sad_offset
        self.sad_binarize_ = Binarize(onset=self.sad_onset,
                                      offset=self.sad_offset)

        # speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha, self.scd_min_duration = scd_alpha, scd_min_duration
        self.scd_peak_ = Peak(alpha=self.scd_alpha,
                              min_duration=self.scd_min_duration)

        return self

    def _as_probabilities(self, soft, flag_name):
        """Return `soft.data`, exponentiated when flagged as log-scaled.

        The flag is stored on the instance under `flag_name` the first
        time it is needed (negative mean => log-scaled) and reused after.
        """
        if not hasattr(self, flag_name):
            setattr(self, flag_name, bool(np.nanmean(soft.data) < 0))
        if getattr(self, flag_name):
            return np.exp(soft.data)
        return soft.data

    def apply(self, current_file):
        """Return the speech turns of `current_file`.

        SCD peaks are cropped by the binarized SAD output, then by the
        result of `get_annotated(current_file)`.
        """
        # -- Speech Activity Detection --
        soft_sad = self.sad_(current_file)
        sad_data = self._as_probabilities(soft_sad, 'sad_log_scale_')

        # one minus the first column: supports both non-speech/speech
        # & non-speech/single/overlap
        speech_prob = SlidingWindowFeature(1. - sad_data[:, 0],
                                           soft_sad.sliding_window)
        hard_sad = self.sad_binarize_.apply(speech_prob)

        # -- Speaker Change Detection --
        soft_scd = self.scd_(current_file)
        scd_data = self._as_probabilities(soft_scd, 'scd_log_scale_')

        # take the final dimension
        # (in order to support both classification and regression scores)
        change_prob = SlidingWindowFeature(scd_data[:, -1],
                                           soft_scd.sliding_window)
        hard_scd = self.scd_peak_.apply(change_prob)

        # only process the annotated part of the speech regions
        within_speech = hard_scd.crop(hard_sad)
        return within_speech.crop(get_annotated(current_file))
class SpeakerDiarizationWeighted(object):
    """Speaker diarization from precomputed scores, with weighted embeddings.

    Speech turns come from precomputed SAD and SCD scores; each turn is
    represented by the mean of its embeddings multiplied by
    ``1 - weight``, and turns are clustered with
    ``my_cluster.ClusteringAP``.
    """

    def __init__(self, feature_extraction, sad__pre, scd__pre, weight__pre,
                 emb__pre,
                 sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
                 scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8, cls__preference=-20,
                 cls__metric='cosine'):
        super(SpeakerDiarizationWeighted, self).__init__()

        self.feature_extraction = feature_extraction

        # hyper-parameters, one group per stage
        # -- speech activity detection
        self.sad__onset, self.sad__offset = sad__onset, sad__offset
        self.sad__dimension = sad__dimension
        # -- speaker change detection
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension
        # -- speech turn embedding
        self.emb__internal = emb__internal
        # -- clustering
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        # NOTE(review): never read afterwards; the lookup is kept because
        # it still raises early on an unusable `feature_extraction`.
        frame_step = self.feature_extraction.sliding_window().step

        # speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=sad__onset, offset=sad__offset)

        # speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=scd__alpha,
                              min_duration=scd__min_duration,
                              percentile=False)

        # weights
        self.weight_ = Precomputed(weight__pre)

        # speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=cls__metric,
                                            damping=cls__damping,
                                            preference=cls__preference)

    def __call__(self, current_file, annotated=False):
        """Diarize `current_file`.

        Returns an `Annotation` mapping each kept speech turn to its
        cluster label, or None when no speech turn is left to cluster.
        When `annotated` is true, speech turns are first cropped by
        `get_annotated(current_file)`.
        """
        # speech activity detection
        hard_sad = self.sad_binarize_.apply(
            self.sad_(current_file), dimension=self.sad__dimension)

        # speaker change detection
        hard_scd = self.scd_peak_.apply(
            self.scd_(current_file), dimension=self.scd__dimension)

        # speech turns
        candidates = hard_scd.crop(hard_sad)
        if annotated:
            candidates = candidates.crop(get_annotated(current_file))

        # remove speech turns for which no embedding is available
        emb = self.emb_(current_file)
        speech_turns = []
        for turn in candidates:
            if len(emb.crop(turn, mode='loose')) > 0:
                speech_turns.append(turn)

        # weights
        weight = self.weight_(current_file)

        # one weighted mean embedding per speech turn
        to_stack = []
        for turn in speech_turns:
            turn_emb = emb.crop(turn, mode='loose')
            turn_weight = weight.crop(turn, mode='loose')
            to_stack.append(np.mean(turn_emb * (1 - turn_weight), axis=0))

        if not to_stack:
            return None

        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for turn, label in zip(speech_turns, cluster_labels):
            hypothesis[turn] = label

        return hypothesis
class SpeakerDiarizationHACPre(object):
    """Speaker diarization with hierarchical agglomerative clustering

    Speech turns come from precomputed SAD and SCD scores; each turn is
    represented by the sum of its precomputed embeddings, and turns are
    clustered with ``my_cluster.ClusteringHAC``.
    """

    def __init__(self, feature_extraction, sad__pre, scd__pre, emb__pre,
                 sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
                 scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
                 emb__internal=False,
                 cls__method='average', cls__threshold=5,
                 cls__metric='cosine'):
        super(SpeakerDiarizationHACPre, self).__init__()

        self.feature_extraction = feature_extraction

        # hyper-parameters, one group per stage
        # -- speech activity detection
        self.sad__onset, self.sad__offset = sad__onset, sad__offset
        self.sad__dimension = sad__dimension
        # -- speaker change detection
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension
        # -- speech turn embedding
        self.emb__internal = emb__internal
        # -- clustering
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        # NOTE(review): never read afterwards; the lookup is kept because
        # it still raises early on an unusable `feature_extraction`.
        frame_step = self.feature_extraction.sliding_window().step

        # speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=sad__onset, offset=sad__offset)

        # speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=scd__alpha,
                              min_duration=scd__min_duration,
                              percentile=False)

        # speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=cls__metric,
                                             method=cls__method,
                                             threshold=cls__threshold)

    def __call__(self, current_file, annotated=False):
        """Diarize `current_file`.

        Returns an `Annotation` mapping each kept speech turn to its
        cluster label, or None when no speech turn is left to cluster.
        When `annotated` is true, speech turns are first cropped by
        `get_annotated(current_file)`.
        """
        # speech activity detection
        hard_sad = self.sad_binarize_.apply(
            self.sad_(current_file), dimension=self.sad__dimension)

        # speaker change detection
        hard_scd = self.scd_peak_.apply(
            self.scd_(current_file), dimension=self.scd__dimension)

        # speech turns
        candidates = hard_scd.crop(hard_sad)
        if annotated:
            candidates = candidates.crop(get_annotated(current_file))

        # remove speech turns for which no embedding is available
        emb = self.emb_(current_file)
        speech_turns = []
        for turn in candidates:
            if len(emb.crop(turn, mode='loose')) > 0:
                speech_turns.append(turn)

        # one summed embedding per speech turn
        to_stack = []
        for turn in speech_turns:
            to_stack.append(np.sum(emb.crop(turn, mode='loose'), axis=0))

        if not to_stack:
            return None

        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for turn, label in zip(speech_turns, cluster_labels):
            hypothesis[turn] = label

        return hypothesis
def test(dataset, medium_template, config_yml, weights_h5, output_dir):
    """Evaluate a trained sequence-labeling model on one protocol subset.

    For every file of the subset, predictions are aggregated, binarized
    and scored. One line per file (error rate, accuracy, precision,
    recall, f-measure) is written to
    ``{output_dir}/eval.{dataset}.{subset}.txt``, followed by one line
    accumulated over the whole subset. Each binarized hypothesis is also
    dumped to ``{output_dir}/{uri}.json``.

    Parameters
    ----------
    dataset : str
        Four dot-separated fields: database, task, protocol, subset.
    medium_template
        Passed to ``get_database``.
    config_yml : str
        Path to the YAML configuration file. Must provide the
        'feature_extraction' and 'testing' sections read below.
    weights_h5 : str
        Path to the model weights; 'architecture.yml' is looked up two
        directory levels above it.
    output_dir : str
        Directory receiving the evaluation file and JSON dumps.

    Raises
    ------
    NotImplementedError
        If the protocol has no method named after the requested subset.
    """
    # load configuration file
    # safe_load: the config only needs plain mappings/scalars, and
    # yaml.load() without an explicit Loader is rejected by recent PyYAML
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # this is where model architecture was saved
    # (os.path.join avoids producing '/architecture.yml' when the
    # grand-parent directory name is empty)
    architecture_yml = os.path.join(
        os.path.dirname(os.path.dirname(weights_h5)), 'architecture.yml')

    # -- DATASET --
    db, task, protocol_name, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'protocol "{protocol}" has no subset "{subset}"'.format(
                protocol=protocol_name, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    # input sequence duration
    duration = config['feature_extraction']['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**config['feature_extraction']['mfcc'])
    # normalization
    normalize = config['feature_extraction']['normalize']

    # -- TESTING --
    # overlap ratio between each window
    overlap = config['testing']['overlap']
    step = duration * (1. - overlap)

    # prediction smoothing: use the configured thresholds
    # (they used to be read and then ignored in favor of a fixed 0.5)
    onset = config['testing']['binarize']['onset']
    offset = config['testing']['binarize']['offset']
    binarizer = Binarize(onset=onset, offset=offset)

    sequence_labeling = SequenceLabeling.from_disk(
        architecture_yml, weights_h5)

    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extractor, normalize=normalize,
        duration=duration, step=step)

    # all four metrics share the same collar
    collar = 0.500
    error_rate = DetectionErrorRate(collar=collar)
    accuracy = DetectionAccuracy(collar=collar)
    precision = DetectionPrecision(collar=collar)
    recall = DetectionRecall(collar=collar)

    LINE = '{uri} {e:.3f} {a:.3f} {p:.3f} {r:.3f} {f:.3f}\n'

    eval_path = '{output_dir}/eval.{dataset}.{subset}.txt'.format(
        output_dir=output_dir, dataset=dataset, subset=subset)

    with open(eval_path, 'w') as fp:

        header = '# uri error accuracy precision recall f_measure\n'
        fp.write(header)
        fp.flush()

        for current_file in file_generator:

            uri = current_file['uri']
            wav = current_file['medium']['wav']
            annotated = current_file['annotated']
            annotation = current_file['annotation']

            predictions = aggregation.apply(wav)
            hypothesis = binarizer.apply(predictions, dimension=1)

            # per-file values; the metric objects also accumulate them
            e = error_rate(annotation, hypothesis, uem=annotated)
            a = accuracy(annotation, hypothesis, uem=annotated)
            p = precision(annotation, hypothesis, uem=annotated)
            r = recall(annotation, hypothesis, uem=annotated)
            f = f_measure(p, r)

            line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
            fp.write(line)
            fp.flush()

            # keep the hypothesis next to the evaluation file
            json_path = '{output_dir}/{uri}.json'.format(
                output_dir=output_dir, uri=uri)
            dump_to(hypothesis, json_path)

        # average on whole corpus
        uri = '{dataset}.{subset}'.format(dataset=dataset, subset=subset)
        e = abs(error_rate)
        a = abs(accuracy)
        p = abs(precision)
        r = abs(recall)
        f = f_measure(p, r)
        line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
        fp.write(line)
        fp.flush()
def apply(self, protocol_name, subset='test'):
    """Apply the tuned model to a protocol subset and save the results.

    For every file of the subset, aggregated predictions are stored as an
    HDF5 file at the path given by ``Precomputed.get_path``; a metadata
    file is written once at ``Precomputed.get_config_path``. The stored
    predictions are then binarized with the tuned onset/offset and
    written to the file at ``self.HARD_MDTM`` through ``MDTMParser``.

    Parameters
    ----------
    protocol_name : str
        Passed to ``get_protocol``.
    subset : str, optional
        Name of the protocol method yielding the files.
        Defaults to 'test'.
    """

    def write_window_attrs(h5_file, window):
        # sliding window description shared by metadata and data files
        h5_file.attrs['start'] = window.start
        h5_file.attrs['duration'] = window.duration
        h5_file.attrs['step'] = window.step
        # NOTE(review): dimension is hard-coded to 2 -- confirm it
        # always matches `prediction.data`
        h5_file.attrs['dimension'] = 2

    apply_dir = self.APPLY_DIR.format(tune_dir=self.tune_dir_)
    mkdir_p(apply_dir)

    # load tuning results
    # NOTE(review): yaml.load() without an explicit Loader is unsafe on
    # untrusted input and is rejected by recent PyYAML releases. It is
    # left unchanged because the tuning file may contain non-plain YAML
    # tags that safe_load would refuse -- confirm and switch if possible.
    tune_yml = self.TUNE_YML.format(tune_dir=self.tune_dir_)
    with io.open(tune_yml, 'r') as fp:
        self.tune_ = yaml.load(fp)

    # load model for epoch 'epoch'
    epoch = self.tune_['epoch']
    sequence_labeling = SequenceLabeling.from_disk(self.train_dir_, epoch)

    # initialize sequence labeling
    duration = self.config_['sequences']['duration']
    step = self.config_['sequences']['step']
    aggregation = SequenceLabelingAggregation(
        sequence_labeling, self.feature_extraction_,
        duration=duration, step=step)

    # initialize protocol
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for i, item in enumerate(getattr(protocol, subset)()):

        prediction = aggregation.apply(item)

        if i == 0:
            # create metadata file at root that contains
            # sliding window and dimension information
            path = Precomputed.get_config_path(apply_dir)
            # explicit append mode: h5py's implicit default changed from
            # append to read-only in 3.0; `with` closes the file even
            # when writing fails
            with h5py.File(path, mode='a') as f:
                write_window_attrs(f, prediction.sliding_window)

        path = Precomputed.get_path(apply_dir, item)

        # create parent directory
        mkdir_p(dirname(path))

        with h5py.File(path, mode='a') as f:
            write_window_attrs(f, prediction.sliding_window)
            f.create_dataset('features', data=prediction.data)

    # initialize binarizer
    onset = self.tune_['onset']
    offset = self.tune_['offset']
    binarize = Binarize(onset=onset, offset=offset)

    # read back the predictions that were just written
    precomputed = Precomputed(root_dir=apply_dir)

    writer = MDTMParser()
    path = self.HARD_MDTM.format(apply_dir=apply_dir,
                                 protocol=protocol_name,
                                 subset=subset)
    with io.open(path, mode='w') as gp:
        for item in getattr(protocol, subset)():
            prediction = precomputed(item)
            segmentation = binarize.apply(prediction, dimension=1)
            writer.write(segmentation.to_annotation(),
                         f=gp, uri=item['uri'], modality='speaker')
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    precomputed : str
        Path to precomputed SAD scores.
    **kwargs
        Forwarded to `with_params`.
    """

    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()

        self.precomputed = precomputed
        self.precomputed_ = Precomputed(self.precomputed)

        # overlap handling is enabled for 3-dimensional scores only
        dimension = self.precomputed_.dimension()
        self.has_overlap_ = dimension == 3

        self.with_params(**kwargs)

    def get_tune_space(self):
        """Return the search space: six `speech_*` entries, plus six
        `overlap_*` entries when overlap handling is enabled."""
        prefixes = ['speech']
        if self.has_overlap_:
            prefixes.append('overlap')

        space = {}
        for prefix in prefixes:
            space[prefix + '_onset'] = chocolate.uniform(0., 1.)
            space[prefix + '_offset'] = chocolate.uniform(0., 1.)
            space[prefix + '_min_duration_on'] = chocolate.uniform(0., 2.)
            space[prefix + '_min_duration_off'] = chocolate.uniform(0., 2.)
            space[prefix + '_pad_onset'] = chocolate.uniform(-1., 1.)
            space[prefix + '_pad_offset'] = chocolate.uniform(-1., 1.)
        return space

    def get_tune_metric(self):
        """Return a fresh `DetectionErrorRate` instance."""
        return DetectionErrorRate()

    def with_params(self, **params):
        """(Re)build the binarizers from `speech_*` / `overlap_*` keyword
        arguments (prefix removed before calling `Binarize`); returns self.
        """

        def strip_prefix(prefix):
            selected = {}
            for name, value in params.items():
                if name.startswith(prefix):
                    selected[name[len(prefix):]] = value
            return selected

        # speech/non-speech binarizer
        self.speech_binarize_ = Binarize(**strip_prefix('speech_'))

        # overlap binarizer
        if self.has_overlap_:
            self.overlap_binarize_ = Binarize(**strip_prefix('overlap_'))

        return self

    def apply(self, current_file):
        """Binarize the precomputed scores of `current_file`.

        Returns the result of updating the speech annotation with the
        overlap annotation.
        """
        scores = self.precomputed_(current_file)
        window = scores.sliding_window

        # decided on the first file only, then reused:
        # heuristic -- a negative mean is taken to mean log-scaled scores
        if not hasattr(self, "log_scale_"):
            self.log_scale_ = bool(np.nanmean(scores.data) < 0)

        if self.log_scale_:
            data = np.exp(scores.data)
        else:
            data = scores.data

        # speech vs. non-speech: one minus the first column
        speech = self.speech_binarize_.apply(
            SlidingWindowFeature(1. - data[:, 0], window))

        if not self.has_overlap_:
            # empty timeline
            overlap = Timeline()
        else:
            # overlap vs. non-overlap: third column
            overlap = self.overlap_binarize_.apply(
                SlidingWindowFeature(data[:, 2], window))
            # overlap speech can only happen in speech regions
            overlap = overlap.crop(speech)

        speech_annotation = speech.to_annotation(generator='string')
        overlap_annotation = overlap.to_annotation(generator='int')
        return speech_annotation.update(overlap_annotation)
# protocol = get_protocol('AMI.SpeakerDiarization.MixHeadset', # preprocessors=preprocessors) # test_file = next(protocol.test()) # from pyannote.audio.labeling.extraction import SequenceLabeling sad = SequenceLabeling(model=SAD_MODEL) scd = SequenceLabeling(model=SCD_MODEL) sad_scores = sad(test_file) # # # binarize raw SAD scores (as `pyannote.core.Timeline` instance) # # NOTE: both onset/offset values were tuned on AMI dataset. # # you might need to use different values for better results. from pyannote.audio.signal import Binarize binarize = Binarize(offset=0.94, onset=0.70, log_scale=True) speech = binarize.apply(sad_scores, dimension=1) # # iterate over speech segments (as `pyannote.core.Segment` instances) for segment in speech: print(segment.start, segment.end) # obtain raw SCD scores (as `pyannote.core.SlidingWindowFeature` instance) scd_scores = scd(test_file) # detect peaks and return speaker homogeneous segments # (as `pyannote.core.Annotation` instance) # NOTE: both alpha/min_duration values were tuned on AMI dataset. # you might need to use different values for better results. from pyannote.audio.signal import Peak peak = Peak(alpha=0.08, min_duration=0.40, log_scale=True)