def process_current_file(current_file, file_finder=None, precomputed=None,
                         feature_extraction=None, robust=False):

    try:
        current_file['audio'] = file_finder(current_file)
    except ValueError as e:
        if not robust:
            raise PyannoteFeatureExtractionError(*e.args)
        return e

    uri = get_unique_identifier(current_file)
    path = precomputed.get_path(current_file)

    # skip files whose features were already extracted
    if os.path.exists(path):
        return

    try:
        features = feature_extraction(current_file)
    except PyannoteFeatureExtractionError as e:
        msg = 'Feature extraction failed for file "{uri}".'
        return msg.format(uri=uri)

    if features is None:
        msg = 'Feature extraction returned None for file "{uri}".'
        return msg.format(uri=uri)

    if np.any(np.isnan(features.data)):
        msg = 'Feature extraction returned NaNs for file "{uri}".'
        return msg.format(uri=uri)

    precomputed.dump(current_file, features)

    return
def __call__(self, current_file) -> SlidingWindowFeature:
    """Extract features from file

    Parameters
    ----------
    current_file : dict
        `pyannote.database` file.

    Returns
    -------
    features : `pyannote.core.SlidingWindowFeature`
        Extracted features
    """

    # load waveform, re-sample, convert to mono, augment, normalize
    y, sample_rate = self.raw_audio_(current_file, return_sr=True)

    # compute features
    features = self.get_features(y.data, sample_rate)

    # basic quality check
    if np.any(np.isnan(features)):
        uri = get_unique_identifier(current_file)
        msg = f'Features extracted from "{uri}" contain NaNs.'
        warnings.warn(msg)

    # wrap features in a `SlidingWindowFeature` instance
    return SlidingWindowFeature(features, self.sliding_window)
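# --- usage sketch (illustration only, not part of the original source) ---
# A minimal dummy extractor following the same __call__ contract: it returns a
# (n_frames, n_features) array wrapped in a SlidingWindowFeature. Only
# pyannote.core is assumed here; any concrete pyannote.audio extractor would be
# called the same way on a protocol file dict.
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

class DummyFeatureExtraction:
    def __init__(self, dimension=10, step=0.01, duration=0.025):
        self.dimension = dimension
        self.sliding_window = SlidingWindow(start=0., step=step,
                                            duration=duration)

    def __call__(self, current_file) -> SlidingWindowFeature:
        # pretend we extracted 100 frames of `dimension` features
        data = np.random.randn(100, self.dimension).astype(np.float32)
        return SlidingWindowFeature(data, self.sliding_window)

features = DummyFeatureExtraction()({'uri': 'example', 'audio': 'example.wav'})
print(features.data.shape)  # (100, 10)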
def get_hash(trial_file):
    uri = get_unique_identifier(trial_file)
    try_with = trial_file['try_with']
    if isinstance(try_with, Timeline):
        segments = tuple(try_with)
    else:
        segments = (try_with, )
    return hash((uri, segments))
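# Illustration (hypothetical file dicts, assuming the get_hash helper above):
# the hash identifies a (file, segments) pair, so a model computed for an
# enrolment excerpt can be reused when the exact same excerpt shows up later
# as a trial.
from pyannote.core import Segment, Timeline

enrolment = {'database': 'Demo', 'uri': 'file1',
             'try_with': Timeline([Segment(0, 10), Segment(20, 30)])}
trial = {'database': 'Demo', 'uri': 'file1',
         'try_with': Timeline([Segment(0, 10), Segment(20, 30)])}
assert get_hash(enrolment) == get_hash(trial)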
def apply(self, current_file):
    """Compute predictions on a sliding window

    Parameters
    ----------
    current_file : dict

    Returns
    -------
    predictions : SlidingWindowFeature
    """

    # frame and sub-sequence sliding windows
    frames = self.feature_extraction.sliding_window()

    batches = [batch for batch in self.from_file(current_file,
                                                 incomplete=True)]
    if not batches:
        data = np.zeros((0, self.dimension), dtype=np.float32)
        return SlidingWindowFeature(data, frames)

    fX = np.vstack(batches)

    subsequences = SlidingWindow(duration=self.duration, step=self.step)

    # get total number of frames
    if isinstance(self.feature_extraction, Precomputed):
        n_frames, _ = self.feature_extraction.shape(current_file)
    else:
        uri = get_unique_identifier(current_file)
        n_frames, _ = self.preprocessed_[uri].data.shape

    # data[i] is the sum of all predictions for frame #i
    data = np.zeros((n_frames, self.dimension), dtype=np.float32)

    # k[i] is the number of sequences that overlap with frame #i
    k = np.zeros((n_frames, 1), dtype=np.int8)

    for subsequence, fX_ in zip(subsequences, fX):

        # indices of frames overlapped by subsequence
        indices = frames.crop(subsequence,
                              mode='center',
                              fixed=self.duration)

        # accumulate the outputs
        data[indices] += fX_

        # keep track of the number of overlapping sequences
        # TODO - use smarter weights (e.g. Hamming window)
        k[indices] += 1

    # compute average prediction for each frame
    data = data / np.maximum(k, 1)

    return SlidingWindowFeature(data, frames)
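# Toy illustration (self-contained, hypothetical numbers) of the overlap-add
# averaging performed above: predictions from overlapping windows are summed
# per frame and then divided by how many windows covered each frame.
import numpy as np

n_frames, dimension = 10, 2
data = np.zeros((n_frames, dimension), dtype=np.float32)
k = np.zeros((n_frames, 1), dtype=np.int8)

# two overlapping "sub-sequences", each covering 6 frames
windows = [(0, 6), (4, 10)]
for start, end in windows:
    prediction = np.ones((end - start, dimension), dtype=np.float32)
    data[start:end] += prediction
    k[start:end] += 1

data = data / np.maximum(k, 1)   # frames 4-5 were covered twice
print(k.ravel())                 # [1 1 1 1 2 2 1 1 1 1]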
def preprocess(self, current_file):
    """On-demand feature extraction

    Parameters
    ----------
    current_file : dict
        Generated by a pyannote.database.Protocol

    Returns
    -------
    current_file : dict
        Current file with additional "features" entry

    Notes
    -----
    Does nothing when self.feature_extraction is a
    pyannote.audio.features.Precomputed instance.
    """

    # if "features" are precomputed on disk, do nothing
    # as "process_segment" will load just the part we need
    if isinstance(self.feature_extraction, Precomputed):
        return current_file

    # if (by chance) current_file already contains "features"
    # do nothing.
    if 'features' in current_file:
        return current_file

    # if we get there, it means that we need to extract features
    # for current_file. let's create a cache to store them...
    if not hasattr(self, 'preprocessed_'):
        self.preprocessed_ = LRUCache(maxsize=CACHE_MAXSIZE)

    # this is the key that will be used to know if "features"
    # already exist in cache
    uri = get_unique_identifier(current_file)

    # if "features" are not cached for current file,
    # compute and cache them...
    if uri not in self.preprocessed_:
        features = self.feature_extraction(current_file)
        self.preprocessed_[uri] = features

    # create copy of current_file to prevent "features"
    # from consuming increasing memory...
    preprocessed = dict(current_file)

    # add "features" key
    preprocessed['features'] = self.preprocessed_[uri]

    return preprocessed
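# Self-contained sketch (illustrative only, hypothetical names) of the caching
# pattern used above: memoize an expensive per-file computation in a
# cachetools.LRUCache keyed by the file's unique identifier.
from cachetools import LRUCache

CACHE_MAXSIZE = 12          # hypothetical size
cache = LRUCache(maxsize=CACHE_MAXSIZE)

def expensive_features(uri):
    return f"features({uri})"   # stand-in for real feature extraction

def get_features(uri):
    if uri not in cache:
        cache[uri] = expensive_features(uri)
    return cache[uri]

print(get_features("Demo/file1"))   # computed
print(get_features("Demo/file1"))   # served from cache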
def check(protocol_name, file_finder, experiment_dir):

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
def fun(threshold):
    _metric = DiarizationPurityCoverageFMeasure(weighted=False)
    for current_file in getattr(_protocol, subset)():
        uri = get_unique_identifier(current_file)
        uem = get_annotated(current_file)
        reference = current_file["annotation"]
        clusters = fcluster(Z[uri], threshold, criterion="distance")
        hypothesis = Annotation(uri=uri)
        for (start_time, end_time), cluster in zip(t[uri], clusters):
            hypothesis[Segment(start_time, end_time)] = cluster
        _ = _metric(reference, hypothesis, uem=uem)
    return 1.0 - abs(_metric)
def process_current_file(current_file, file_finder=None, precomputed=None,
                         feature_extraction=None, normalization=None,
                         robust=False):

    try:
        current_file['audio'] = file_finder(current_file)
    except ValueError as e:
        if not robust:
            raise PyannoteFeatureExtractionError(*e.args)
        return e

    uri = get_unique_identifier(current_file)
    path = precomputed.get_path(current_file)

    # skip files whose features were already extracted
    if os.path.exists(path):
        return

    try:
        features = feature_extraction(current_file)
    except PyannoteFeatureExtractionError as e:
        msg = 'Feature extraction failed for file "{uri}".'
        return msg.format(uri=uri)

    if features is None:
        msg = 'Feature extraction returned None for file "{uri}".'
        return msg.format(uri=uri)

    if np.any(np.isnan(features.data)):
        msg = 'Feature extraction returned NaNs for file "{uri}".'
        return msg.format(uri=uri)

    if normalization is not None:
        features = normalization(features)

    precomputed.dump(current_file, features)

    return
def apply(self, protocol_name, output_dir):

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    mkdir_p(output_dir)
    path = Path(output_dir) / f'{protocol_name}.txt'

    with open(path, mode='w') as fp:

        for current_file in FileFinder.protocol_file_iter(
                protocol, extra_keys=['audio']):

            uri = get_unique_identifier(current_file)
            hypothesis = self.pipeline_.apply(current_file)

            if isinstance(hypothesis, Timeline):
                for s in hypothesis:
                    fp.write(f'{uri} {s.start:.3f} {s.end:.3f}\n')
                continue

            for s, t, l in hypothesis.itertracks(yield_label=True):
                fp.write(f'{uri} {s.start:.3f} {s.end:.3f} {t} {l}\n')
def fun(threshold):
    binarizer = Binarize(onset=threshold, offset=threshold,
                         log_scale=False)
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)
    metric = DetectionErrorRate()

    # NOTE -- embarrassingly parallel
    # TODO -- parallelize this
    file_generator = getattr(protocol, subset)()
    for current_file in file_generator:
        uri = get_unique_identifier(current_file)
        hypothesis = binarizer.apply(
            predictions[uri], dimension=0).to_annotation()
        reference = current_file['annotation']
        uem = get_annotated(current_file)
        _ = metric(reference, hypothesis, uem=uem)

    return abs(metric)
def initialize(self, protocol, subset='train'):
    """Gather the following information about the training subset:

    data_ : dict
        {'segments': <list of annotated segments>,
         'duration': <total duration of annotated segments>,
         'current_file': <protocol dictionary>,
         'y': <labels as numpy array>}

    databases_ : list
        Sorted list of (unique) databases in protocol.

    labels_ : list
        Sorted list of (unique) labels in protocol.
    """

    self.data_ = {}
    labels, databases = set(), set()

    # loop once on all files
    for current_file in getattr(protocol, subset)():

        # keep track of database
        database = current_file['database']
        databases.add(database)

        # keep track of unique labels
        for label in current_file['annotation'].labels():
            label = get_label_identifier(label, current_file)
            labels.add(label)

        annotated = get_annotated(current_file)

        if not self.precomputed.use_memmap:
            msg = ('Loading all precomputed features in memory. '
                   'Set "use_memmap" to True if you run out of memory.')
            warnings.warn(msg)

        segments = [s for s in annotated
                    if s.duration > self.duration]

        # corner case where no segment is long enough
        # and we removed them all...
        if not segments:
            continue

        # total duration of label in current_file (after removal of
        # short segments).
        duration = sum(s.duration for s in segments)

        # store all these in data_ dictionary
        datum = {'segments': segments,
                 'duration': duration,
                 'current_file': current_file}
        uri = get_unique_identifier(current_file)
        self.data_[uri] = datum

    self.databases_ = sorted(databases)
    self.labels_ = sorted(labels)

    sliding_window = self.precomputed.sliding_window()

    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file)
        # skip files that were removed above because no segment was long enough
        if uri not in self.data_:
            continue
        y, _ = to_numpy(current_file, self.precomputed,
                        labels=self.labels_)
        self.data_[uri]['y'] = SlidingWindowFeature(
            self.postprocess_y(y), sliding_window)
def _validate_epoch_diarization(
    self,
    epoch,
    validation_data,
    protocol=None,
    subset: Subset = "development",
    device: Optional[torch.device] = None,
    batch_size: int = 32,
    n_jobs: int = 1,
    duration: float = None,
    step: float = 0.25,
    metric: str = None,
    **kwargs,
):

    # initialize embedding extraction
    pretrained = Pretrained(
        validate_dir=self.validate_dir_,
        epoch=epoch,
        duration=duration,
        step=step,
        batch_size=batch_size,
        device=device,
    )

    preprocessors = self.preprocessors_
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    _protocol = get_protocol(protocol, preprocessors=preprocessors)

    Z, t = dict(), dict()
    min_d, max_d = np.inf, -np.inf

    for current_file in getattr(_protocol, subset)():

        uri = get_unique_identifier(current_file)
        uem = get_annotated(current_file)
        reference = current_file["annotation"]

        X_, t_ = [], []
        embedding = pretrained(current_file)

        for i, (turn, _) in enumerate(reference.itertracks()):

            # extract embedding for current speech turn
            x_ = embedding.crop(turn, mode="center")
            if len(x_) < 1:
                x_ = embedding.crop(turn, mode="loose")
            if len(x_) < 1:
                msg = f"No embedding for {turn} in {uri:s}."
                raise ValueError(msg)

            # each speech turn is represented by its average embedding
            X_.append(np.mean(x_, axis=0))
            t_.append(turn)

        X_ = np.array(X_)

        # apply hierarchical agglomerative clustering
        # all the way up to just one cluster (ie complete dendrogram)
        D = pdist(X_, metric=metric)
        min_d = min(np.min(D), min_d)
        max_d = max(np.max(D), max_d)

        Z[uri] = linkage(X_, method="pool", metric=metric)
        t[uri] = np.array(t_)

    def fun(threshold):
        _metric = DiarizationPurityCoverageFMeasure(weighted=False)
        for current_file in getattr(_protocol, subset)():
            uri = get_unique_identifier(current_file)
            uem = get_annotated(current_file)
            reference = current_file["annotation"]
            clusters = fcluster(Z[uri], threshold, criterion="distance")
            hypothesis = Annotation(uri=uri)
            for (start_time, end_time), cluster in zip(t[uri], clusters):
                hypothesis[Segment(start_time, end_time)] = cluster
            _ = _metric(reference, hypothesis, uem=uem)
        return 1.0 - abs(_metric)

    res = scipy.optimize.minimize_scalar(
        fun, bounds=(0.0, 1.0), method="bounded", options={"maxiter": 10}
    )

    threshold = res.x.item()

    return {
        "metric": "diarization_fscore",
        "minimize": False,
        "value": float(1.0 - res.fun),
    }
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):

    target_precision = self.precision

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration

    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    predictions = {}
    references = {}

    file_generator = getattr(protocol, subset)()
    for current_file in file_generator:

        uri = get_unique_identifier(current_file)

        # build overlap reference
        reference = Timeline(uri=uri)
        annotation = current_file['annotation']
        for track1, track2 in annotation.co_iter(annotation):
            if track1 == track2:
                continue
            reference.add(track1[0] & track2[0])
        references[uri] = reference.to_annotation()

        # extract overlap scores
        scores = sequence_labeling.apply(current_file)

        if model.logsoftmax:
            scores = SlidingWindowFeature(
                np.exp(scores.data[:, 2]), scores.sliding_window)
        else:
            scores = SlidingWindowFeature(
                scores.data[:, 2], scores.sliding_window)

        predictions[uri] = scores

    # dichotomic search to find threshold that maximizes recall
    # while having at least `target_precision`
    lower_alpha = 0.
    upper_alpha = 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_recall = 0.

    for _ in range(10):

        current_alpha = .5 * (lower_alpha + upper_alpha)
        binarizer = Binarize(onset=current_alpha,
                             offset=current_alpha,
                             log_scale=False)

        precision = DetectionPrecision()
        recall = DetectionRecall()

        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            reference = references[uri]
            hypothesis = binarizer.apply(predictions[uri], dimension=0)
            hypothesis = hypothesis.to_annotation()
            uem = get_annotated(current_file)
            _ = precision(reference, hypothesis, uem=uem)
            _ = recall(reference, hypothesis, uem=uem)

        if abs(precision) < target_precision:
            # precision is not high enough: try higher thresholds
            lower_alpha = current_alpha
        else:
            upper_alpha = current_alpha
            r = abs(recall)
            if r > best_recall:
                best_recall = r
                best_alpha = current_alpha

    task = 'overlap_speech_detection'
    metric_name = f'{task}/recall@{target_precision:.2f}precision'
    return {
        metric_name: {'minimize': False, 'value': best_recall},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
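# Standalone sketch (hypothetical precision/recall curves) of the dichotomic
# search used above. It assumes precision grows and recall shrinks as the
# threshold increases, which is what makes bisection applicable.
def precision_at(alpha):
    return alpha            # toy: precision grows with threshold

def recall_at(alpha):
    return 1.0 - alpha      # toy: recall shrinks with threshold

target_precision = 0.8
lower, upper, best_alpha, best_recall = 0.0, 1.0, 0.5, 0.0
for _ in range(10):
    alpha = 0.5 * (lower + upper)
    if precision_at(alpha) < target_precision:
        lower = alpha                      # need a higher threshold
    else:
        upper = alpha                      # precision is fine, try lower
        if recall_at(alpha) > best_recall:
            best_recall, best_alpha = recall_at(alpha), alpha
print(best_alpha, best_recall)             # converges to ~0.8 and ~0.2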
def apply_iter(self, current_file, hypothesis,
               partial=True, device=None, log_dir=None):
    """Yield re-segmentation results for each epoch

    Parameters
    ----------
    current_file : pyannote.database dict
        Currently processed file
    hypothesis : pyannote.core.Annotation
        Input segmentation
    partial : bool, optional
        Set to False to only yield final re-segmentation.
        Set to True to yield re-segmentation after each epoch.
    device : torch.device, optional
        Defaults to torch.device('cpu')
    log_dir : str, optional
        Path to log directory.

    Yields
    ------
    resegmented : pyannote.core.Annotation
        Resegmentation results after each epoch.
    """

    device = torch.device('cpu') if device is None else device

    current_file = dict(current_file)
    current_file['annotation'] = hypothesis

    # set `per_epoch` attribute to current file annotated duration
    self.per_epoch = get_annotated(current_file).duration()

    # number of speakers + 1 for non-speech
    self.n_classes_ = len(hypothesis.labels()) + 1

    model = StackedRNN(self.precomputed.dimension(), self.n_classes,
                       rnn=self.rnn, recurrent=self.recurrent,
                       linear=self.linear,
                       bidirectional=self.bidirectional,
                       logsoftmax=True)

    # initialize dummy protocol that has only one file
    protocol = self.get_dummy_protocol(current_file)

    if log_dir is None:
        log_dir = tempfile.mkdtemp()
    uri = get_unique_identifier(current_file)
    log_dir = f'{log_dir}/{uri}'

    self.scores_ = collections.deque([], maxlen=self.ensemble)

    iterations = self.fit_iter(
        model, self.precomputed, protocol, subset='train',
        restart=0, epochs=self.epochs, learning_rate='auto',
        get_optimizer=SGD, get_scheduler=ConstantScheduler,
        log_dir=log_dir, device=device)

    for i, iteration in enumerate(iterations):

        # if 'partial', compute scores for every iteration
        # if not, compute scores for last 'ensemble' iterations only
        if partial or (i + 1 > self.epochs - self.ensemble):
            iteration_score = self._score(iteration['model'],
                                          current_file, device=device)
            self.scores_.append(iteration_score)

        # if 'partial', generate (and yield) hypothesis
        if partial:
            hypothesis = self._decode(self.scores_)
            yield hypothesis

    # generate (and yield) final hypothesis in case it's not already
    if not partial:
        hypothesis = self._decode(self.scores_)
        yield hypothesis
def _validate_epoch_verification(self, epoch, protocol_name,
                                 subset='development',
                                 validation_data=None):
    """Perform a speaker verification experiment using model at `epoch`

    Parameters
    ----------
    epoch : int
        Epoch to validate.
    protocol_name : str
        Name of speaker verification protocol
    subset : {'train', 'development', 'test'}, optional
        Name of subset.
    validation_data : provided by `validate_init`

    Returns
    -------
    metrics : dict
    """

    # load current model
    model = self.load_model(epoch).to(self.device)
    model.eval()

    # use user-provided --duration when available
    # otherwise use 'duration' used for training
    if self.duration is None:
        duration = self.task_.duration
    else:
        duration = self.duration
    min_duration = None

    # if 'duration' is still None, it means that
    # network was trained with variable lengths
    if duration is None:
        duration = self.task_.max_duration
        min_duration = self.task_.min_duration

    step = .5 * duration

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_, duration=duration,
        step=step, min_duration=min_duration,
        batch_size=self.batch_size, device=self.device)

    metrics = {}
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    enrolment_models, enrolment_khashes = {}, {}
    enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
    for i, enrolment in enumerate(enrolments):
        data = sequence_embedding.apply(enrolment,
                                        crop=enrolment['enrol_with'])
        model_id = enrolment['model_id']
        model = np.mean(np.stack(data), axis=0, keepdims=True)
        enrolment_models[model_id] = model

        # in some specific speaker verification protocols,
        # enrolment data may be used later as trial data.
        # therefore, we cache information about enrolment data
        # to speed things up by reusing the enrolment as trial
        h = hash((get_unique_identifier(enrolment),
                  tuple(enrolment['enrol_with'])))
        enrolment_khashes[h] = model_id

    trial_models = {}
    trials = getattr(protocol, '{0}_trial'.format(subset))()
    y_true, y_pred = [], []
    for i, trial in enumerate(trials):
        model_id = trial['model_id']

        h = hash((get_unique_identifier(trial),
                  tuple(trial['try_with'])))

        # re-use enrolment model whenever possible
        if h in enrolment_khashes:
            model = enrolment_models[enrolment_khashes[h]]

        # re-use trial model whenever possible
        elif h in trial_models:
            model = trial_models[h]

        else:
            data = sequence_embedding.apply(trial, crop=trial['try_with'])
            model = np.mean(data, axis=0, keepdims=True)
            # cache trial model for later re-use
            trial_models[h] = model

        distance = cdist(enrolment_models[model_id], model,
                         metric=self.metric)[0, 0]
        y_pred.append(distance)
        y_true.append(trial['reference'])

    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)
    metrics['EER'] = {'minimize': True, 'value': eer}

    return metrics
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):

    target_purity = self.purity

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration

    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # extract predictions for all files.
    predictions = {}
    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file)
        predictions[uri] = sequence_labeling.apply(current_file)

    # dichotomic search to find alpha that maximizes coverage
    # while having at least `target_purity`
    lower_alpha = 0.
    upper_alpha = 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_coverage = 0.

    for _ in range(10):

        current_alpha = .5 * (lower_alpha + upper_alpha)
        peak = Peak(alpha=current_alpha, min_duration=0.0,
                    log_scale=model.logsoftmax)

        metric = DiarizationPurityCoverageFMeasure()

        # NOTE -- embarrassingly parallel
        # TODO -- parallelize this
        for current_file in getattr(protocol, subset)():
            reference = current_file['annotation']
            uri = get_unique_identifier(current_file)
            hypothesis = peak.apply(predictions[uri], dimension=1)
            hypothesis = hypothesis.to_annotation()
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)

        purity, coverage, _ = metric.compute_metrics()

        if purity < target_purity:
            upper_alpha = current_alpha
        else:
            lower_alpha = current_alpha
            if coverage > best_coverage:
                best_coverage = coverage
                best_alpha = current_alpha

    task = 'speaker_change_detection'
    metric_name = f'{task}/coverage@{target_purity:.2f}purity'
    return {
        metric_name: {'minimize': False, 'value': best_coverage},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
from pyannote.metrics.diarization import GreedyDiarizationErrorRate

metric1 = GreedyDiarizationErrorRate(parallel=False)
metric2 = GreedyDiarizationErrorRate(parallel=False, collar=0.500,
                                     skip_overlap=True)
metric3 = GreedyDiarizationErrorRate(parallel=False, collar=0.500,
                                     skip_overlap=False)

from optimize_cluster import speaker_diarization
from pyannote.audio.features import Precomputed

file_list = []
for current_file in getattr(protocol, subset)():
    uri = get_unique_identifier(current_file).split('/')[1]
    hypothesis = diarization_res[uri]
    reference = current_file['annotation']
    current_file['prediction'] = hypothesis
    file_list.append(current_file)
    uem = get_annotated(current_file)
    metric1(reference, hypothesis, uem=uem)
    metric2(reference, hypothesis, uem=uem)
    metric3(reference, hypothesis, uem=uem)

print(abs(metric1))
print(abs(metric2))
print(abs(metric3))

config_yml = arguments['<config_yml>']
models_dir = arguments['<models_dir>']
def _load_metadata(self, protocol, subset: Subset = "train") -> float:
    """Load training set metadata

    This function is called once at instantiation time, returns the total
    training set duration, and populates the following attributes:

    Attributes
    ----------
    data_ : dict
        {'segments': <list of annotated segments>,
         'duration': <total duration of annotated segments>,
         'current_file': <protocol dictionary>,
         'y': <labels as numpy array>}

    segment_labels_ : list
        Sorted list of (unique) labels in protocol.

    file_labels_ : dict of list
        Sorted lists of (unique) file labels in protocol

    Returns
    -------
    duration : float
        Total duration of annotated segments, in seconds.
    """

    self.data_ = {}
    segment_labels, file_labels = set(), dict()

    # loop once on all files
    files = getattr(protocol, subset)()
    for current_file in tqdm(files, desc="Loading labels", unit="file"):

        # ensure annotation/annotated are cropped to actual file duration
        support = Segment(start=0, end=current_file["duration"])
        current_file["annotated"] = get_annotated(current_file).crop(
            support, mode="intersection"
        )
        current_file["annotation"] = current_file["annotation"].crop(
            support, mode="intersection"
        )

        # keep track of unique segment labels
        segment_labels.update(current_file["annotation"].labels())

        # keep track of unique file labels
        for key, value in current_file.items():
            if isinstance(value, (Annotation, Timeline, SlidingWindowFeature)):
                continue
            if key not in file_labels:
                file_labels[key] = set()
            file_labels[key].add(value)

        segments = [
            s for s in current_file["annotated"] if s.duration > self.duration
        ]

        # corner case where no segment is long enough
        # and we removed them all...
        if not segments:
            continue

        # total duration of label in current_file (after removal of
        # short segments).
        duration = sum(s.duration for s in segments)

        # store all these in data_ dictionary
        datum = {
            "segments": segments,
            "duration": duration,
            "current_file": current_file,
        }
        uri = get_unique_identifier(current_file)
        self.data_[uri] = datum

    self.file_labels_ = {k: sorted(file_labels[k]) for k in file_labels}
    self.segment_labels_ = sorted(segment_labels)

    for uri in list(self.data_):
        current_file = self.data_[uri]["current_file"]
        y = self.initialize_y(current_file)
        self.data_[uri]["y"] = y
        if self.mask is not None:
            mask = current_file[self.mask]
            current_file[self.mask] = mask.align(y)

    return sum(datum["duration"] for datum in self.data_.values())
def train(self, current_file, batch_size=32):

    def generator(xs, ys, batch_size, shuffle=True):
        length = len(xs)
        idxs = list(range(length))
        if shuffle:
            random.shuffle(idxs)
        while True:
            tmp = []
            for i in idxs:
                tmp.append(i)
                if len(tmp) == batch_size:
                    xbatch = np.vstack([xs[i] for i in tmp])
                    ybatch = np.vstack([ys[i] for i in tmp])
                    tmp = []
                    yield xbatch, ybatch

    duration = self.config_['sequences']['duration']
    step = self.config_['sequences']['step']

    current_file['features'] = self.feature_precomputed(current_file)
    realignment_generator = RealignmentBatchGenerator(duration=duration,
                                                      step=step,
                                                      batch_size=1,
                                                      source=self.source)
    bg = realignment_generator.from_file(current_file)
    xys = [(x, y) for x, y in bg]
    xs = [x for x, _ in xys]
    ys = [y for _, y in xys]

    input_shape = realignment_generator.input_shape
    n_classes = realignment_generator.n_classes

    # architecture
    architecture_name = self.config_['architecture']['name']
    models = __import__('pyannote.audio.labeling.models',
                        fromlist=[architecture_name])
    Architecture = getattr(models, architecture_name)
    params = self.config_['architecture'].get('params', {})
    params['n_classes'] = n_classes
    self.architecture_ = Architecture(**params)

    train_total = sum(
        [end - start for start, end in current_file['annotated']])
    steps_per_epoch = int(np.ceil((train_total / step) / batch_size))

    if self.models_dir is None:
        return SequenceLabeling.train(
            input_shape, self.architecture_,
            generator(xs, ys, batch_size, shuffle=True),
            steps_per_epoch, self.num_epoch,
            optimizer=SSMORMS3(), log_dir=None)
    else:
        return SequenceLabeling.train(
            input_shape, self.architecture_,
            generator(xs, ys, batch_size, shuffle=True),
            steps_per_epoch, self.num_epoch, optimizer=SSMORMS3(),
            log_dir=self.models_dir + get_unique_identifier(current_file))
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration

    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    metric = DetectionErrorRate()

    predictions = {}

    file_generator = getattr(protocol, subset)()
    for current_file in file_generator:
        uri = get_unique_identifier(current_file)
        scores = sequence_labeling.apply(current_file)
        if model.logsoftmax:
            scores = SlidingWindowFeature(
                1. - np.exp(scores.data[:, 0]), scores.sliding_window)
        else:
            scores = SlidingWindowFeature(
                1. - scores.data[:, 0], scores.sliding_window)
        predictions[uri] = scores

    def fun(threshold):
        binarizer = Binarize(onset=threshold, offset=threshold,
                             log_scale=False)
        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)
        metric = DetectionErrorRate()

        # NOTE -- embarrassingly parallel
        # TODO -- parallelize this
        file_generator = getattr(protocol, subset)()
        for current_file in file_generator:
            uri = get_unique_identifier(current_file)
            hypothesis = binarizer.apply(
                predictions[uri], dimension=0).to_annotation()
            reference = current_file['annotation']
            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

        return abs(metric)

    res = scipy.optimize.minimize_scalar(
        fun, bounds=(0., 1.), method='bounded', options={'maxiter': 10})

    return {
        'speech_activity_detection/error':
            {'minimize': True, 'value': res.fun},
        'speech_activity_detection/threshold':
            {'minimize': 'NA', 'value': res.x}}