def __init__(self, sad_model_path, scd_model_path, emb_model_path):
    # speech activity detection (SAD) and speaker change detection (SCD) models
    self.sad = SequenceLabeling(model=sad_model_path)
    self.scd = SequenceLabeling(model=scd_model_path)
    # speaker embedding model, applied on 1s windows sliding every 200ms
    self.emb = SequenceEmbedding(model=emb_model_path, duration=1., step=0.2)
    self.fa = None
def _validate_epoch_segment(self, epoch, protocol_name,
                            subset='development', validation_data=None):

    model = self.load_model(epoch).to(self.device)
    model.eval()

    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_,
        batch_size=self.batch_size, device=self.device)

    fX = sequence_embedding.apply(validation_data['X'])
    y_pred = pdist(fX, metric=self.metric)
    _, _, _, eer = det_curve(validation_data['y'], y_pred, distances=True)

    return {'EER.{0:g}s'.format(self.duration): {'minimize': True,
                                                 'value': eer}}
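# Standalone toy illustration of what the segment-level validation above measures:
# pairwise distances between fixed-length segment embeddings are turned into an
# equal error rate (EER). The embeddings, labels, and the hand-rolled EER below are
# made up for illustration only; the real method relies on `pdist` and `det_curve`
# imported elsewhere in this module.
import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.RandomState(0)
toy_fX = rng.randn(20, 32)                       # 20 toy segment embeddings
toy_labels = rng.randint(0, 4, size=20)          # toy speaker labels

toy_pred = pdist(toy_fX, metric='cosine')                        # distance per segment pair
toy_true = pdist(toy_labels[:, None], metric='cityblock') == 0   # same speaker?

# brute-force EER: threshold where false acceptance ~= false rejection
thresholds = np.sort(toy_pred)
far = np.array([np.mean(toy_pred[~toy_true] <= t) for t in thresholds])
frr = np.array([np.mean(toy_pred[toy_true] > t) for t in thresholds])
idx = np.argmin(np.abs(far - frr))
print('toy EER = {:.3f}'.format((far[idx] + frr[idx]) / 2))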
def _validate_epoch_turn(self, epoch, protocol_name,
                         subset='development', validation_data=None):

    model = self.load_model(epoch).to(self.device)
    model.eval()

    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_,
        batch_size=self.batch_size, device=self.device)

    fX = sequence_embedding.apply(validation_data['X'])

    z = validation_data['z']

    # iterate over segments, speech turn by speech turn
    fX_avg = []
    nz = np.vstack([np.arange(len(z)), z]).T
    for _, nz_ in itertools.groupby(nz, lambda t: t[1]):

        # (n, 2) numpy array where
        # * n is the number of segments in current speech turn
        # * dim #0 is the index of segment in original batch
        # * dim #1 is the index of speech turn (used for grouping)
        nz_ = np.stack(nz_)

        # compute (and stack) average embedding over all segments
        # of current speech turn
        indices = nz_[:, 0]
        fX_avg.append(np.mean(fX[indices], axis=0))

    fX = np.vstack(fX_avg)
    y_pred = pdist(fX, metric=self.metric)
    _, _, _, eer = det_curve(validation_data['y'], y_pred, distances=True)

    metrics = {}
    metrics['EER.turn'] = {'minimize': True, 'value': eer}
    return metrics
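# Standalone toy illustration of the turn-level averaging above: `z` assigns each
# segment to a speech turn, and consecutive segments sharing the same turn index
# are averaged into one turn-level embedding. All values below are made up.
import itertools
import numpy as np

toy_fX = np.arange(12, dtype=float).reshape(6, 2)   # 6 segment embeddings (dim 2)
toy_z = np.array([0, 0, 1, 1, 1, 2])                # speech turn index per segment

toy_avg = []
toy_nz = np.vstack([np.arange(len(toy_z)), toy_z]).T
for _, group in itertools.groupby(toy_nz, lambda t: t[1]):
    group = np.stack(list(group))
    toy_avg.append(np.mean(toy_fX[group[:, 0]], axis=0))

toy_turns = np.vstack(toy_avg)   # shape (3, 2): one embedding per speech turn
print(toy_turns)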
def _file_embedding(self, file_dict: dict,
                    sequence_embedding: SequenceEmbedding,
                    cache: dict):
    file1 = file_dict
    f_hash = self.get_hash(file1)

    if f_hash in cache:
        # embedding for this file was already computed for a previous trial
        emb = cache[f_hash]
    else:
        # extract window-level embeddings on the 'try_with' region only,
        # then average them into a single file-level embedding
        # (keepdims=True so that cdist later receives a 2-d array)
        emb = sequence_embedding.crop(file1, file1['try_with'])
        emb = np.mean(np.stack(emb), axis=0, keepdims=True)
        cache[f_hash] = emb

    return emb
def apply(self, protocol_name, output_dir, step=None):

    model = self.model_.to(self.device)
    model.eval()

    duration = self.duration
    if step is None:
        step = 0.25 * duration

    # do not use memmap as this would lead to too many open files
    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_, duration=duration, step=step,
        batch_size=self.batch_size, device=self.device)

    sliding_window = sequence_embedding.sliding_window
    dimension = sequence_embedding.dimension

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=sliding_window,
                              dimension=dimension)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for current_file in FileFinder.protocol_file_iter(protocol,
                                                      extra_keys=['audio']):
        fX = sequence_embedding.apply(current_file)
        precomputed.dump(current_file, fX)
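# Hedged sketch of how the embeddings dumped above could be read back. It assumes
# that a `Precomputed` instance (pyannote.audio.features.Precomputed) can be called
# on a file dict, as it is used as a drop-in feature extraction elsewhere in this
# code; the root directory and the file dict below are placeholders.
from pyannote.audio.features import Precomputed

precomputed_reader = Precomputed(root_dir='/path/to/output_dir')
some_file = {'uri': 'some_file', 'database': 'SomeDatabase'}
fX = precomputed_reader(some_file)   # pyannote.core.SlidingWindowFeature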
def eval(self, model, partition: str = 'development'):
    model.eval()

    sequence_embedding = SequenceEmbedding(
        model=model,
        feature_extraction=self.config.feature_extraction,
        duration=self.config.duration,
        step=.5 * self.config.duration,
        batch_size=self.batch_size,
        device=common.DEVICE)

    protocol = get_protocol(self.config.protocol_name, progress=False,
                            preprocessors=self.config.preprocessors)

    y_true, y_pred, cache = [], [], {}

    for trial in getattr(protocol, f"{partition}_trial")():

        # Compute embeddings
        emb1 = self._file_embedding(trial['file1'], sequence_embedding, cache)
        emb2 = self._file_embedding(trial['file2'], sequence_embedding, cache)

        # Compare embeddings
        dist = cdist(emb1, emb2,
                     metric=self.distance.to_sklearn_metric())[0, 0]
        y_pred.append(dist)
        y_true.append(trial['reference'])

    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)

    # Returning 1 - EER because the evaluator keeps track of the highest metric value
    return 1 - eer, y_pred, y_true
# # 4th row: SCD raw scores
# notebook.plot_feature(plot_ready(scd_scores), ax=ax[3])
# ax[3].text(notebook.crop.start + 0.5, 0.3, 'SCD\nscores', fontsize=14)
# ax[3].set_ylim(-0.1, 0.6)
#
# # 5th row: SCD result
# notebook.plot_timeline(partition, ax=ax[4])
# ax[4].text(notebook.crop.start + 0.5, 0.1, 'SCD', fontsize=14)
#
# # 6th row: combination of SAD and SCD
# notebook.plot_timeline(speech_turns, ax=ax[5])
# ax[5].text(notebook.crop.start + 0.5, 0.1, 'speech turns', fontsize=14)

# initialize sequence embedding model
from pyannote.audio.embedding.extraction import SequenceEmbedding
emb = SequenceEmbedding(model=EMB_MODEL, duration=1., step=0.5)

# obtain raw embeddings (as `pyannote.core.SlidingWindowFeature` instance)
# embeddings are extracted every 500ms on 1s-long windows
embeddings = emb(test_file)

# for the purpose of this tutorial, we only work on long (> 2s) speech turns
from pyannote.core import Timeline
long_turns = Timeline(segments=[s for s in speech_turns if s.duration > 2.])


def run_speech_pipeline():
    return


def run_spectral_clusterer():
    pass
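# Continuing the tutorial snippet above: for each long speech turn one can pool the
# window-level embeddings that fall inside it into a single vector. This is a sketch
# only; it assumes `SlidingWindowFeature.crop` with mode='strict' (keep windows fully
# contained in the segment) and falls back to mode='loose' when the turn is too short
# to contain any full window.
import numpy as np

pooled = []
for segment in long_turns:
    x = embeddings.crop(segment, mode='strict')
    if len(x) < 1:
        x = embeddings.crop(segment, mode='loose')
    pooled.append(np.mean(x, axis=0))
pooled = np.vstack(pooled)   # one embedding per long speech turn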
def _validate_epoch_verification(self, epoch, protocol_name,
                                 subset='development',
                                 validation_data=None):
    """Perform a speaker verification experiment using model at `epoch`

    Parameters
    ----------
    epoch : int
        Epoch to validate.
    protocol_name : str
        Name of speaker verification protocol
    subset : {'train', 'development', 'test'}, optional
        Name of subset.
    validation_data : provided by `validate_init`

    Returns
    -------
    metrics : dict
    """

    # load current model
    model = self.load_model(epoch).to(self.device)
    model.eval()

    # use user-provided --duration when available
    # otherwise use 'duration' used for training
    if self.duration is None:
        duration = self.task_.duration
    else:
        duration = self.duration

    min_duration = None
    # if 'duration' is still None, it means that
    # network was trained with variable lengths
    if duration is None:
        duration = self.task_.max_duration
        min_duration = self.task_.min_duration

    step = .5 * duration

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_, duration=duration, step=step,
        min_duration=min_duration, batch_size=self.batch_size,
        device=self.device)

    metrics = {}
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    enrolment_models, enrolment_khashes = {}, {}
    enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
    for i, enrolment in enumerate(enrolments):
        data = sequence_embedding.apply(enrolment,
                                        crop=enrolment['enrol_with'])
        model_id = enrolment['model_id']
        model = np.mean(np.stack(data), axis=0, keepdims=True)
        enrolment_models[model_id] = model

        # in some specific speaker verification protocols,
        # enrolment data may be used later as trial data.
        # therefore, we cache information about enrolment data
        # to speed things up by reusing the enrolment as trial
        h = hash((get_unique_identifier(enrolment),
                  tuple(enrolment['enrol_with'])))
        enrolment_khashes[h] = model_id

    trial_models = {}
    trials = getattr(protocol, '{0}_trial'.format(subset))()
    y_true, y_pred = [], []
    for i, trial in enumerate(trials):
        model_id = trial['model_id']

        h = hash((get_unique_identifier(trial),
                  tuple(trial['try_with'])))

        # re-use enrolment model whenever possible
        if h in enrolment_khashes:
            model = enrolment_models[enrolment_khashes[h]]

        # re-use trial model whenever possible
        elif h in trial_models:
            model = trial_models[h]

        else:
            data = sequence_embedding.apply(trial, crop=trial['try_with'])
            model = np.mean(data, axis=0, keepdims=True)
            # cache trial model for later re-use
            trial_models[h] = model

        distance = cdist(enrolment_models[model_id], model,
                         metric=self.metric)[0, 0]
        y_pred.append(distance)
        y_true.append(trial['reference'])

    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)
    metrics['EER'] = {'minimize': True, 'value': eer}

    return metrics
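# Minimal pure-Python sketch of the enrolment/trial re-use trick above: both
# enrolment and trial data are keyed by (unique file identifier, segments used),
# so a trial that points at exactly the same audio as an enrolment re-uses the
# already-computed model instead of recomputing it. All names and values below
# are illustrative only.
def make_key(unique_identifier, segments):
    # same keying scheme as above: identifier plus the exact segments used
    return hash((unique_identifier, tuple(segments)))


toy_enrolment_models = {'spk1': [0.1, 0.2]}
toy_enrolment_khashes = {make_key('fileA', ['0-30s']): 'spk1'}

trial_key = make_key('fileA', ['0-30s'])
if trial_key in toy_enrolment_khashes:
    # re-used, not recomputed
    toy_model = toy_enrolment_models[toy_enrolment_khashes[trial_key]]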