def _validate_epoch_segment(self, epoch, protocol_name,
                            subset='development',
                            validation_data=None):
    """Validate model at `epoch` using pairwise distances between segments

    Every item of `validation_data['X']` is embedded, all pairwise
    distances between embeddings are computed with `self.metric`, and the
    error rate returned by `det_curve` is reported.

    Parameters
    ----------
    epoch : int
        Epoch whose model is loaded through `self.load_model`.
    protocol_name : str
        Not used by this method.
    subset : str, optional
        Not used by this method.
    validation_data : dict
        Must provide 'X' (input of the embedding extractor) and 'y'
        (reference passed to `det_curve`).
        NOTE(review): 'y' presumably holds one same/different label per
        pair, ordered like the output of `pdist` -- confirm against the
        code that builds `validation_data`.

    Returns
    -------
    metrics : dict
        Single entry whose key is built from `self.duration`
        ('EER.<duration>s') and whose value is
        {'minimize': True, 'value': eer}.
    """
    network = self.load_model(epoch).to(self.device)
    network.eval()

    extractor = SequenceEmbedding(network,
                                  self.feature_extraction_,
                                  batch_size=self.batch_size,
                                  device=self.device)
    embeddings = extractor.apply(validation_data['X'])

    # distance between every pair of embeddings
    pairwise = pdist(embeddings, metric=self.metric)

    # only the last of the four values returned by det_curve is reported
    _, _, _, eer = det_curve(validation_data['y'], pairwise,
                             distances=True)

    metric_name = 'EER.{0:g}s'.format(self.duration)
    return {metric_name: {'minimize': True, 'value': eer}}
def _validate_epoch_turn(self, epoch, protocol_name,
                         subset='development',
                         validation_data=None):
    """Validate model at `epoch` using one averaged embedding per speech turn

    Segment embeddings are averaged speech turn by speech turn, then all
    pairwise distances between the averaged embeddings are computed with
    `self.metric` and the error rate returned by `det_curve` is reported.

    Parameters
    ----------
    epoch : int
        Epoch whose model is loaded through `self.load_model`.
    protocol_name : str
        Not used by this method.
    subset : str, optional
        Not used by this method.
    validation_data : dict
        Must provide 'X' (input of the embedding extractor), 'z' (index
        of the speech turn each segment belongs to) and 'y' (reference
        passed to `det_curve`).
        NOTE(review): segments of a given speech turn must be contiguous
        in 'z', because `itertools.groupby` only merges consecutive
        items -- confirm against the code that builds `validation_data`.

    Returns
    -------
    metrics : dict
        {'EER.turn': {'minimize': True, 'value': eer}}
    """
    model = self.load_model(epoch).to(self.device)
    model.eval()

    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_,
        batch_size=self.batch_size, device=self.device)

    fX = sequence_embedding.apply(validation_data['X'])
    z = validation_data['z']

    # (len(z), 2) array where
    # * dim #0 is the index of segment in original batch
    # * dim #1 is the index of speech turn (used for grouping)
    nz = np.vstack([np.arange(len(z)), z]).T

    # iterate over segments, speech turn by speech turn
    fX_avg = []
    for _, turn_rows in itertools.groupby(nz, lambda t: t[1]):

        # groupby hands out a one-shot iterator, while np.stack only
        # accepts real sequences: materialize the rows of the current
        # speech turn before stacking them into a (n, 2) array
        turn_rows = np.stack(list(turn_rows))

        # compute (and stack) average embedding over all segments
        # of current speech turn
        indices = turn_rows[:, 0]
        fX_avg.append(np.mean(fX[indices], axis=0))

    fX = np.vstack(fX_avg)
    y_pred = pdist(fX, metric=self.metric)

    # only the last of the four values returned by det_curve is reported
    _, _, _, eer = det_curve(validation_data['y'], y_pred,
                             distances=True)

    metrics = {}
    metrics['EER.turn'] = {'minimize': True, 'value': eer}
    return metrics
def apply(self, protocol_name, output_dir, step=None):
    """Extract embeddings for every file of a protocol and dump them

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    output_dir : str
        Root directory of the `Precomputed` instance used to dump
        the embeddings.
    step : float, optional
        Step handed to `SequenceEmbedding`.
        Defaults to 25% of `self.duration`.
    """
    network = self.model_.to(self.device)
    network.eval()

    window_duration = self.duration
    window_step = 0.25 * window_duration if step is None else step

    # do not use memmap as this would lead to too many open files
    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    extractor = SequenceEmbedding(network,
                                  self.feature_extraction_,
                                  duration=window_duration,
                                  step=window_step,
                                  batch_size=self.batch_size,
                                  device=self.device)

    # create metadata file at root that contains
    # sliding window and dimension information
    writer = Precomputed(root_dir=output_dir,
                         sliding_window=extractor.sliding_window,
                         dimension=extractor.dimension)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)
    files = FileFinder.protocol_file_iter(protocol, extra_keys=['audio'])

    for current_file in files:
        writer.dump(current_file, extractor.apply(current_file))
def _validate_epoch_turn(self, epoch, protocol_name,
                         subset='development',
                         validation_data=None):
    """Validate model at `epoch` using one averaged embedding per speech turn

    Segment embeddings are averaged speech turn by speech turn, then all
    pairwise distances between the averaged embeddings are computed with
    `self.metric` and the error rate returned by `det_curve` is reported.

    Parameters
    ----------
    epoch : int
        Epoch whose model is loaded through `self.load_model`.
    protocol_name : str
        Not used by this method.
    subset : str, optional
        Not used by this method.
    validation_data : dict
        Must provide 'X' (input of the embedding extractor), 'z' (index
        of the speech turn each segment belongs to) and 'y' (reference
        passed to `det_curve`).
        NOTE(review): segments of a given speech turn must be contiguous
        in 'z', because `itertools.groupby` only merges consecutive
        items -- confirm against the code that builds `validation_data`.

    Returns
    -------
    metrics : dict
        {'EER.turn': {'minimize': True, 'value': eer}}
    """
    model = self.load_model(epoch).to(self.device)
    model.eval()

    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_,
        batch_size=self.batch_size, device=self.device)

    fX = sequence_embedding.apply(validation_data['X'])
    z = validation_data['z']

    # (len(z), 2) array where
    # * dim #0 is the index of segment in original batch
    # * dim #1 is the index of speech turn (used for grouping)
    nz = np.vstack([np.arange(len(z)), z]).T

    # iterate over segments, speech turn by speech turn
    fX_avg = []
    for _, turn_rows in itertools.groupby(nz, lambda t: t[1]):

        # groupby hands out a one-shot iterator, while np.stack only
        # accepts real sequences: materialize the rows of the current
        # speech turn before stacking them into a (n, 2) array
        turn_rows = np.stack(list(turn_rows))

        # compute (and stack) average embedding over all segments
        # of current speech turn
        indices = turn_rows[:, 0]
        fX_avg.append(np.mean(fX[indices], axis=0))

    fX = np.vstack(fX_avg)
    y_pred = pdist(fX, metric=self.metric)

    # only the last of the four values returned by det_curve is reported
    _, _, _, eer = det_curve(validation_data['y'], y_pred,
                             distances=True)

    metrics = {}
    metrics['EER.turn'] = {'minimize': True, 'value': eer}
    return metrics
def _validate_epoch_segment(self, epoch, protocol_name,
                            subset='development',
                            validation_data=None):
    """Validate model at `epoch` using pairwise distances between segments

    Every item of `validation_data['X']` is embedded, all pairwise
    distances between embeddings are computed with `self.metric`, and the
    error rate returned by `det_curve` is reported.

    Parameters
    ----------
    epoch : int
        Epoch whose model is loaded through `self.load_model`.
    protocol_name : str
        Not used by this method.
    subset : str, optional
        Not used by this method.
    validation_data : dict
        Must provide 'X' (input of the embedding extractor) and 'y'
        (reference passed to `det_curve`).
        NOTE(review): 'y' presumably holds one same/different label per
        pair, ordered like the output of `pdist` -- confirm against the
        code that builds `validation_data`.

    Returns
    -------
    metrics : dict
        Single entry whose key is built from `self.duration`
        ('EER.<duration>s') and whose value is
        {'minimize': True, 'value': eer}.
    """
    network = self.load_model(epoch).to(self.device)
    network.eval()

    extractor = SequenceEmbedding(network,
                                  self.feature_extraction_,
                                  batch_size=self.batch_size,
                                  device=self.device)
    embeddings = extractor.apply(validation_data['X'])

    # distance between every pair of embeddings
    pairwise = pdist(embeddings, metric=self.metric)

    # only the last of the four values returned by det_curve is reported
    _, _, _, eer = det_curve(validation_data['y'], pairwise,
                             distances=True)

    metric_name = 'EER.{0:g}s'.format(self.duration)
    return {metric_name: {'minimize': True, 'value': eer}}
def apply(self, protocol_name, output_dir, step=None):
    """Extract embeddings for every file of a protocol and dump them

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    output_dir : str
        Root directory of the `Precomputed` instance used to dump
        the embeddings.
    step : float, optional
        Step handed to `SequenceEmbedding`.
        Defaults to 25% of `self.duration`.
    """
    network = self.model_.to(self.device)
    network.eval()

    window_duration = self.duration
    window_step = 0.25 * window_duration if step is None else step

    # do not use memmap as this would lead to too many open files
    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    extractor = SequenceEmbedding(network,
                                  self.feature_extraction_,
                                  duration=window_duration,
                                  step=window_step,
                                  batch_size=self.batch_size,
                                  device=self.device)

    # create metadata file at root that contains
    # sliding window and dimension information
    writer = Precomputed(root_dir=output_dir,
                         sliding_window=extractor.sliding_window,
                         dimension=extractor.dimension)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)
    files = FileFinder.protocol_file_iter(protocol, extra_keys=['audio'])

    for current_file in files:
        writer.dump(current_file, extractor.apply(current_file))
def _validate_epoch_verification(self, epoch, protocol_name,
                                 subset='development',
                                 validation_data=None):
    """Perform a speaker verification experiment using model at `epoch`

    Each enrolment is turned into one averaged embedding, each trial is
    turned into one averaged embedding, and the distance between the two
    (computed with `self.metric`) is used as the trial score.

    Parameters
    ----------
    epoch : int
        Epoch to validate.
    protocol_name : str
        Name of speaker verification protocol
    subset : {'train', 'development', 'test'}, optional
        Name of subset.
    validation_data : provided by `validate_init`
        Not used by this method.

    Returns
    -------
    metrics : dict
        {'EER': {'minimize': True, 'value': eer}}
    """

    # load current model
    model = self.load_model(epoch).to(self.device)
    model.eval()

    # use user-provided --duration when available
    # otherwise use 'duration' used for training
    if self.duration is None:
        duration = self.task_.duration
    else:
        duration = self.duration
    min_duration = None

    # if 'duration' is still None, it means that
    # network was trained with variable lengths
    if duration is None:
        duration = self.task_.max_duration
        min_duration = self.task_.min_duration

    step = .5 * duration

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_, duration=duration,
        step=step, min_duration=min_duration,
        batch_size=self.batch_size, device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # enrolment_models: model_id -> averaged enrolment embedding
    # enrolment_keys: (file identifier, enrolment segments) -> model_id
    enrolment_models, enrolment_keys = {}, {}
    enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
    for enrolment in enrolments:
        data = sequence_embedding.apply(enrolment,
                                        crop=enrolment['enrol_with'])
        model_id = enrolment['model_id']
        enrolment_models[model_id] = np.mean(np.stack(data), axis=0,
                                             keepdims=True)

        # in some specific speaker verification protocols,
        # enrolment data may be used later as trial data.
        # therefore, we cache information about enrolment data
        # to speed things up by reusing the enrolment as trial.
        # the tuple itself is the key (rather than its hash) so that two
        # different entries can never be mistaken for one another
        key = (get_unique_identifier(enrolment),
               tuple(enrolment['enrol_with']))
        enrolment_keys[key] = model_id

    # (file identifier, trial segments) -> averaged trial embedding
    trial_embeddings = {}
    trials = getattr(protocol, '{0}_trial'.format(subset))()
    y_true, y_pred = [], []
    for trial in trials:
        model_id = trial['model_id']
        key = (get_unique_identifier(trial), tuple(trial['try_with']))

        # re-use enrolment embedding whenever possible
        if key in enrolment_keys:
            embedding = enrolment_models[enrolment_keys[key]]

        # re-use trial embedding whenever possible
        elif key in trial_embeddings:
            embedding = trial_embeddings[key]

        else:
            data = sequence_embedding.apply(trial, crop=trial['try_with'])
            embedding = np.mean(data, axis=0, keepdims=True)
            # cache trial embedding for later re-use
            trial_embeddings[key] = embedding

        # both operands have a single row (keepdims=True above),
        # hence the single distance at [0, 0]
        distance = cdist(enrolment_models[model_id], embedding,
                         metric=self.metric)[0, 0]
        y_pred.append(distance)
        y_true.append(trial['reference'])

    # only the last of the four values returned by det_curve is reported
    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)

    metrics = {}
    metrics['EER'] = {'minimize': True, 'value': eer}
    return metrics
def _validate_epoch_verification(self, epoch, protocol_name,
                                 subset='development',
                                 validation_data=None):
    """Perform a speaker verification experiment using model at `epoch`

    Each enrolment is turned into one averaged embedding, each trial is
    turned into one averaged embedding, and the distance between the two
    (computed with `self.metric`) is used as the trial score.

    Parameters
    ----------
    epoch : int
        Epoch to validate.
    protocol_name : str
        Name of speaker verification protocol
    subset : {'train', 'development', 'test'}, optional
        Name of subset.
    validation_data : provided by `validate_init`
        Not used by this method.

    Returns
    -------
    metrics : dict
        {'EER': {'minimize': True, 'value': eer}}
    """

    # load current model
    model = self.load_model(epoch).to(self.device)
    model.eval()

    # use user-provided --duration when available
    # otherwise use 'duration' used for training
    if self.duration is None:
        duration = self.task_.duration
    else:
        duration = self.duration
    min_duration = None

    # if 'duration' is still None, it means that
    # network was trained with variable lengths
    if duration is None:
        duration = self.task_.max_duration
        min_duration = self.task_.min_duration

    step = .5 * duration

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_, duration=duration,
        step=step, min_duration=min_duration,
        batch_size=self.batch_size, device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # enrolment_models: model_id -> averaged enrolment embedding
    # enrolment_keys: (file identifier, enrolment segments) -> model_id
    enrolment_models, enrolment_keys = {}, {}
    enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
    for enrolment in enrolments:
        data = sequence_embedding.apply(enrolment,
                                        crop=enrolment['enrol_with'])
        model_id = enrolment['model_id']
        enrolment_models[model_id] = np.mean(np.stack(data), axis=0,
                                             keepdims=True)

        # in some specific speaker verification protocols,
        # enrolment data may be used later as trial data.
        # therefore, we cache information about enrolment data
        # to speed things up by reusing the enrolment as trial.
        # the tuple itself is the key (rather than its hash) so that two
        # different entries can never be mistaken for one another
        key = (get_unique_identifier(enrolment),
               tuple(enrolment['enrol_with']))
        enrolment_keys[key] = model_id

    # (file identifier, trial segments) -> averaged trial embedding
    trial_embeddings = {}
    trials = getattr(protocol, '{0}_trial'.format(subset))()
    y_true, y_pred = [], []
    for trial in trials:
        model_id = trial['model_id']
        key = (get_unique_identifier(trial), tuple(trial['try_with']))

        # re-use enrolment embedding whenever possible
        if key in enrolment_keys:
            embedding = enrolment_models[enrolment_keys[key]]

        # re-use trial embedding whenever possible
        elif key in trial_embeddings:
            embedding = trial_embeddings[key]

        else:
            data = sequence_embedding.apply(trial, crop=trial['try_with'])
            embedding = np.mean(data, axis=0, keepdims=True)
            # cache trial embedding for later re-use
            trial_embeddings[key] = embedding

        # both operands have a single row (keepdims=True above),
        # hence the single distance at [0, 0]
        distance = cdist(enrolment_models[model_id], embedding,
                         metric=self.metric)[0, 0]
        y_pred.append(distance)
        y_true.append(trial['reference'])

    # only the last of the four values returned by det_curve is reported
    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)

    metrics = {}
    metrics['EER'] = {'minimize': True, 'value': eer}
    return metrics