Exemplo n.º 1
0
 def __init__(self, sad_model_path, scd_model_path, emb_model_path):
     self.sad = SequenceLabeling(model=sad_model_path)
     self.scd = SequenceLabeling(model=scd_model_path)
     self.emb = SequenceEmbedding(model=emb_model_path,
                                  duration=1.,
                                  step=0.2)
     self.fa = None
    def _validate_epoch_segment(self, epoch, protocol_name,
                                subset='development',
                                validation_data=None):

        model = self.load_model(epoch).to(self.device)
        model.eval()

        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_,
            batch_size=self.batch_size, device=self.device)


        fX = sequence_embedding.apply(validation_data['X'])
        y_pred = pdist(fX, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], y_pred,
                                 distances=True)

        return {'EER.{0:g}s'.format(self.duration): {'minimize': True,
                                                'value': eer}}
    def _validate_epoch_turn(self,
                             epoch,
                             protocol_name,
                             subset='development',
                             validation_data=None):

        model = self.load_model(epoch).to(self.device)
        model.eval()

        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               batch_size=self.batch_size,
                                               device=self.device)

        fX = sequence_embedding.apply(validation_data['X'])

        z = validation_data['z']

        # iterate over segments, speech turn by speech turn

        fX_avg = []
        nz = np.vstack([np.arange(len(z)), z]).T
        for _, nz_ in itertools.groupby(nz, lambda t: t[1]):

            # (n, 2) numpy array where
            # * n is the number of segments in current speech turn
            # * dim #0 is the index of segment in original batch
            # * dim #1 is the index of speech turn (used for grouping)
            nz_ = np.stack(nz_)

            # compute (and stack) average embedding over all segments
            # of current speech turn
            indices = nz_[:, 0]

            fX_avg.append(np.mean(fX[indices], axis=0))

        fX = np.vstack(fX_avg)
        y_pred = pdist(fX, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], y_pred, distances=True)
        metrics = {}
        metrics['EER.turn'] = {'minimize': True, 'value': eer}
        return metrics
Exemplo n.º 4
0
 def _file_embedding(self, file_dict: dict,
                     sequence_embedding: SequenceEmbedding, cache: dict):
     file1 = file_dict
     f_hash = self.get_hash(file1)
     if f_hash in cache:
         emb = cache[f_hash]
     else:
         emb = sequence_embedding.crop(file1, file1['try_with'])
         emb = np.mean(np.stack(emb), axis=0, keepdims=True)
         cache[f_hash] = emb
     return emb
    def apply(self, protocol_name, output_dir, step=None):

        model = self.model_.to(self.device)
        model.eval()

        duration = self.duration
        if step is None:
            step = 0.25 * duration

        # do not use memmap as this would lead to too many open files
        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               duration=duration,
                                               step=step,
                                               batch_size=self.batch_size,
                                               device=self.device)
        sliding_window = sequence_embedding.sliding_window
        dimension = sequence_embedding.dimension

        # create metadata file at root that contains
        # sliding window and dimension information
        precomputed = Precomputed(root_dir=output_dir,
                                  sliding_window=sliding_window,
                                  dimension=dimension)

        # file generator
        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)

        for current_file in FileFinder.protocol_file_iter(protocol,
                                                          extra_keys=['audio'
                                                                      ]):

            fX = sequence_embedding.apply(current_file)
            precomputed.dump(current_file, fX)
    def _validate_epoch_turn(self, epoch, protocol_name,
                             subset='development',
                             validation_data=None):

        model = self.load_model(epoch).to(self.device)
        model.eval()

        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_,
            batch_size=self.batch_size, device=self.device)

        fX = sequence_embedding.apply(validation_data['X'])

        z = validation_data['z']

        # iterate over segments, speech turn by speech turn

        fX_avg = []
        nz = np.vstack([np.arange(len(z)), z]).T
        for _, nz_ in itertools.groupby(nz, lambda t: t[1]):

            # (n, 2) numpy array where
            # * n is the number of segments in current speech turn
            # * dim #0 is the index of segment in original batch
            # * dim #1 is the index of speech turn (used for grouping)
            nz_ = np.stack(nz_)

            # compute (and stack) average embedding over all segments
            # of current speech turn
            indices = nz_[:, 0]

            fX_avg.append(np.mean(fX[indices], axis=0))

        fX = np.vstack(fX_avg)
        y_pred = pdist(fX, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], y_pred,
                                 distances=True)
        metrics = {}
        metrics['EER.turn'] = {'minimize': True, 'value': eer}
        return metrics
    def _validate_epoch_segment(self,
                                epoch,
                                protocol_name,
                                subset='development',
                                validation_data=None):

        model = self.load_model(epoch).to(self.device)
        model.eval()

        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               batch_size=self.batch_size,
                                               device=self.device)

        fX = sequence_embedding.apply(validation_data['X'])
        y_pred = pdist(fX, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], y_pred, distances=True)

        return {
            'EER.{0:g}s'.format(self.duration): {
                'minimize': True,
                'value': eer
            }
        }
    def apply(self, protocol_name, output_dir, step=None):

        model = self.model_.to(self.device)
        model.eval()

        duration = self.duration
        if step is None:
            step = 0.25 * duration

        # do not use memmap as this would lead to too many open files
        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_, duration=duration,
            step=step, batch_size=self.batch_size, device=self.device)
        sliding_window = sequence_embedding.sliding_window
        dimension = sequence_embedding.dimension

        # create metadata file at root that contains
        # sliding window and dimension information
        precomputed = Precomputed(
            root_dir=output_dir,
            sliding_window=sliding_window,
            dimension=dimension)

        # file generator
        protocol = get_protocol(protocol_name, progress=True,
                                preprocessors=self.preprocessors_)

        for current_file in FileFinder.protocol_file_iter(
            protocol, extra_keys=['audio']):

            fX = sequence_embedding.apply(current_file)
            precomputed.dump(current_file, fX)
Exemplo n.º 9
0
    def eval(self, model, partition: str = 'development'):
        model.eval()
        sequence_embedding = SequenceEmbedding(
            model=model,
            feature_extraction=self.config.feature_extraction,
            duration=self.config.duration,
            step=.5 * self.config.duration,
            batch_size=self.batch_size,
            device=common.DEVICE)
        protocol = get_protocol(self.config.protocol_name,
                                progress=False,
                                preprocessors=self.config.preprocessors)

        y_true, y_pred, cache = [], [], {}

        for trial in getattr(protocol, f"{partition}_trial")():

            # Compute embeddings
            emb1 = self._file_embedding(trial['file1'], sequence_embedding,
                                        cache)
            emb2 = self._file_embedding(trial['file2'], sequence_embedding,
                                        cache)

            # Compare embeddings
            dist = cdist(emb1, emb2,
                         metric=self.distance.to_sklearn_metric())[0, 0]

            y_pred.append(dist)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true),
                                 np.array(y_pred),
                                 distances=True)

        # Returning 1-eer because the evaluator keeps track of the highest metric value
        return 1 - eer, y_pred, y_true
Exemplo n.º 10
0
# # 4th row: SCD raw scores
# notebook.plot_feature(plot_ready(scd_scores), ax=ax[3])
# ax[3].text(notebook.crop.start + 0.5, 0.3, 'SCD\nscores', fontsize=14)
# ax[3].set_ylim(-0.1, 0.6)
#
# # 5th row: SCD result
# notebook.plot_timeline(partition, ax=ax[4])
# ax[4].text(notebook.crop.start + 0.5, 0.1, 'SCD', fontsize=14)
#
# # 6th row: combination of SAD and SCD
# notebook.plot_timeline(speech_turns, ax=ax[5])
# ax[5].text(notebook.crop.start + 0.5, 0.1, 'speech turns', fontsize=14)

# initialize sequence embedding model
from pyannote.audio.embedding.extraction import SequenceEmbedding
emb = SequenceEmbedding(model=EMB_MODEL, duration=1., step=0.5)

# obtain raw embeddings (as `pyannote.core.SlidingWindowFeature` instance)
# embeddings are extracted every 500ms on 1s-long windows
embeddings = emb(test_file)

# for the purpose of this tutorial, we only work of long (> 2s) speech turns
from pyannote.core import Timeline
long_turns = Timeline(segments=[s for s in speech_turns if s.duration > 2.])


def run_speech_pipeline():
    return


def run_spectral_clusterer():
Exemplo n.º 11
0
    def _validate_epoch_verification(self,
                                     epoch,
                                     protocol_name,
                                     subset='development',
                                     validation_data=None):
        """Perform a speaker verification experiment using model at `epoch`

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of speaker verification protocol
        subset : {'train', 'development', 'test'}, optional
            Name of subset.
        validation_data : provided by `validate_init`

        Returns
        -------
        metrics : dict
        """

        # load current model
        model = self.load_model(epoch).to(self.device)
        model.eval()

        # use user-provided --duration when available
        # otherwise use 'duration' used for training
        if self.duration is None:
            duration = self.task_.duration
        else:
            duration = self.duration
        min_duration = None

        # if 'duration' is still None, it means that
        # network was trained with variable lengths
        if duration is None:
            duration = self.task_.max_duration
            min_duration = self.task_.min_duration

        step = .5 * duration

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               duration=duration,
                                               step=step,
                                               min_duration=min_duration,
                                               batch_size=self.batch_size,
                                               device=self.device)

        metrics = {}
        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)

        enrolment_models, enrolment_khashes = {}, {}
        enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
        for i, enrolment in enumerate(enrolments):
            data = sequence_embedding.apply(enrolment,
                                            crop=enrolment['enrol_with'])
            model_id = enrolment['model_id']
            model = np.mean(np.stack(data), axis=0, keepdims=True)
            enrolment_models[model_id] = model

            # in some specific speaker verification protocols,
            # enrolment data may be  used later as trial data.
            # therefore, we cache information about enrolment data
            # to speed things up by reusing the enrolment as trial
            h = hash((get_unique_identifier(enrolment),
                      tuple(enrolment['enrol_with'])))
            enrolment_khashes[h] = model_id

        trial_models = {}
        trials = getattr(protocol, '{0}_trial'.format(subset))()
        y_true, y_pred = [], []
        for i, trial in enumerate(trials):
            model_id = trial['model_id']

            h = hash((get_unique_identifier(trial), tuple(trial['try_with'])))

            # re-use enrolment model whenever possible
            if h in enrolment_khashes:
                model = enrolment_models[enrolment_khashes[h]]

            # re-use trial model whenever possible
            elif h in trial_models:
                model = trial_models[h]

            else:
                data = sequence_embedding.apply(trial, crop=trial['try_with'])
                model = np.mean(data, axis=0, keepdims=True)
                # cache trial model for later re-use
                trial_models[h] = model

            distance = cdist(enrolment_models[model_id],
                             model,
                             metric=self.metric)[0, 0]
            y_pred.append(distance)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true),
                                 np.array(y_pred),
                                 distances=True)
        metrics['EER'] = {'minimize': True, 'value': eer}

        return metrics
    def _validate_epoch_verification(self, epoch, protocol_name,
                                     subset='development',
                                     validation_data=None):
        """Perform a speaker verification experiment using model at `epoch`

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of speaker verification protocol
        subset : {'train', 'development', 'test'}, optional
            Name of subset.
        validation_data : provided by `validate_init`

        Returns
        -------
        metrics : dict
        """


        # load current model
        model = self.load_model(epoch).to(self.device)
        model.eval()

        # use user-provided --duration when available
        # otherwise use 'duration' used for training
        if self.duration is None:
            duration = self.task_.duration
        else:
            duration = self.duration
        min_duration = None

        # if 'duration' is still None, it means that
        # network was trained with variable lengths
        if duration is None:
            duration = self.task_.max_duration
            min_duration = self.task_.min_duration

        step = .5 * duration

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_, duration=duration,
            step=step, min_duration=min_duration,
            batch_size=self.batch_size, device=self.device)

        metrics = {}
        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        enrolment_models, enrolment_khashes = {}, {}
        enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
        for i, enrolment in enumerate(enrolments):
            data = sequence_embedding.apply(enrolment,
                                            crop=enrolment['enrol_with'])
            model_id = enrolment['model_id']
            model = np.mean(np.stack(data), axis=0, keepdims=True)
            enrolment_models[model_id] = model

            # in some specific speaker verification protocols,
            # enrolment data may be  used later as trial data.
            # therefore, we cache information about enrolment data
            # to speed things up by reusing the enrolment as trial
            h = hash((get_unique_identifier(enrolment),
                      tuple(enrolment['enrol_with'])))
            enrolment_khashes[h] = model_id

        trial_models = {}
        trials = getattr(protocol, '{0}_trial'.format(subset))()
        y_true, y_pred = [], []
        for i, trial in enumerate(trials):
            model_id = trial['model_id']

            h = hash((get_unique_identifier(trial),
                      tuple(trial['try_with'])))

            # re-use enrolment model whenever possible
            if h in enrolment_khashes:
                model = enrolment_models[enrolment_khashes[h]]

            # re-use trial model whenever possible
            elif h in trial_models:
                model = trial_models[h]

            else:
                data = sequence_embedding.apply(trial, crop=trial['try_with'])
                model = np.mean(data, axis=0, keepdims=True)
                # cache trial model for later re-use
                trial_models[h] = model

            distance = cdist(enrolment_models[model_id], model,
                             metric=self.metric)[0, 0]
            y_pred.append(distance)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                                 distances=True)
        metrics['EER'] = {'minimize': True, 'value': eer}

        return metrics