Python SequenceEmbedding.apply 예제들, pyannote.audio.embedding.extraction.SequenceEmbedding.apply Python 예제들

예제 #1

0

파일 보기

파일: speaker_embedding.py 프로젝트: instinct2k18/pyannote-audio

    def _validate_epoch_segment(self, epoch, protocol_name,
                                subset='development',
                                validation_data=None):
        """Compute a segment-level equal error rate for the model at `epoch`.

        Parameters
        ----------
        epoch : int
            Epoch handed to `self.load_model`.
        protocol_name : str
            Not used by this method.
        subset : str, optional
            Not used by this method.
        validation_data : dict
            Must provide 'X' (fed to `SequenceEmbedding.apply`) and 'y'
            (fed to `det_curve` as reference). The default `None` is not
            usable here since the value is subscripted unconditionally.

        Returns
        -------
        metrics : dict
            One entry, keyed 'EER.<duration>s', whose value is
            {'minimize': True, 'value': eer}.
        """

        network = self.load_model(epoch).to(self.device)
        network.eval()

        extractor = SequenceEmbedding(
            network, self.feature_extraction_,
            batch_size=self.batch_size, device=self.device)
        embeddings = extractor.apply(validation_data['X'])

        # distances between embeddings, scored against validation_data['y']
        pairwise = pdist(embeddings, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], pairwise,
                                 distances=True)

        metric_name = 'EER.{0:g}s'.format(self.duration)
        return {metric_name: {'minimize': True, 'value': eer}}

예제 #2

0

파일 보기

파일: speaker_embedding.py 프로젝트: yining4869/pyannote-audio

    def _validate_epoch_turn(self,
                             epoch,
                             protocol_name,
                             subset='development',
                             validation_data=None):
        """Compute a speech-turn-level equal error rate for model at `epoch`.

        Segment embeddings are averaged over each run of consecutive
        segments sharing the same value in `validation_data['z']`, and the
        averaged embeddings are then compared pairwise.

        Parameters
        ----------
        epoch : int
            Epoch handed to `self.load_model`.
        protocol_name : str
            Not used by this method.
        subset : str, optional
            Not used by this method.
        validation_data : dict
            Must provide 'X' (fed to `SequenceEmbedding.apply`), 'z' (one
            speech turn index per segment, used for grouping) and 'y' (fed
            to `det_curve` as reference). The default `None` is not usable
            here since the value is subscripted unconditionally.

        Returns
        -------
        metrics : dict
            {'EER.turn': {'minimize': True, 'value': eer}}
        """

        model = self.load_model(epoch).to(self.device)
        model.eval()

        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               batch_size=self.batch_size,
                                               device=self.device)

        fX = sequence_embedding.apply(validation_data['X'])

        z = validation_data['z']

        # iterate over segments, speech turn by speech turn.
        # groupby only merges *consecutive* equal keys, so segments of a
        # given speech turn are expected to be contiguous in `z`.
        fX_avg = []
        turns = itertools.groupby(enumerate(z), key=lambda pair: pair[1])
        for _, turn in turns:

            # positions (in the original batch) of the segments of the
            # current speech turn. The lazy group iterator is materialized
            # as a list: handing it directly to np.stack is rejected by
            # recent NumPy versions.
            indices = [index for index, _ in turn]

            # compute (and stack) average embedding over all segments
            # of current speech turn
            fX_avg.append(np.mean(fX[indices], axis=0))

        fX = np.vstack(fX_avg)
        y_pred = pdist(fX, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], y_pred, distances=True)
        return {'EER.turn': {'minimize': True, 'value': eer}}

예제 #3

0

파일 보기

파일: speaker_embedding.py 프로젝트: yining4869/pyannote-audio

    def apply(self, protocol_name, output_dir, step=None):
        """Extract embeddings for each file of a protocol and dump them.

        Parameters
        ----------
        protocol_name : str
            Name handed to `get_protocol`.
        output_dir : str
            Root directory handed to `Precomputed`.
        step : float, optional
            Step handed to `SequenceEmbedding`.
            Defaults to 25% of `self.duration`.

        Returns
        -------
        None
            Embeddings are written through `Precomputed.dump`.
        """

        network = self.model_.to(self.device)
        network.eval()

        window_duration = self.duration
        window_step = 0.25 * window_duration if step is None else step

        # memmap is disabled because it would lead to too many open files
        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # embedding extractor
        extractor = SequenceEmbedding(network,
                                      self.feature_extraction_,
                                      duration=window_duration,
                                      step=window_step,
                                      batch_size=self.batch_size,
                                      device=self.device)

        # writer rooted at output_dir; it is given the sliding window
        # and dimension so that the metadata file at root contains them
        writer = Precomputed(root_dir=output_dir,
                             sliding_window=extractor.sliding_window,
                             dimension=extractor.dimension)

        # file generator
        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)
        files = FileFinder.protocol_file_iter(protocol, extra_keys=['audio'])

        for current_file in files:
            writer.dump(current_file, extractor.apply(current_file))

예제 #4

0

파일 보기

파일: speaker_embedding.py 프로젝트: instinct2k18/pyannote-audio

    def _validate_epoch_turn(self, epoch, protocol_name,
                             subset='development',
                             validation_data=None):
        """Compute a speech-turn-level equal error rate for model at `epoch`.

        Segment embeddings are averaged over each run of consecutive
        segments sharing the same value in `validation_data['z']`, and the
        averaged embeddings are then compared pairwise.

        Parameters
        ----------
        epoch : int
            Epoch handed to `self.load_model`.
        protocol_name : str
            Not used by this method.
        subset : str, optional
            Not used by this method.
        validation_data : dict
            Must provide 'X' (fed to `SequenceEmbedding.apply`), 'z' (one
            speech turn index per segment, used for grouping) and 'y' (fed
            to `det_curve` as reference). The default `None` is not usable
            here since the value is subscripted unconditionally.

        Returns
        -------
        metrics : dict
            {'EER.turn': {'minimize': True, 'value': eer}}
        """

        model = self.load_model(epoch).to(self.device)
        model.eval()

        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_,
            batch_size=self.batch_size, device=self.device)

        fX = sequence_embedding.apply(validation_data['X'])

        z = validation_data['z']

        # iterate over segments, speech turn by speech turn.
        # groupby only merges *consecutive* equal keys, so segments of a
        # given speech turn are expected to be contiguous in `z`.
        fX_avg = []
        turns = itertools.groupby(enumerate(z), key=lambda pair: pair[1])
        for _, turn in turns:

            # positions (in the original batch) of the segments of the
            # current speech turn. The lazy group iterator is materialized
            # as a list: handing it directly to np.stack is rejected by
            # recent NumPy versions.
            indices = [index for index, _ in turn]

            # compute (and stack) average embedding over all segments
            # of current speech turn
            fX_avg.append(np.mean(fX[indices], axis=0))

        fX = np.vstack(fX_avg)
        y_pred = pdist(fX, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], y_pred,
                                 distances=True)
        return {'EER.turn': {'minimize': True, 'value': eer}}

예제 #5

0

파일 보기

파일: speaker_embedding.py 프로젝트: yining4869/pyannote-audio

    def _validate_epoch_segment(self,
                                epoch,
                                protocol_name,
                                subset='development',
                                validation_data=None):
        """Compute a segment-level equal error rate for the model at `epoch`.

        Parameters
        ----------
        epoch : int
            Epoch handed to `self.load_model`.
        protocol_name : str
            Not used by this method.
        subset : str, optional
            Not used by this method.
        validation_data : dict
            Must provide 'X' (fed to `SequenceEmbedding.apply`) and 'y'
            (fed to `det_curve` as reference). The default `None` is not
            usable here since the value is subscripted unconditionally.

        Returns
        -------
        metrics : dict
            One entry, keyed 'EER.<duration>s', whose value is
            {'minimize': True, 'value': eer}.
        """

        network = self.load_model(epoch).to(self.device)
        network.eval()

        extractor = SequenceEmbedding(
            network, self.feature_extraction_,
            batch_size=self.batch_size, device=self.device)
        embeddings = extractor.apply(validation_data['X'])

        # distances between embeddings, scored against validation_data['y']
        pairwise = pdist(embeddings, metric=self.metric)
        _, _, _, eer = det_curve(validation_data['y'], pairwise,
                                 distances=True)

        summary = {'minimize': True, 'value': eer}
        return {'EER.{0:g}s'.format(self.duration): summary}

예제 #6

0

파일 보기

파일: speaker_embedding.py 프로젝트: instinct2k18/pyannote-audio

    def apply(self, protocol_name, output_dir, step=None):
        """Extract embeddings for each file of a protocol and dump them.

        Parameters
        ----------
        protocol_name : str
            Name handed to `get_protocol`.
        output_dir : str
            Root directory handed to `Precomputed`.
        step : float, optional
            Step handed to `SequenceEmbedding`.
            Defaults to 25% of `self.duration`.

        Returns
        -------
        None
            Embeddings are written through `Precomputed.dump`.
        """

        network = self.model_.to(self.device)
        network.eval()

        window_duration = self.duration
        window_step = 0.25 * window_duration if step is None else step

        # memmap is disabled because it would lead to too many open files
        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # embedding extractor
        extractor = SequenceEmbedding(network,
                                      self.feature_extraction_,
                                      duration=window_duration,
                                      step=window_step,
                                      batch_size=self.batch_size,
                                      device=self.device)

        # writer rooted at output_dir; it is given the sliding window
        # and dimension so that the metadata file at root contains them
        writer = Precomputed(root_dir=output_dir,
                             sliding_window=extractor.sliding_window,
                             dimension=extractor.dimension)

        # file generator
        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)
        files = FileFinder.protocol_file_iter(protocol, extra_keys=['audio'])

        for current_file in files:
            writer.dump(current_file, extractor.apply(current_file))

예제 #7

0

파일 보기

파일: speaker_embedding.py 프로젝트: yining4869/pyannote-audio

    def _validate_epoch_verification(self,
                                     epoch,
                                     protocol_name,
                                     subset='development',
                                     validation_data=None):
        """Perform a speaker verification experiment using model at `epoch`

        One averaged embedding is computed per enrolment and per trial. Each
        trial is scored by the distance between its embedding and the
        embedding of the enrolled model it targets.

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of speaker verification protocol
        subset : {'train', 'development', 'test'}, optional
            Name of subset. Selects the `<subset>_enrolment` and
            `<subset>_trial` methods of the protocol.
        validation_data : provided by `validate_init`
            Not used by this method.

        Returns
        -------
        metrics : dict
            {'EER': {'minimize': True, 'value': eer}}

        Raises
        ------
        KeyError
            If a trial targets a `model_id` that was never enrolled.
        """

        # load current model
        model = self.load_model(epoch).to(self.device)
        model.eval()

        # use user-provided --duration when available
        # otherwise use 'duration' used for training
        if self.duration is None:
            duration = self.task_.duration
        else:
            duration = self.duration
        min_duration = None

        # if 'duration' is still None, it means that
        # network was trained with variable lengths
        if duration is None:
            duration = self.task_.max_duration
            min_duration = self.task_.min_duration

        step = .5 * duration

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               duration=duration,
                                               step=step,
                                               min_duration=min_duration,
                                               batch_size=self.batch_size,
                                               device=self.device)

        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)

        # enrolment_models: model_id -> averaged enrolment embedding
        # enrolment_keys: (file identifier, enrolment segments) -> model_id
        enrolment_models, enrolment_keys = {}, {}
        enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
        for enrolment in enrolments:
            data = sequence_embedding.apply(enrolment,
                                            crop=enrolment['enrol_with'])
            model_id = enrolment['model_id']
            enrolment_models[model_id] = np.mean(np.stack(data),
                                                 axis=0,
                                                 keepdims=True)

            # in some specific speaker verification protocols,
            # enrolment data may be used later as trial data.
            # therefore, we cache information about enrolment data
            # to speed things up by reusing the enrolment as trial.
            # the tuple itself is the key (rather than its hash) so that
            # a hash collision can never map to the wrong embedding.
            key = (get_unique_identifier(enrolment),
                   tuple(enrolment['enrol_with']))
            enrolment_keys[key] = model_id

        # (file identifier, trial segments) -> averaged trial embedding
        trial_models = {}
        trials = getattr(protocol, '{0}_trial'.format(subset))()
        y_true, y_pred = [], []
        for trial in trials:
            model_id = trial['model_id']

            key = (get_unique_identifier(trial), tuple(trial['try_with']))

            # re-use enrolment model whenever possible
            if key in enrolment_keys:
                trial_model = enrolment_models[enrolment_keys[key]]

            # re-use trial model whenever possible
            elif key in trial_models:
                trial_model = trial_models[key]

            else:
                data = sequence_embedding.apply(trial, crop=trial['try_with'])
                trial_model = np.mean(data, axis=0, keepdims=True)
                # cache trial model for later re-use
                trial_models[key] = trial_model

            # both operands hold a single row (keepdims=True), so the
            # distance matrix has a single entry
            distance = cdist(enrolment_models[model_id],
                             trial_model,
                             metric=self.metric)[0, 0]
            y_pred.append(distance)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true),
                                 np.array(y_pred),
                                 distances=True)

        return {'EER': {'minimize': True, 'value': eer}}

예제 #8

0

파일 보기

파일: speaker_embedding.py 프로젝트: instinct2k18/pyannote-audio

    def _validate_epoch_verification(self, epoch, protocol_name,
                                     subset='development',
                                     validation_data=None):
        """Perform a speaker verification experiment using model at `epoch`

        One averaged embedding is computed per enrolment and per trial. Each
        trial is scored by the distance between its embedding and the
        embedding of the enrolled model it targets.

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of speaker verification protocol
        subset : {'train', 'development', 'test'}, optional
            Name of subset. Selects the `<subset>_enrolment` and
            `<subset>_trial` methods of the protocol.
        validation_data : provided by `validate_init`
            Not used by this method.

        Returns
        -------
        metrics : dict
            {'EER': {'minimize': True, 'value': eer}}

        Raises
        ------
        KeyError
            If a trial targets a `model_id` that was never enrolled.
        """

        # load current model
        model = self.load_model(epoch).to(self.device)
        model.eval()

        # use user-provided --duration when available
        # otherwise use 'duration' used for training
        if self.duration is None:
            duration = self.task_.duration
        else:
            duration = self.duration
        min_duration = None

        # if 'duration' is still None, it means that
        # network was trained with variable lengths
        if duration is None:
            duration = self.task_.max_duration
            min_duration = self.task_.min_duration

        step = .5 * duration

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_, duration=duration,
            step=step, min_duration=min_duration,
            batch_size=self.batch_size, device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        # enrolment_models: model_id -> averaged enrolment embedding
        # enrolment_keys: (file identifier, enrolment segments) -> model_id
        enrolment_models, enrolment_keys = {}, {}
        enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
        for enrolment in enrolments:
            data = sequence_embedding.apply(enrolment,
                                            crop=enrolment['enrol_with'])
            model_id = enrolment['model_id']
            enrolment_models[model_id] = np.mean(
                np.stack(data), axis=0, keepdims=True)

            # in some specific speaker verification protocols,
            # enrolment data may be used later as trial data.
            # therefore, we cache information about enrolment data
            # to speed things up by reusing the enrolment as trial.
            # the tuple itself is the key (rather than its hash) so that
            # a hash collision can never map to the wrong embedding.
            key = (get_unique_identifier(enrolment),
                   tuple(enrolment['enrol_with']))
            enrolment_keys[key] = model_id

        # (file identifier, trial segments) -> averaged trial embedding
        trial_models = {}
        trials = getattr(protocol, '{0}_trial'.format(subset))()
        y_true, y_pred = [], []
        for trial in trials:
            model_id = trial['model_id']

            key = (get_unique_identifier(trial),
                   tuple(trial['try_with']))

            # re-use enrolment model whenever possible
            if key in enrolment_keys:
                trial_model = enrolment_models[enrolment_keys[key]]

            # re-use trial model whenever possible
            elif key in trial_models:
                trial_model = trial_models[key]

            else:
                data = sequence_embedding.apply(trial, crop=trial['try_with'])
                trial_model = np.mean(data, axis=0, keepdims=True)
                # cache trial model for later re-use
                trial_models[key] = trial_model

            # both operands hold a single row (keepdims=True), so the
            # distance matrix has a single entry
            distance = cdist(enrolment_models[model_id], trial_model,
                             metric=self.metric)[0, 0]
            y_pred.append(distance)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                                 distances=True)

        return {'EER': {'minimize': True, 'value': eer}}