Example #1
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8,
                 cls__preference=-20,
                 cls__metric='cosine'):

        super(SpeakerDiarizationPreStages, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)
Example #2
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__method='average',
                 cls__threshold=5,
                 cls__metric='cosine'):

        super(SpeakerDiarizationHACPre, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                             method=self.cls__method,
                                             threshold=self.cls__threshold)
Example #3
def main():
    usage = "%prog [options] database, raw_score_path"
    desc = "Write the output of the binary overlap detector into test based on a threshold"
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    parser.add_option("-t", "--onset", action="store", type="float", help="Onset Threshold", default=0.70)
    parser.add_option("-f", "--offset", action="store", type="float", help="Offset Threshold", default=0.70)
    parser.add_option("-d", "--dev", action="store_true", help="Print output based on development set", default=False)
    parser.add_option("-o", "--outputfile", action="store", type="string", help="Output file", default="./overlap.txt")
    (opt, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Incorrect number of arguments")
    database, raw_score_path = args

    # get test file of protocol
    protocol = get_protocol(database)

    # load precomputed overlap scores as pyannote.core.SlidingWindowFeature
    precomputed = Precomputed(raw_score_path)
    # initialize binarizer
    # onset / offset are tunable parameters (and should be tuned for better
    # performance). we use log_scale=True because of the final log-softmax in the
    # StackedRNN model
    binarize = Binarize(onset=opt.onset, offset=opt.offset, log_scale=True)

    fw = open(opt.outputfile, 'wt')

    if opt.dev:
        for test_file in protocol.development():
            ovl_scores = precomputed(test_file)


            # binarize overlap scores to obtain overlap regions as pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']


            # write the output into text
            write_txt(fw, ovl_regions)
 
    else:
        for test_file in protocol.test():
            ovl_scores = precomputed(test_file)


            # binarize overlap scores to obtain overlap regions as pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']


            # write the output into text
            write_txt(fw, ovl_regions)
    fw.close()
Example #4
    def with_params(self, **params):

        # initialize speech/non-speech binarizer
        speech_params = {
            '_'.join(param.split('_')[1:]): value
            for param, value in params.items()
            if param.startswith('speech_')}
        self.speech_binarize_ = Binarize(**speech_params)

        # initialize overlap binarizer
        if self.has_overlap_:
            overlap_params = {
                '_'.join(param.split('_')[1:]): value
                for param, value in params.items()
                if param.startswith('overlap_')}
            self.overlap_binarize_ = Binarize(**overlap_params)

        return self
Example #5
    def with_params(self,
                    sad_onset=0.7,
                    sad_offset=0.7,
                    scd_alpha=0.5,
                    scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self
Example #6
    def with_params(self, **params):

        # initialize speech/non-speech binarizer
        speech_params = {
            '_'.join(param.split('_')[1:]): value
            for param, value in params.items() if param.startswith('speech_')
        }
        self.speech_binarize_ = Binarize(**speech_params)

        # initialize overlap binarizer
        if self.has_overlap_:
            overlap_params = {
                '_'.join(param.split('_')[1:]): value
                for param, value in params.items()
                if param.startswith('overlap_')
            }
            self.overlap_binarize_ = Binarize(**overlap_params)

        return self
Example #7
        def fun(threshold):

            binarizer = Binarize(onset=threshold,
                                 offset=threshold,
                                 log_scale=False)

            protocol = get_protocol(protocol_name, progress=False,
                                    preprocessors=self.preprocessors_)

            metric = DetectionErrorRate()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            file_generator = getattr(protocol, subset)()
            for current_file in file_generator:

                uri = get_unique_identifier(current_file)
                hypothesis = binarizer.apply(
                    predictions[uri], dimension=0).to_annotation()
                reference = current_file['annotation']
                uem = get_annotated(current_file)
                _ = metric(reference, hypothesis, uem=uem)

            return abs(metric)
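The inner function fun(threshold) above maps a single onset/offset threshold to the detection error rate, so it can be handed to any 1-D optimizer. Inside the same enclosing method, the threshold could then be searched, for instance with scipy's bounded scalar minimizer. This is only a sketch: scipy is an assumption here and the original code may use a different search strategy.

        # hedged sketch: minimize the detection error rate over the threshold in [0, 1]
        # (assumes `fun` is the closure defined above; scipy is not used in the original)
        from scipy.optimize import minimize_scalar

        result = minimize_scalar(fun, bounds=(0., 1.), method='bounded',
                                 options={'xatol': 1e-2})
        best_threshold = result.x     # threshold minimizing the detection error rate
        best_error_rate = result.fun  # corresponding detection error rate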
Example #8
    def with_params(self, sad_onset=0.7, sad_offset=0.7,
                    scd_alpha=0.5, scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self
Example #9
def tune_binarizer(app, epoch, protocol_name, subset='development'):
    """Tune binarizer

    Parameters
    ----------
    app : SpeechActivityDetection
    epoch : int
        Epoch number.
    protocol_name : str
        E.g. 'Etape.SpeakerDiarization.TV'
    subset : {'train', 'development', 'test'}, optional
        Defaults to 'development'.

    Returns
    -------
    params : dict
        See Binarize.tune
    metric : float
        Best achieved detection error rate
    """

    # initialize protocol
    protocol = get_protocol(protocol_name,
                            progress=False,
                            preprocessors=app.preprocessors_)

    # load model for epoch 'epoch'
    sequence_labeling = SequenceLabeling.from_disk(app.train_dir_, epoch)

    # initialize sequence labeling
    duration = app.config_['sequences']['duration']
    step = app.config_['sequences']['step']
    aggregation = SequenceLabelingAggregation(sequence_labeling,
                                              app.feature_extraction_,
                                              duration=duration,
                                              step=step)
    aggregation.cache_preprocessed_ = False

    # tune Binarize thresholds (onset & offset)
    # with respect to detection error rate
    binarize_params, metric = Binarize.tune(getattr(protocol, subset)(),
                                            aggregation.apply,
                                            get_metric=DetectionErrorRate,
                                            dimension=1)

    return binarize_params, metric
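A hedged usage sketch of tune_binarizer: the app object is a placeholder for a SpeechActivityDetection application exposing the train_dir_, config_, feature_extraction_ and preprocessors_ attributes read by the function; the protocol name follows the docstring example and the epoch number is arbitrary.

# hypothetical invocation; `app` and the epoch number are placeholders
binarize_params, best_der = tune_binarizer(
    app, epoch=10,
    protocol_name='Etape.SpeakerDiarization.TV',
    subset='development')
# binarize_params: onset/offset thresholds (see Binarize.tune)
# best_der: best achieved detection error rate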
Example #10
    def annotate_speakers(self, filename, gui, visualization=True):
        test_file = {'uri': filename, 'audio': filename}

        sad_scores = self.sad(test_file)
        binarize = Binarize(offset=0.5, onset=0.70, log_scale=True)
        speech = binarize.apply(sad_scores, dimension=1)

        scd_scores = self.scd(test_file)
        peak = Peak(alpha=0.1, min_duration=1, log_scale=True)
        partition = peak.apply(scd_scores, dimension=1)

        speech_turns = partition.crop(speech)
        embeddings = self.emb(test_file)
        long_turns = Timeline(
            segments=[s for s in speech_turns if s.duration > 1.1])

        res = []
        for segment in long_turns:
            x = embeddings.crop(segment, mode='strict')
            if x.size == 0:
                continue
            n_sample = x.shape[0]
            x = np.mean(x, axis=0)
            if np.any(np.isnan(x)):
                continue
            res.append((segment, x, n_sample))

        if visualization:

            dist = []
            for i in range(len(res)):
                for j in range(i + 1, len(res)):
                    dist.append(l2_dist(res[i][1], res[j][1]))
            fig, ax = plt.subplots()
            ax.scatter(np.arange(len(dist)), np.array(sorted(dist)))
            fig.show()

            # let's visualize SAD and SCD results using pyannote.core visualization API

            # helper function to make visualization prettier
            plot_ready = lambda scores: SlidingWindowFeature(
                np.exp(scores.data[:, 1:]), scores.sliding_window)

            # create a figure with 6 rows with matplotlib
            nrows = 6
            fig, ax = plt.subplots(nrows=nrows, ncols=1)
            fig.set_figwidth(20)
            fig.set_figheight(nrows * 2)

            # 1st row: reference annotation
            # notebook.plot_annotation(test_file['annotation'], ax=ax[0])
            # ax[0].text(notebook.crop.start + 0.5, 0.1, 'reference', fontsize=14)

            # 2nd row: SAD raw scores
            notebook.plot_feature(plot_ready(sad_scores), ax=ax[1])
            ax[1].text(notebook.crop.start + 0.5,
                       0.6,
                       'SAD\nscores',
                       fontsize=14)
            ax[1].set_ylim(-0.1, 1.1)

            # 3rd row: SAD result
            notebook.plot_timeline(speech, ax=ax[2])
            ax[2].text(notebook.crop.start + 0.5, 0.1, 'SAD', fontsize=14)

            # 4th row: SCD raw scores
            notebook.plot_feature(plot_ready(scd_scores), ax=ax[3])
            ax[3].text(notebook.crop.start + 0.5,
                       0.3,
                       'SCD\nscores',
                       fontsize=14)
            ax[3].set_ylim(-0.1, 0.6)

            # 5th row: SCD result
            notebook.plot_timeline(partition, ax=ax[4])
            ax[4].text(notebook.crop.start + 0.5, 0.1, 'SCD', fontsize=14)

            # 6th row: combination of SAD and SCD
            notebook.plot_timeline(speech_turns, ax=ax[5])
            ax[5].text(notebook.crop.start + 0.5,
                       0.1,
                       'speech turns',
                       fontsize=14)

            fig.show()

        res, num_people = self.min_spanning_tree(res)
        gui.append_line('There are {} people in this audio'.format(num_people))

        return res
Example #11
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    precomputed : str
        Path to precomputed SAD scores.
    """
    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()
        self.precomputed = precomputed

        self.precomputed_ = Precomputed(self.precomputed)
        self.has_overlap_ = self.precomputed_.dimension() == 3

        self.with_params(**kwargs)

    def get_tune_space(self):

        space = {
            'speech_onset': chocolate.uniform(0., 1.),
            'speech_offset': chocolate.uniform(0., 1.),
            'speech_min_duration_on': chocolate.uniform(0., 2.),
            'speech_min_duration_off': chocolate.uniform(0., 2.),
            'speech_pad_onset': chocolate.uniform(-1., 1.),
            'speech_pad_offset': chocolate.uniform(-1., 1.)
        }

        if self.has_overlap_:
            space.update({
                'overlap_onset': chocolate.uniform(0., 1.),
                'overlap_offset': chocolate.uniform(0., 1.),
                'overlap_min_duration_on': chocolate.uniform(0., 2.),
                'overlap_min_duration_off': chocolate.uniform(0., 2.),
                'overlap_pad_onset': chocolate.uniform(-1., 1.),
                'overlap_pad_offset': chocolate.uniform(-1., 1.)
            })

        return space

    def get_tune_metric(self):
        return DetectionErrorRate()

    def with_params(self, **params):

        # initialize speech/non-speech binarizer
        speech_params = {
            '_'.join(param.split('_')[1:]): value
            for param, value in params.items() if param.startswith('speech_')
        }
        self.speech_binarize_ = Binarize(**speech_params)

        # initialize overlap binarizer
        if self.has_overlap_:
            overlap_params = {
                '_'.join(param.split('_')[1:]): value
                for param, value in params.items()
                if param.startswith('overlap_')
            }
            self.overlap_binarize_ = Binarize(**overlap_params)

        return self

    def apply(self, current_file):

        # extract precomputed scores
        precomputed = self.precomputed_(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(precomputed.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(precomputed.data) if self.log_scale_ \
               else precomputed.data

        # speech vs. non-speech
        speech_prob = SlidingWindowFeature(1. - data[:, 0],
                                           precomputed.sliding_window)
        speech = self.speech_binarize_.apply(speech_prob)

        if self.has_overlap_:

            # overlap vs. non-overlap
            overlap_prob = SlidingWindowFeature(data[:, 2],
                                                precomputed.sliding_window)
            overlap = self.overlap_binarize_.apply(overlap_prob)

            # overlap speech can only happen in speech regions
            overlap = overlap.crop(speech)
        else:
            # empty timeline
            overlap = Timeline()

        speech = speech.to_annotation(generator='string')
        overlap = overlap.to_annotation(generator='int')
        hypothesis = speech.update(overlap)

        return hypothesis
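A minimal usage sketch of the pipeline above, assuming a directory of precomputed SAD scores; the path is a placeholder and the protocol name is borrowed from another example in this listing.

# hedged usage sketch; the precomputed path is a placeholder
pipeline = SpeechActivityDetection(precomputed='/path/to/precomputed/sad',
                                   speech_onset=0.7, speech_offset=0.7)

protocol = get_protocol('AMI.SpeakerDiarization.MixHeadset')
for current_file in protocol.development():
    hypothesis = pipeline.apply(current_file)
    # `hypothesis` is a pyannote.core.Annotation: speech regions carry string
    # labels, overlapped speech (when scores have 3 dimensions) integer labels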
Example #12
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):

        target_precision = self.precision

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=.25 * duration, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        predictions = {}
        references = {}

        file_generator = getattr(protocol, subset)()
        for current_file in file_generator:
            uri = get_unique_identifier(current_file)

            # build overlap reference
            reference = Timeline(uri=uri)
            annotation = current_file['annotation']
            for track1, track2 in annotation.co_iter(annotation):
                if track1 == track2:
                    continue
                reference.add(track1[0] & track2[0])
            references[uri] = reference.to_annotation()

            # extract overlap scores
            scores = sequence_labeling.apply(current_file)

            if model.logsoftmax:
                scores = SlidingWindowFeature(
                    np.exp(scores.data[:, 2]), scores.sliding_window)
            else:
                scores = SlidingWindowFeature(
                    scores.data[:, 2], scores.sliding_window)

            predictions[uri] = scores

        # dichotomic search to find threshold that maximizes recall
        # while having at least `target_precision`

        lower_alpha = 0.
        upper_alpha = 1.
        best_alpha = .5 * (lower_alpha + upper_alpha)
        best_recall = 0.

        for _ in range(10):
            current_alpha = .5 * (lower_alpha + upper_alpha)
            binarizer = Binarize(onset=current_alpha,
                                 offset=current_alpha,
                                 log_scale=False)

            precision = DetectionPrecision()
            recall = DetectionRecall()

            for current_file in getattr(protocol, subset)():
                uri = get_unique_identifier(current_file)
                reference = references[uri]
                hypothesis = binarizer.apply(predictions[uri], dimension=0)
                hypothesis = hypothesis.to_annotation()
                uem = get_annotated(current_file)
                _ = precision(reference, hypothesis, uem=uem)
                _ = recall(reference, hypothesis, uem=uem)

            if abs(precision) < target_precision:
                # precision is not high enough: try higher thresholds
                lower_alpha = current_alpha
            else:
                upper_alpha = current_alpha
                r = abs(recall)
                if r > best_recall:
                    best_recall = r
                    best_alpha = current_alpha

        task = 'overlap_speech_detection'
        metric_name = f'{task}/recall@{target_precision:.2f}precision'
        return {
            metric_name: {'minimize': False, 'value': best_recall},
            f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
Example #13
class NeuralSegmentation(Pipeline):

    def __init__(self, sad=None, scd=None, **kwargs):
        super().__init__()
        self.sad = Path(sad).expanduser().resolve(strict=True)
        self.scd = Path(scd).expanduser().resolve(strict=True)
        self.with_params(**kwargs)

    def get_tune_space(self):
        return {
            'sad_onset': chocolate.uniform(0., 1.),
            'sad_offset': chocolate.uniform(0., 1.),
            'scd_alpha': chocolate.uniform(0., 1.),
            'scd_min_duration': chocolate.uniform(0., 5.),
        }

    def get_tune_metric(self):
        raise NotImplementedError()

    def with_params(self, sad_onset=0.7, sad_offset=0.7,
                    scd_alpha=0.5, scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self

    def apply(self, current_file):

        # Speech Activity Detection

        # get raw SAD scores
        soft_sad = self.sad_(current_file)

        # check once and for all whether SAD scores are log-scaled
        if not hasattr(self, 'sad_log_scale_'):
            if np.nanmean(soft_sad.data) < 0:
                self.sad_log_scale_ = True
            else:
                self.sad_log_scale_ = False

        # get SAD probability
        prob_sad = np.exp(soft_sad.data) if self.sad_log_scale_ \
                   else soft_sad.data

        # support both non-speech/speech & non-speech/single/overlap
        prob_sad = 1. - prob_sad[:, 0]
        prob_sad = SlidingWindowFeature(prob_sad, soft_sad.sliding_window)

        # binarization
        hard_sad = self.sad_binarize_.apply(prob_sad)

        # Speaker Change Detection

        # get raw SCD scores
        soft_scd = self.scd_(current_file)

        # check once and for all whether SCD scores are log-scaled
        if not hasattr(self, 'scd_log_scale_'):
            if np.nanmean(soft_scd.data) < 0:
                self.scd_log_scale_ = True
            else:
                self.scd_log_scale_ = False

        # get SCD probability
        prob_scd = np.exp(soft_scd.data) if self.scd_log_scale_ \
                   else soft_scd.data

        # take the final dimension
        # (in order to support both classification and regression scores)
        prob_scd = prob_scd[:, -1]
        prob_scd = SlidingWindowFeature(prob_scd, soft_scd.sliding_window)

        # peak detection
        hard_scd = self.scd_peak_.apply(prob_scd)

        speech_turns = hard_scd.crop(hard_sad)

        # only process the annotated part
        speech_turns = speech_turns.crop(get_annotated(current_file))

        return speech_turns
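A hedged usage sketch of NeuralSegmentation, assuming directories of precomputed SAD and SCD scores (both paths are placeholders) and a protocol obtained with get_protocol as in the other examples.

# hypothetical usage sketch; both score directories are placeholders
pipeline = NeuralSegmentation(sad='/path/to/precomputed/sad',
                              scd='/path/to/precomputed/scd',
                              sad_onset=0.7, sad_offset=0.7,
                              scd_alpha=0.5, scd_min_duration=1.)

for current_file in protocol.test():
    # `speech_turns` is a pyannote.core.Timeline of speaker-homogeneous segments
    speech_turns = pipeline.apply(current_file)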
Example #14
class SpeakerDiarizationWeighted(object):
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 weight__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8,
                 cls__preference=-20,
                 cls__metric='cosine'):

        super(SpeakerDiarizationWeighted, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize weights
        self.weight_ = Precomputed(weight__pre)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)

    def __call__(self, current_file, annotated=False):

        # speech activity detection
        soft_sad = self.sad_(current_file)
        hard_sad = self.sad_binarize_.apply(soft_sad,
                                            dimension=self.sad__dimension)

        # speaker change detection
        soft_scd = self.scd_(current_file)
        hard_scd = self.scd_peak_.apply(soft_scd,
                                        dimension=self.scd__dimension)

        # speech turns
        speech_turns = hard_scd.crop(hard_sad)

        if annotated:
            speech_turns = speech_turns.crop(get_annotated(current_file))

        # remove small speech turns
        emb = self.emb_(current_file)
        speech_turns = [
            speech_turn for speech_turn in speech_turns
            if len(emb.crop(speech_turn, mode='loose')) > 0
        ]

        # weights
        weight = self.weight_(current_file)

        # speech turns embedding
        to_stack = [
            np.mean(emb.crop(speech_turn, mode='loose') *
                    (1 - weight.crop(speech_turn, mode='loose')),
                    axis=0) for speech_turn in speech_turns
        ]
        if len(to_stack) < 1:
            return None
        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for speech_turn, label in zip(speech_turns, cluster_labels):
            hypothesis[speech_turn] = label
        return hypothesis
Example #15
class SpeakerDiarizationHACPre(object):
    '''Speaker diarization with hierarchical agglomerative clustering'''
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__method='average',
                 cls__threshold=5,
                 cls__metric='cosine'):

        super(SpeakerDiarizationHACPre, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                             method=self.cls__method,
                                             threshold=self.cls__threshold)

    def __call__(self, current_file, annotated=False):

        # speech activity detection
        soft_sad = self.sad_(current_file)
        hard_sad = self.sad_binarize_.apply(soft_sad,
                                            dimension=self.sad__dimension)

        # speaker change detection
        soft_scd = self.scd_(current_file)
        hard_scd = self.scd_peak_.apply(soft_scd,
                                        dimension=self.scd__dimension)

        # speech turns
        speech_turns = hard_scd.crop(hard_sad)

        if annotated:
            speech_turns = speech_turns.crop(get_annotated(current_file))

        # remove small speech turns
        emb = self.emb_(current_file)
        speech_turns = [
            speech_turn for speech_turn in speech_turns
            if len(emb.crop(speech_turn, mode='loose')) > 0
        ]

        # speech turns embedding
        to_stack = [
            np.sum(emb.crop(speech_turn, mode='loose'), axis=0)
            for speech_turn in speech_turns
        ]
        if len(to_stack) < 1:
            return None
        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for speech_turn, label in zip(speech_turns, cluster_labels):
            hypothesis[speech_turn] = label
        return hypothesis
Example #16
def test(dataset, medium_template, config_yml, weights_h5, output_dir):

    # load configuration file
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # this is where model architecture was saved
    architecture_yml = os.path.dirname(
        os.path.dirname(weights_h5)) + '/architecture.yml'

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        raise NotImplementedError('')

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    # input sequence duration
    duration = config['feature_extraction']['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**config['feature_extraction']['mfcc'])
    # normalization
    normalize = config['feature_extraction']['normalize']

    # -- TESTING --
    # overlap ratio between each window
    overlap = config['testing']['overlap']
    step = duration * (1. - overlap)

    # prediction smoothing
    onset = config['testing']['binarize']['onset']
    offset = config['testing']['binarize']['offset']
    binarizer = Binarize(onset=onset, offset=offset)

    sequence_labeling = SequenceLabeling.from_disk(architecture_yml,
                                                   weights_h5)

    aggregation = SequenceLabelingAggregation(sequence_labeling,
                                              feature_extractor,
                                              normalize=normalize,
                                              duration=duration,
                                              step=step)

    collar = 0.500
    error_rate = DetectionErrorRate(collar=collar)
    accuracy = DetectionAccuracy(collar=collar)
    precision = DetectionPrecision(collar=collar)
    recall = DetectionRecall(collar=collar)

    LINE = '{uri} {e:.3f} {a:.3f} {p:.3f} {r:.3f} {f:.3f}\n'

    PATH = '{output_dir}/eval.{dataset}.{subset}.txt'
    path = PATH.format(output_dir=output_dir, dataset=dataset, subset=subset)

    with open(path, 'w') as fp:

        header = '# uri error accuracy precision recall f_measure\n'
        fp.write(header)
        fp.flush()

        for current_file in file_generator:

            uri = current_file['uri']
            wav = current_file['medium']['wav']
            annotated = current_file['annotated']
            annotation = current_file['annotation']

            predictions = aggregation.apply(wav)
            hypothesis = binarizer.apply(predictions, dimension=1)

            e = error_rate(annotation, hypothesis, uem=annotated)
            a = accuracy(annotation, hypothesis, uem=annotated)
            p = precision(annotation, hypothesis, uem=annotated)
            r = recall(annotation, hypothesis, uem=annotated)
            f = f_measure(p, r)

            line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
            fp.write(line)
            fp.flush()

            PATH = '{output_dir}/{uri}.json'
            path = PATH.format(output_dir=output_dir, uri=uri)
            dump_to(hypothesis, path)

        # average on whole corpus
        uri = '{dataset}.{subset}'.format(dataset=dataset, subset=subset)
        e = abs(error_rate)
        a = abs(accuracy)
        p = abs(precision)
        r = abs(recall)
        f = f_measure(p, r)
        line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
        fp.write(line)
        fp.flush()
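A hedged call sketch for the test function above: the dataset string is split into database.task.protocol.subset, and every path below is a placeholder; the medium_template format (a dict mapping the 'wav' medium to a path template) is an assumption, not part of the original.

# hypothetical invocation; all paths and the template format are placeholders
test(dataset='Etape.SpeakerDiarization.TV.test',
     medium_template={'wav': '/path/to/wav/{uri}.wav'},
     config_yml='/path/to/config.yml',
     weights_h5='/path/to/weights/0050.h5',
     output_dir='/path/to/output')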
Example #17
    def apply(self, protocol_name, subset='test'):

        apply_dir = self.APPLY_DIR.format(tune_dir=self.tune_dir_)

        mkdir_p(apply_dir)

        # load tuning results
        tune_yml = self.TUNE_YML.format(tune_dir=self.tune_dir_)
        with io.open(tune_yml, 'r') as fp:
            self.tune_ = yaml.load(fp)

        # load model for epoch 'epoch'
        epoch = self.tune_['epoch']
        sequence_labeling = SequenceLabeling.from_disk(self.train_dir_, epoch)

        # initialize sequence labeling
        duration = self.config_['sequences']['duration']
        step = self.config_['sequences']['step']
        aggregation = SequenceLabelingAggregation(sequence_labeling,
                                                  self.feature_extraction_,
                                                  duration=duration,
                                                  step=step)

        # initialize protocol
        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)

        for i, item in enumerate(getattr(protocol, subset)()):

            prediction = aggregation.apply(item)

            if i == 0:
                # create metadata file at root that contains
                # sliding window and dimension information
                path = Precomputed.get_config_path(apply_dir)
                f = h5py.File(path)
                f.attrs['start'] = prediction.sliding_window.start
                f.attrs['duration'] = prediction.sliding_window.duration
                f.attrs['step'] = prediction.sliding_window.step
                f.attrs['dimension'] = 2
                f.close()

            path = Precomputed.get_path(apply_dir, item)

            # create parent directory
            mkdir_p(dirname(path))

            f = h5py.File(path)
            f.attrs['start'] = prediction.sliding_window.start
            f.attrs['duration'] = prediction.sliding_window.duration
            f.attrs['step'] = prediction.sliding_window.step
            f.attrs['dimension'] = 2
            f.create_dataset('features', data=prediction.data)
            f.close()

        # initialize binarizer
        onset = self.tune_['onset']
        offset = self.tune_['offset']
        binarize = Binarize(onset=onset, offset=offset)

        precomputed = Precomputed(root_dir=apply_dir)

        writer = MDTMParser()
        path = self.HARD_MDTM.format(apply_dir=apply_dir,
                                     protocol=protocol_name,
                                     subset=subset)
        with io.open(path, mode='w') as gp:
            for item in getattr(protocol, subset)():
                prediction = precomputed(item)
                segmentation = binarize.apply(prediction, dimension=1)
                writer.write(segmentation.to_annotation(),
                             f=gp,
                             uri=item['uri'],
                             modality='speaker')
Example #18
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    precomputed : str
        Path to precomputed SAD scores.
    """

    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()
        self.precomputed = precomputed

        self.precomputed_ = Precomputed(self.precomputed)
        self.has_overlap_ = self.precomputed_.dimension() == 3

        self.with_params(**kwargs)

    def get_tune_space(self):

        space = {
            'speech_onset': chocolate.uniform(0., 1.),
            'speech_offset': chocolate.uniform(0., 1.),
            'speech_min_duration_on': chocolate.uniform(0., 2.),
            'speech_min_duration_off': chocolate.uniform(0., 2.),
            'speech_pad_onset': chocolate.uniform(-1., 1.),
            'speech_pad_offset': chocolate.uniform(-1., 1.)
        }

        if self.has_overlap_:
            space.update({
                'overlap_onset': chocolate.uniform(0., 1.),
                'overlap_offset': chocolate.uniform(0., 1.),
                'overlap_min_duration_on': chocolate.uniform(0., 2.),
                'overlap_min_duration_off': chocolate.uniform(0., 2.),
                'overlap_pad_onset': chocolate.uniform(-1., 1.),
                'overlap_pad_offset': chocolate.uniform(-1., 1.)
            })

        return space

    def get_tune_metric(self):
        return DetectionErrorRate()

    def with_params(self, **params):

        # initialize speech/non-speech binarizer
        speech_params = {
            '_'.join(param.split('_')[1:]): value
            for param, value in params.items()
            if param.startswith('speech_')}
        self.speech_binarize_ = Binarize(**speech_params)

        # initialize overlap binarizer
        if self.has_overlap_:
            overlap_params = {
                '_'.join(param.split('_')[1:]): value
                for param, value in params.items()
                if param.startswith('overlap_')}
            self.overlap_binarize_ = Binarize(**overlap_params)

        return self

    def apply(self, current_file):

        # extract precomputed scores
        precomputed = self.precomputed_(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(precomputed.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(precomputed.data) if self.log_scale_ \
               else precomputed.data

        # speech vs. non-speech
        speech_prob = SlidingWindowFeature(
            1. - data[:, 0],
            precomputed.sliding_window)
        speech = self.speech_binarize_.apply(speech_prob)

        if self.has_overlap_:

            # overlap vs. non-overlap
            overlap_prob = SlidingWindowFeature(
                data[:, 2], precomputed.sliding_window)
            overlap = self.overlap_binarize_.apply(overlap_prob)

            # overlap speech can only happen in speech regions
            overlap = overlap.crop(speech)
        else:
            # empty timeline
            overlap = Timeline()

        speech = speech.to_annotation(generator='string')
        overlap = overlap.to_annotation(generator='int')
        hypothesis = speech.update(overlap)

        return hypothesis
Example #19
# protocol = get_protocol('AMI.SpeakerDiarization.MixHeadset',
#                         preprocessors=preprocessors)
# test_file = next(protocol.test())

#
from pyannote.audio.labeling.extraction import SequenceLabeling
sad = SequenceLabeling(model=SAD_MODEL)
scd = SequenceLabeling(model=SCD_MODEL)

sad_scores = sad(test_file)
#
# # binarize raw SAD scores (as `pyannote.core.Timeline` instance)
# # NOTE: both onset/offset values were tuned on AMI dataset.
# # you might need to use different values for better results.
from pyannote.audio.signal import Binarize
binarize = Binarize(offset=0.94, onset=0.70, log_scale=True)
speech = binarize.apply(sad_scores, dimension=1)
#
# iterate over speech segments (as `pyannote.core.Segment` instances)
for segment in speech:
    print(segment.start, segment.end)

# obtain raw SCD scores (as `pyannote.core.SlidingWindowFeature` instance)
scd_scores = scd(test_file)

# detect peaks and return speaker homogeneous segments
# (as `pyannote.core.Annotation` instance)
# NOTE: both alpha/min_duration values were tuned on AMI dataset.
# you might need to use different values for better results.
from pyannote.audio.signal import Peak
peak = Peak(alpha=0.08, min_duration=0.40, log_scale=True)
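The snippet stops after building the peak detector; a minimal continuation sketch, mirroring Example #10, applies it to the SCD scores and keeps only the change-point partition that falls inside detected speech.

# continuation sketch (not part of the original snippet)
partition = peak.apply(scd_scores, dimension=1)

# speaker-homogeneous speech turns: partition restricted to speech regions
speech_turns = partition.crop(speech)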
Example #20
class NeuralSegmentation(Pipeline):
    def __init__(self, sad=None, scd=None, **kwargs):
        super().__init__()
        self.sad = Path(sad).expanduser().resolve(strict=True)
        self.scd = Path(scd).expanduser().resolve(strict=True)
        self.with_params(**kwargs)

    def get_tune_space(self):
        return {
            'sad_onset': chocolate.uniform(0., 1.),
            'sad_offset': chocolate.uniform(0., 1.),
            'scd_alpha': chocolate.uniform(0., 1.),
            'scd_min_duration': chocolate.uniform(0., 5.),
        }

    def get_tune_metric(self):
        raise NotImplementedError()

    def with_params(self,
                    sad_onset=0.7,
                    sad_offset=0.7,
                    scd_alpha=0.5,
                    scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self

    def apply(self, current_file):

        # Speech Activity Detection

        # get raw SAD scores
        soft_sad = self.sad_(current_file)

        # check once and for all whether SAD scores are log-scaled
        if not hasattr(self, 'sad_log_scale_'):
            if np.nanmean(soft_sad.data) < 0:
                self.sad_log_scale_ = True
            else:
                self.sad_log_scale_ = False

        # get SAD probability
        prob_sad = np.exp(soft_sad.data) if self.sad_log_scale_ \
                   else soft_sad.data

        # support both non-speech/speech & non-speech/single/overlap
        prob_sad = 1. - prob_sad[:, 0]
        prob_sad = SlidingWindowFeature(prob_sad, soft_sad.sliding_window)

        # binarization
        hard_sad = self.sad_binarize_.apply(prob_sad)

        # Speaker Change Detection

        # get raw SCD scores
        soft_scd = self.scd_(current_file)

        # check once and for all whether SCD scores are log-scaled
        if not hasattr(self, 'scd_log_scale_'):
            if np.nanmean(soft_scd.data) < 0:
                self.scd_log_scale_ = True
            else:
                self.scd_log_scale_ = False

        # get SCD probability
        prob_scd = np.exp(soft_scd.data) if self.scd_log_scale_ \
                   else soft_scd.data

        # take the final dimension
        # (in order to support both classification and regression scores)
        prob_scd = prob_scd[:, -1]
        prob_scd = SlidingWindowFeature(prob_scd, soft_scd.sliding_window)

        # peak detection
        hard_scd = self.scd_peak_.apply(prob_scd)

        speech_turns = hard_scd.crop(hard_sad)

        # only process the annotated part
        speech_turns = speech_turns.crop(get_annotated(current_file))

        return speech_turns