Example no. 1
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8,
                 cls__preference=-20,
                 cls__metric='cosine'):

        super(SpeakerDiarizationPreStages, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)
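
# my_cluster.ClusteringAP is not shown in this snippet; a hedged stand-in with
# the same constructor arguments, wrapping scikit-learn's affinity propagation
# (an assumption about what the module does, not its actual implementation):
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import pairwise_distances

class ClusteringAPSketch(object):
    def __init__(self, metric='cosine', damping=0.8, preference=-20):
        self.metric = metric
        self.damping = damping
        self.preference = preference

    def apply(self, fX):
        # affinity propagation expects similarities: use negated distances
        affinity = -pairwise_distances(fX, metric=self.metric)
        ap = AffinityPropagation(damping=self.damping,
                                 preference=self.preference,
                                 affinity='precomputed')
        return ap.fit_predict(affinity)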
Example no. 2
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__method='average',
                 cls__threshold=5,
                 cls__metric='cosine'):

        super(SpeakerDiarizationHACPre, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                             method=self.cls__method,
                                             threshold=self.cls__threshold)
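
# likewise, my_cluster.ClusteringHAC is not shown; a hedged stand-in with the
# same constructor arguments, based on scipy's hierarchical agglomerative
# clustering (an assumption about the module's behaviour, not its actual code):
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist

class ClusteringHACSketch(object):
    def __init__(self, metric='cosine', method='average', threshold=5):
        self.metric = metric
        self.method = method
        self.threshold = threshold

    def apply(self, fX):
        # condensed distance matrix -> dendrogram -> flat clusters
        Z = linkage(pdist(fX, metric=self.metric), method=self.method)
        return fcluster(Z, self.threshold, criterion='distance')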
Example no. 3
    def with_params(self,
                    sad_onset=0.7,
                    sad_offset=0.7,
                    scd_alpha=0.5,
                    scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self
Example no. 4
    def objective_function(parameters, beta=1.0):

        epoch, alpha = parameters

        weights_h5 = WEIGHTS_H5.format(epoch=epoch)
        sequence_embedding = SequenceEmbedding.from_disk(
            architecture_yml, weights_h5)

        segmentation = Segmentation(
            sequence_embedding, feature_extraction,
            duration=duration, step=0.100)

        if epoch not in predictions:
            predictions[epoch] = {}

        purity = SegmentationPurity()
        coverage = SegmentationCoverage()

        f, n = 0., 0
        for dev_file in getattr(protocol, subset)():

            uri = get_unique_identifier(dev_file)
            reference = dev_file['annotation']
            n += 1

            if uri in predictions[epoch]:
                prediction = predictions[epoch][uri]
            else:
                prediction = segmentation.apply(dev_file)
                predictions[epoch][uri] = prediction

            peak = Peak(alpha=alpha)
            hypothesis = peak.apply(prediction)

            p = purity(reference, hypothesis)
            c = coverage(reference, hypothesis)
            f += f_measure(c, p, beta=beta)

        return 1 - (f / n)
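
    # a hedged sketch (not in the original snippet): the objective above could
    # be minimized with a plain grid search over (epoch, alpha); the actual
    # tuning script may use a dedicated optimizer instead. candidate_epochs and
    # candidate_alphas are placeholder search spaces.
    candidate_epochs = range(0, 100, 10)
    candidate_alphas = [i / 10. for i in range(11)]
    best_params = min(
        ((epoch, alpha) for epoch in candidate_epochs for alpha in candidate_alphas),
        key=objective_function)
    print('best epoch = {0}, best alpha = {1:.2f}'.format(*best_params))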
    def with_params(self, sad_onset=0.7, sad_offset=0.7,
                    scd_alpha=0.5, scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self
class NeuralSegmentation(Pipeline):

    def __init__(self, sad=None, scd=None, **kwargs):
        super().__init__()
        self.sad = Path(sad).expanduser().resolve(strict=True)
        self.scd = Path(scd).expanduser().resolve(strict=True)
        self.with_params(**kwargs)

    def get_tune_space(self):
        return {
            'sad_onset': chocolate.uniform(0., 1.),
            'sad_offset': chocolate.uniform(0., 1.),
            'scd_alpha': chocolate.uniform(0., 1.),
            'scd_min_duration': chocolate.uniform(0., 5.),
        }

    def get_tune_metric(self):
        raise NotImplementedError()

    def with_params(self, sad_onset=0.7, sad_offset=0.7,
                    scd_alpha=0.5, scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self

    def apply(self, current_file):

        # Speech Activity Detection

        # get raw SAD scores
        soft_sad = self.sad_(current_file)

        # check once and for all whether SAD scores are log-scaled
        if not hasattr(self, 'sad_log_scale_'):
            if np.nanmean(soft_sad.data) < 0:
                self.sad_log_scale_ = True
            else:
                self.sad_log_scale_ = False

        # get SAD probability
        prob_sad = np.exp(soft_sad.data) if self.sad_log_scale_ \
                   else soft_sad.data

        # support both non-speech/speech & non-speech/single/overlap
        prob_sad = 1. - prob_sad[:, 0]
        prob_sad = SlidingWindowFeature(prob_sad, soft_sad.sliding_window)

        # binarization
        hard_sad = self.sad_binarize_.apply(prob_sad)

        # Speaker Change Detection

        # get raw SCD scores
        soft_scd = self.scd_(current_file)

        # check once and for all whether SCD scores are log-scaled
        if not hasattr(self, 'scd_log_scale_'):
            if np.nanmean(soft_scd.data) < 0:
                self.scd_log_scale_ = True
            else:
                self.scd_log_scale_ = False

        # get SCD probability
        prob_scd = np.exp(soft_scd.data) if self.scd_log_scale_ \
                   else soft_scd.data

        # take the final dimension
        # (in order to support both classification and regression scores)
        prob_scd = prob_scd[:, -1]
        prob_scd = SlidingWindowFeature(prob_scd, soft_scd.sliding_window)

        # peak detection
        hard_scd = self.scd_peak_.apply(prob_scd)

        speech_turns = hard_scd.crop(hard_sad)

        # only process the annotated part
        speech_turns = speech_turns.crop(get_annotated(current_file))

        return speech_turns
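
# a hedged stand-alone illustration (not part of the original class): the same
# log-scale heuristic used in apply() above, extracted as a helper
import numpy as np

def scores_to_probability(data):
    """Exponentiate scores that look log-scaled (mean < 0), else return as-is."""
    data = np.asarray(data, dtype=float)
    return np.exp(data) if np.nanmean(data) < 0 else data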
Example no. 7
def test(protocol, tune_dir, apply_dir, subset='test', beta=1.0):

    os.makedirs(apply_dir)

    train_dir = os.path.dirname(os.path.dirname(os.path.dirname(tune_dir)))

    duration = float(os.path.basename(train_dir))
    config_dir = os.path.dirname(os.path.dirname(os.path.dirname(train_dir)))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- HYPER-PARAMETERS --
    tune_yml = tune_dir + '/tune.yml'
    with open(tune_yml, 'r') as fp:
        tune = yaml.load(fp, Loader=yaml.SafeLoader)

    architecture_yml = train_dir + '/architecture.yml'
    WEIGHTS_H5 = train_dir + '/weights/{epoch:04d}.h5'
    weights_h5 = WEIGHTS_H5.format(epoch=tune['epoch'])

    sequence_embedding = SequenceEmbedding.from_disk(
        architecture_yml, weights_h5)

    segmentation = Segmentation(
        sequence_embedding, feature_extraction,
        duration=duration, step=0.100)

    peak = Peak(alpha=tune['alpha'])

    HARD_JSON = apply_dir + '/{uri}.hard.json'
    SOFT_PKL = apply_dir + '/{uri}.soft.pkl'

    eval_txt = apply_dir + '/eval.txt'
    TEMPLATE = '{uri} {purity:.5f} {coverage:.5f} {f_measure:.5f}\n'
    purity = SegmentationPurity()
    coverage = SegmentationCoverage()
    fscore = []

    for test_file in getattr(protocol, subset)():

        soft = segmentation.apply(test_file)
        hard = peak.apply(soft)

        uri = get_unique_identifier(test_file)

        path = SOFT_PKL.format(uri=uri)
        mkdir_p(os.path.dirname(path))
        with open(path, 'wb') as fp:
            pickle.dump(soft, fp)

        path = HARD_JSON.format(uri=uri)
        mkdir_p(os.path.dirname(path))
        with open(path, 'w') as fp:
            pyannote.core.json.dump(hard, fp)

        try:
            reference = test_file['annotation']
            uem = test_file['annotated']
        except KeyError as e:
            continue

        p = purity(reference, hard)
        c = coverage(reference, hard)
        f = f_measure(c, p, beta=beta)
        fscore.append(f)

        line = TEMPLATE.format(
            uri=uri, purity=p, coverage=c, f_measure=f)
        with open(eval_txt, 'a') as fp:
            fp.write(line)

    p = abs(purity)
    c = abs(coverage)
    f = np.mean(fscore)
    line = TEMPLATE.format(
        uri='ALL', purity=p, coverage=c, f_measure=f)
    with open(eval_txt, 'a') as fp:
        fp.write(line)
Example no. 8
    predictions[uri] = segmentation.apply(wav)

# tested thresholds
alphas = np.linspace(0, 1, 50)

# evaluation metrics (purity and coverage)
from pyannote.metrics.segmentation import SegmentationPurity
from pyannote.metrics.segmentation import SegmentationCoverage
purity = [SegmentationPurity() for alpha in alphas]
coverage = [SegmentationCoverage() for alpha in alphas]

# peak detection
from pyannote.audio.signal import Peak
for i, alpha in enumerate(alphas):
    # initialize peak detection algorithm
    peak = Peak(alpha=alpha, min_duration=1.0)
    for uri, reference in groundtruth.items():
        # apply peak detection
        hypothesis = peak.apply(predictions[uri])
        # compute purity and coverage
        purity[i](reference, hypothesis)
        coverage[i](reference, hypothesis)

# print the results in three columns:
# threshold, purity, coverage
TEMPLATE = '{alpha:.3f} {purity:.1f}% {coverage:.1f}%'
for i, a in enumerate(alphas):
    p = 100 * abs(purity[i])
    c = 100 * abs(coverage[i])
    print(TEMPLATE.format(alpha=a, purity=p, coverage=c))
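
# a hedged follow-up (not in the original snippet): pick the threshold that
# maximizes the harmonic mean (F-measure) of purity and coverage
best, best_f = 0, 0.
for i, alpha in enumerate(alphas):
    p = abs(purity[i])
    c = abs(coverage[i])
    f = 2 * p * c / (p + c) if (p + c) > 0 else 0.
    if f > best_f:
        best, best_f = i, f
print('best alpha = {0:.3f} (F-measure = {1:.3f})'.format(alphas[best], best_f))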
Example no. 9
class SpeakerDiarizationHACPre(object):
    '''Speaker diarization with hierarchical agglomerative clustering'''
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__method='average',
                 cls__threshold=5,
                 cls__metric='cosine'):

        super(SpeakerDiarizationHACPre, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                             method=self.cls__method,
                                             threshold=self.cls__threshold)

    def __call__(self, current_file, annotated=False):

        # speech activity detection
        soft_sad = self.sad_(current_file)
        hard_sad = self.sad_binarize_.apply(soft_sad,
                                            dimension=self.sad__dimension)

        # speaker change detection
        soft_scd = self.scd_(current_file)
        hard_scd = self.scd_peak_.apply(soft_scd,
                                        dimension=self.scd__dimension)

        # speech turns
        speech_turns = hard_scd.crop(hard_sad)

        if annotated:
            speech_turns = speech_turns.crop(get_annotated(current_file))

        # remove small speech turns
        emb = self.emb_(current_file)
        speech_turns = [
            speech_turn for speech_turn in speech_turns
            if len(emb.crop(speech_turn, mode='loose')) > 0
        ]

        # speech turns embedding
        to_stack = [
            np.sum(emb.crop(speech_turn, mode='loose'), axis=0)
            for speech_turn in speech_turns
        ]
        if len(to_stack) < 1:
            return None
        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for speech_turn, label in zip(speech_turns, cluster_labels):
            hypothesis[speech_turn] = label
        return hypothesis
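
# l2_normalize is imported from elsewhere in the original code; a minimal
# stand-in consistent with how it is used above (row-wise unit-norm scaling),
# given here as an assumption rather than the actual implementation:
import numpy as np

def l2_normalize(fX):
    norm = np.sqrt(np.sum(fX ** 2, axis=1, keepdims=True))
    norm[norm == 0.] = 1.  # avoid division by zero for all-zero rows
    return fX / norm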
Example no. 10
    def annotate_speakers(self, filename, gui, visualization=True):
        test_file = {'uri': filename, 'audio': filename}

        sad_scores = self.sad(test_file)
        binarize = Binarize(offset=0.5, onset=0.70, log_scale=True)
        speech = binarize.apply(sad_scores, dimension=1)

        scd_scores = self.scd(test_file)
        peak = Peak(alpha=0.1, min_duration=1, log_scale=True)
        partition = peak.apply(scd_scores, dimension=1)

        speech_turns = partition.crop(speech)
        embeddings = self.emb(test_file)
        long_turns = Timeline(
            segments=[s for s in speech_turns if s.duration > 1.1])

        res = []
        for segment in long_turns:
            x = embeddings.crop(segment, mode='strict')
            if x.size == 0:
                continue
            n_sample = x.shape[0]
            x = np.mean(x, axis=0)
            if np.any(np.isnan(x)):
                continue
            res.append((segment, x, n_sample))

        if visualization:

            dist = []
            for i in range(len(res)):
                for j in range(i + 1, len(res)):
                    dist.append(l2_dist(res[i][1], res[j][1]))
            fig, ax = plt.subplots()
            ax.scatter(np.arange(len(dist)), np.array(sorted(dist)))
            fig.show()

            # let's visualize SAD and SCD results using pyannote.core visualization API

            # helper function to make visualization prettier
            plot_ready = lambda scores: SlidingWindowFeature(
                np.exp(scores.data[:, 1:]), scores.sliding_window)

            # create a figure with 6 rows with matplotlib
            nrows = 6
            fig, ax = plt.subplots(nrows=nrows, ncols=1)
            fig.set_figwidth(20)
            fig.set_figheight(nrows * 2)

            # 1st row: reference annotation
            # notebook.plot_annotation(test_file['annotation'], ax=ax[0])
            # ax[0].text(notebook.crop.start + 0.5, 0.1, 'reference', fontsize=14)

            # 2nd row: SAD raw scores
            notebook.plot_feature(plot_ready(sad_scores), ax=ax[1])
            ax[1].text(notebook.crop.start + 0.5,
                       0.6,
                       'SAD\nscores',
                       fontsize=14)
            ax[1].set_ylim(-0.1, 1.1)

            # 3rd row: SAD result
            notebook.plot_timeline(speech, ax=ax[2])
            ax[2].text(notebook.crop.start + 0.5, 0.1, 'SAD', fontsize=14)

            # 4th row: SCD raw scores
            notebook.plot_feature(plot_ready(scd_scores), ax=ax[3])
            ax[3].text(notebook.crop.start + 0.5,
                       0.3,
                       'SCD\nscores',
                       fontsize=14)
            ax[3].set_ylim(-0.1, 0.6)

            # 5th row: SCD result
            notebook.plot_timeline(partition, ax=ax[4])
            ax[4].text(notebook.crop.start + 0.5, 0.1, 'SCD', fontsize=14)

            # 6th row: combination of SAD and SCD
            notebook.plot_timeline(speech_turns, ax=ax[5])
            ax[5].text(notebook.crop.start + 0.5,
                       0.1,
                       'speech turns',
                       fontsize=14)

            fig.show()

        res, num_people = self.min_spanning_tree(res)
        gui.append_line('There are {} people in this audio'.format(num_people))

        return res
Example no. 11
def apply(protocol,
          train_dir,
          store_dir,
          threshold,
          subset='development',
          epoch=None,
          min_duration=1.0):

    # -- LOAD MODEL --
    nb_epoch = 0
    while True:
        weights_h5 = LoggingCallback.WEIGHTS_H5.format(log_dir=train_dir,
                                                       epoch=nb_epoch)
        if not os.path.isfile(weights_h5):
            break
        nb_epoch += 1
    config_dir = os.path.dirname(os.path.dirname(train_dir))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- SEQUENCE GENERATOR --
    duration = config['sequences']['duration']
    step = config['sequences']['step']

    def saveSeg(filepath, filename, segmentation):
        with open(filepath, 'w') as f:
            for idx, val in enumerate(segmentation):
                start = int(val[0] * 100)
                duration = int(val[1] * 100 - val[0] * 100)
                f.write('{0} {1} 1 {2} {3}\n'.format(filename, idx, start, duration))

    filepath = store_dir + '/' + str(threshold) + '/'
    mkdir_p(filepath)

    # -- CHOOSE MODEL --
    if epoch is None:
        epoch = nb_epoch - 1
    elif epoch > nb_epoch:
        raise ValueError('Epoch should be less than ' + str(nb_epoch))
    sequence_labeling = SequenceLabeling.from_disk(train_dir, epoch)
    aggregation = SequenceLabelingAggregation(sequence_labeling,
                                              feature_extraction,
                                              duration=duration,
                                              step=step)

    # -- PREDICTION --
    predictions = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        predictions[uri] = aggregation.apply(dev_file)

    # initialize peak detection algorithm
    peak = Peak(alpha=threshold, min_duration=min_duration)

    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        hypothesis = peak.apply(predictions[uri])
        filepath = store_dir + '/' + str(threshold) + '/' + uri + '.0.seg'
        saveSeg(filepath, uri, hypothesis)
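
# a hedged companion to saveSeg above (not part of the original): read the
# .seg lines back into (start, end) pairs in seconds, assuming the same
# '<uri> <index> 1 <start> <duration>' layout with centisecond units
def loadSeg(filepath):
    segments = []
    with open(filepath, 'r') as f:
        for line in f:
            _, _, _, start, duration = line.split()
            start = int(start) / 100.
            end = start + int(duration) / 100.
            segments.append((start, end))
    return segments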
Example no. 12
def evaluate(protocol,
             train_dir,
             store_dir,
             subset='development',
             epoch=None,
             min_duration=1.0):

    mkdir_p(store_dir)

    # -- LOAD MODEL --
    nb_epoch = 0
    while True:
        weights_h5 = LoggingCallback.WEIGHTS_H5.format(log_dir=train_dir,
                                                       epoch=nb_epoch)
        if not os.path.isfile(weights_h5):
            break
        nb_epoch += 1
    config_dir = os.path.dirname(os.path.dirname(train_dir))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- SEQUENCE GENERATOR --
    duration = config['sequences']['duration']
    step = config['sequences']['step']

    groundtruth = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        groundtruth[uri] = dev_file['annotation']

    # -- CHOOSE MODEL --
    if epoch is None:
        epoch = nb_epoch - 1
    elif epoch > nb_epoch:
        raise ValueError('Epoch should be less than ' + str(nb_epoch))

    sequence_labeling = SequenceLabeling.from_disk(train_dir, epoch)

    aggregation = SequenceLabelingAggregation(sequence_labeling,
                                              feature_extraction,
                                              duration=duration,
                                              step=step)

    # -- PREDICTION --
    predictions = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        predictions[uri] = aggregation.apply(dev_file)

    alphas = np.linspace(0, 1, 20)

    purity = [SegmentationPurity(parallel=False) for alpha in alphas]
    coverage = [SegmentationCoverage(parallel=False) for alpha in alphas]

    # -- SAVE RESULTS --
    for i, alpha in enumerate(alphas):
        # initialize peak detection algorithm
        peak = Peak(alpha=alpha, min_duration=min_duration)
        for uri, reference in groundtruth.items():
            # apply peak detection
            hypothesis = peak.apply(predictions[uri])
            # compute purity and coverage
            purity[i](reference, hypothesis)
            coverage[i](reference, hypothesis)

    TEMPLATE = '{alpha:g} {purity:.3f}% {coverage:.3f}%'
    with open(store_dir + '/res.txt', 'a') as fp:
        for i, a in enumerate(alphas):
            p = 100 * abs(purity[i])
            c = 100 * abs(coverage[i])
            print(TEMPLATE.format(alpha=a, purity=p, coverage=c))
            fp.write(TEMPLATE.format(alpha=a, purity=p, coverage=c) + '\n')
Example no. 13
class SpeakerDiarizationWeighted(object):
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 weight__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8,
                 cls__preference=-20,
                 cls__metric='cosine'):

        super(SpeakerDiarizationWeighted, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize weights
        self.weight_ = Precomputed(weight__pre)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)

    def __call__(self, current_file, annotated=False):

        # speech activity detection
        soft_sad = self.sad_(current_file)
        hard_sad = self.sad_binarize_.apply(soft_sad,
                                            dimension=self.sad__dimension)

        # speaker change detection
        soft_scd = self.scd_(current_file)
        hard_scd = self.scd_peak_.apply(soft_scd,
                                        dimension=self.scd__dimension)

        # speech turns
        speech_turns = hard_scd.crop(hard_sad)

        if annotated:
            speech_turns = speech_turns.crop(get_annotated(current_file))

        # remove small speech turns
        emb = self.emb_(current_file)
        speech_turns = [
            speech_turn for speech_turn in speech_turns
            if len(emb.crop(speech_turn, mode='loose')) > 0
        ]

        # weights
        weight = self.weight_(current_file)

        # speech turns embedding
        to_stack = [
            np.mean(emb.crop(speech_turn, mode='loose') *
                    (1 - weight.crop(speech_turn, mode='loose')),
                    axis=0) for speech_turn in speech_turns
        ]
        if len(to_stack) < 1:
            return None
        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for speech_turn, label in zip(speech_turns, cluster_labels):
            hypothesis[speech_turn] = label
        return hypothesis
Example no. 14
metric = SegmentationPurityCoverageFMeasure()

# peak detection
min_duration = 1.0
from pyannote.audio.signal import Peak
# alpha / min_duration are tunable parameters (and should be tuned for better performance)
# we use log_scale = True because of the final log-softmax in the StackedRNN model

alphas = np.linspace(0, 1, 20)

purity_list = []
coverage_list = []

for alpha in alphas:

    peak = Peak(alpha=alpha, min_duration=min_duration, log_scale=True)

    # evaluation metric (one fresh instance per threshold)
    metric = SegmentationPurityCoverageFMeasure()

    # loop on test files
    for test_file in protocol.test():
        # load reference annotation
        reference = test_file['annotation']
        uem = get_annotated(test_file)

        # load precomputed change scores as pyannote.core.SlidingWindowFeature
        scd_scores = precomputed(test_file)

        # detect peaks to obtain speaker-homogeneous segments as pyannote.core.Timeline
        hypothesis = peak.apply(scd_scores, dimension=1)

        # accumulate purity and coverage
        metric(reference, hypothesis.to_annotation(), uem=uem)

    # aggregate purity and coverage over all test files for this threshold
    # (completion assuming the metric is used as in Example no. 17)
    purity, coverage, _ = metric.compute_metrics()
    purity_list.append(purity)
    coverage_list.append(coverage)
Example no. 15
class NeuralSegmentation(Pipeline):
    def __init__(self, sad=None, scd=None, **kwargs):
        super().__init__()
        self.sad = Path(sad).expanduser().resolve(strict=True)
        self.scd = Path(scd).expanduser().resolve(strict=True)
        self.with_params(**kwargs)

    def get_tune_space(self):
        return {
            'sad_onset': chocolate.uniform(0., 1.),
            'sad_offset': chocolate.uniform(0., 1.),
            'scd_alpha': chocolate.uniform(0., 1.),
            'scd_min_duration': chocolate.uniform(0., 5.),
        }

    def get_tune_metric(self):
        raise NotImplementedError()

    def with_params(self,
                    sad_onset=0.7,
                    sad_offset=0.7,
                    scd_alpha=0.5,
                    scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self

    def apply(self, current_file):

        # Speech Activity Detection

        # get raw SAD scores
        soft_sad = self.sad_(current_file)

        # check once and for all whether SAD scores are log-scaled
        if not hasattr(self, 'sad_log_scale_'):
            if np.nanmean(soft_sad.data) < 0:
                self.sad_log_scale_ = True
            else:
                self.sad_log_scale_ = False

        # get SAD probability
        prob_sad = np.exp(soft_sad.data) if self.sad_log_scale_ \
                   else soft_sad.data

        # support both non-speech/speech & non-speech/single/overlap
        prob_sad = 1. - prob_sad[:, 0]
        prob_sad = SlidingWindowFeature(prob_sad, soft_sad.sliding_window)

        # binarization
        hard_sad = self.sad_binarize_.apply(prob_sad)

        # Speaker Change Detection

        # get raw SCD scores
        soft_scd = self.scd_(current_file)

        # check once and for all whether SCD scores are log-scaled
        if not hasattr(self, 'scd_log_scale_'):
            if np.nanmean(soft_scd.data) < 0:
                self.scd_log_scale_ = True
            else:
                self.scd_log_scale_ = False

        # get SCD probability
        prob_scd = np.exp(soft_scd.data) if self.scd_log_scale_ \
                   else soft_scd.data

        # take the final dimension
        # (in order to support both classification and regression scores)
        prob_scd = prob_scd[:, -1]
        prob_scd = SlidingWindowFeature(prob_scd, soft_scd.sliding_window)

        # peak detection
        hard_scd = self.scd_peak_.apply(prob_scd)

        speech_turns = hard_scd.crop(hard_sad)

        # only process the annotated part
        speech_turns = speech_turns.crop(get_annotated(current_file))

        return speech_turns
Example no. 16
binarize = Binarize(offset=0.94, onset=0.70, log_scale=True)
speech = binarize.apply(sad_scores, dimension=1)
#
# iterate over speech segments (as `pyannote.core.Segment` instances)
for segment in speech:
    print(segment.start, segment.end)

# obtain raw SCD scores (as `pyannote.core.SlidingWindowFeature` instance)
scd_scores = scd(test_file)

# detect peaks and return speaker homogeneous segments
# (as `pyannote.core.Annotation` instance)
# NOTE: both alpha/min_duration values were tuned on AMI dataset.
# you might need to use different values for better results.
from pyannote.audio.signal import Peak
peak = Peak(alpha=0.08, min_duration=0.40, log_scale=True)
partition = peak.apply(scd_scores, dimension=1)
for segment in partition:
    print(segment.start, segment.end)
#
speech_turns = partition.crop(speech)
#
#
# # let's visualize SAD and SCD results using pyannote.core visualization API
# from matplotlib import pyplot as plt
# from pyannote.core import Segment, notebook
#
# # only plot one minute (between t=120s and t=180s)
# notebook.crop = Segment(120, 180)
#
# # helper function to make visualization prettier
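
# the snippet above assumes `sad` and `scd` already exist; one possible way to
# obtain them (an assumption, not necessarily how the original tutorial does it)
# is to load directories of precomputed scores, as in the other examples:
from pyannote.audio.features import Precomputed

sad = Precomputed('/path/to/precomputed/sad')   # placeholder paths
scd = Precomputed('/path/to/precomputed/scd')
test_file = {'uri': 'filename', 'audio': '/path/to/filename.wav'}
sad_scores = sad(test_file)
scd_scores = scd(test_file)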
Example no. 17
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):

        target_purity = self.purity

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=.25 * duration, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        # extract predictions for all files.
        predictions = {}
        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            predictions[uri] = sequence_labeling.apply(current_file)

        # dichotomic search to find alpha that maximizes coverage
        # while having at least `target_purity`

        lower_alpha = 0.
        upper_alpha = 1.
        best_alpha = .5 * (lower_alpha + upper_alpha)
        best_coverage = 0.

        for _ in range(10):
            current_alpha = .5 * (lower_alpha + upper_alpha)
            peak = Peak(alpha=current_alpha, min_duration=0.0,
                        log_scale=model.logsoftmax)
            metric = DiarizationPurityCoverageFMeasure()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            for current_file in getattr(protocol, subset)():
                reference = current_file['annotation']
                uri = get_unique_identifier(current_file)
                hypothesis = peak.apply(predictions[uri], dimension=1)
                hypothesis = hypothesis.to_annotation()
                uem = get_annotated(current_file)
                metric(reference, hypothesis, uem=uem)

            purity, coverage, _ = metric.compute_metrics()

            if purity < target_purity:
                upper_alpha = current_alpha
            else:
                lower_alpha = current_alpha
                if coverage > best_coverage:
                    best_coverage = coverage
                    best_alpha = current_alpha

        task = 'speaker_change_detection'
        metric_name = f'{task}/coverage@{target_purity:.2f}purity'
        return {
            metric_name: {'minimize': False, 'value': best_coverage},
            f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
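
# a hedged distillation of the search above (not part of the original class):
# bisection over alpha that maximizes coverage subject to a purity constraint,
# for any `evaluate(alpha)` callable returning a (purity, coverage) pair
def tune_alpha(evaluate, target_purity, n_trials=10):
    lower_alpha, upper_alpha = 0., 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_coverage = 0.

    for _ in range(n_trials):
        current_alpha = .5 * (lower_alpha + upper_alpha)
        purity, coverage = evaluate(current_alpha)
        if purity < target_purity:
            # not pure enough: restrict the search to lower thresholds
            upper_alpha = current_alpha
        else:
            lower_alpha = current_alpha
            if coverage > best_coverage:
                best_coverage = coverage
                best_alpha = current_alpha

    return best_alpha, best_coverage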