Example #1
    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()
        self.precomputed = precomputed

        self.precomputed_ = Precomputed(self.precomputed)
        self.has_overlap_ = self.precomputed_.dimension() == 3

        self.with_params(**kwargs)
Example #2
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8,
                 cls__preference=-20,
                 cls__metric='cosine'):

        super(SpeakerDiarizationPreStages, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)
Example #3
    def __init__(self,
                 feature_extraction,
                 sad__pre,
                 scd__pre,
                 emb__pre,
                 sad__onset=0.7,
                 sad__offset=0.7,
                 sad__dimension=1,
                 scd__alpha=0.5,
                 scd__min_duration=1.,
                 scd__dimension=1,
                 emb__internal=False,
                 cls__method='average',
                 cls__threshold=5,
                 cls__metric='cosine'):

        super(SpeakerDiarizationHACPre, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                             method=self.cls__method,
                                             threshold=self.cls__threshold)
Example #4
    def __init__(self, split='', n_support=0, n_query=0, n_way=0, if_cuda=False):
        self.precomputed = Precomputed(VCTK_FEATURE_DIR)
        self.split = split
        self.n_support = n_support
        self.n_query = n_query
        self.n_way = n_way
        # dataset is loaded lazily when first needed (see `load_dataset`)
        self.dataset = None
        self.transforms = None
        self.if_cuda = if_cuda
Example #5
def update_distances(args):
    """Loads the user annotation from a JSON path and converts it to a pyannote
    `Annotation` using the regions' timings.

    From the annotation uri and the precomputed embeddings, it computes the
    in-cluster distances between speech turns.

    Dumps the updated JSON (with the correct distances) to a timestamped file.
    """
    json_path = Path(args['<json_path>'])
    uri = args['<uri>']
    with open(json_path, 'r') as file:
        gecko_json = json.load(file)
    hypothesis, _, _, _ = gecko_JSON_to_Annotation(gecko_json, uri, 'speaker')

    colors = get_colors(uri)

    precomputed = Precomputed(embeddings)
    protocol = args['<database.task.protocol>']
    protocol = get_protocol(protocol)
    for reference in getattr(protocol, 'test')():
        if reference['uri'] == uri:
            features = precomputed(reference)
            break
    distances_per_speaker = get_distances_per_speaker(features, hypothesis)
    gecko_json = annotation_to_GeckoJSON(hypothesis, distances_per_speaker,
                                         colors)
    name = f"{json_path.stem}.{TIMESTAMP}.json"
    updated_path = Path(json_path.parent, name)
    with open(updated_path, 'w') as file:
        json.dump(gecko_json, file)
    print(f"successfully dumped {updated_path}")
Example #6
    def __init__(self,
                 feature_extraction,
                 emb__pre,
                 emb__internal=False,
                 cls__damping=0.8,
                 cls__preference=-20,
                 cls__metric='cosine'):

        super(SpeakerDiarizationOracleSegAP, self).__init__()

        self.feature_extraction = feature_extraction

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)
Example #7
    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()
        self.precomputed = precomputed

        self.precomputed_ = Precomputed(self.precomputed)
        self.has_overlap_ = self.precomputed_.dimension() == 3

        self.with_params(**kwargs)
Example #8
def extract(protocol_name, file_finder, experiment_dir, robust=False, parallel=False):

    protocol = get_protocol(protocol_name)

    # load configuration file
    config_yml = experiment_dir + "/config.yml"
    with open(config_yml, "r") as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    FeatureExtraction = get_class_by_name(
        config["feature_extraction"]["name"],
        default_module_name="pyannote.audio.features",
    )
    feature_extraction = FeatureExtraction(
        **config["feature_extraction"].get("params", {})
    )

    sliding_window = feature_extraction.sliding_window
    dimension = feature_extraction.dimension

    # create metadata file at root that contains
    # sliding window and dimension information

    precomputed = Precomputed(
        root_dir=experiment_dir, sliding_window=sliding_window, dimension=dimension
    )

    if parallel:

        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            config_yml=config_yml,
            robust=robust,
        )

        n_jobs = cpu_count()
        pool = Pool(n_jobs)
        imap = pool.imap

    else:

        feature_extraction = init_feature_extraction(experiment_dir)
        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            feature_extraction=feature_extraction,
            robust=robust,
        )
        imap = map

    for result in imap(extract_one, protocol.files()):
        if result is None:
            continue
        print(result)
Example #9
def get_file(protocol, uri, embeddings=None):
    for reference in protocol.files():
        if reference['uri'] == uri:
            if embeddings:
                precomputed = Precomputed(embeddings)
                features = precomputed(reference)
                return reference, features
            return reference
    raise ValueError(f'{uri} is not in {protocol}')
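A hedged usage sketch for `get_file`; the protocol name, uri, and embeddings directory below are placeholders, not taken from the original source:

from pyannote.database import get_protocol

protocol = get_protocol('AMI.SpeakerDiarization.MixHeadset')
# returns the protocol file dict and, since embeddings are given, its precomputed features
reference, features = get_file(protocol, 'ES2004a.Mix-Headset',
                               embeddings='/path/to/precomputed/embeddings')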
Example #10
    def with_params(self,
                    sad_onset=0.7,
                    sad_offset=0.7,
                    scd_alpha=0.5,
                    scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self
Example #11
def main():
    usage = "%prog [options] database, raw_score_path"
    desc = "Write the output of the binary overlap detector into text based on a threshold"
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    parser.add_option("-t", "--onset", action="store", type="float", help="Onset Threshold", default=0.70)
    parser.add_option("-f", "--offset", action="store", type="float", help="Offset Threshold", default=0.70)
    parser.add_option("-d", "--dev", action="store_true", help="Print output based on development set", default=False)
    parser.add_option("-o", "--outputfile", action="store", type="string", help="Output file", default="./overlap.txt")
    (opt, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Incorrect number of arguments")
    database, raw_score_path = args

    # get test file of protocol
    protocol = get_protocol(database)

    # load precomputed overlap scores as pyannote.core.SlidingWindowFeature
    precomputed = Precomputed(raw_score_path)
    # initialize binarizer
    # onset / offset are tunable parameters (and should be tuned for better
    # performance). we use log_scale=True because of the final log-softmax in
    # the StackedRNN model
    binarize = Binarize(onset=opt.onset, offset=opt.offset, log_scale=True)

    fw = open(opt.outputfile, 'wt')

    if opt.dev:
        for test_file in protocol.development():
            ovl_scores = precomputed(test_file)


            # binarize overlap scores to obtain overlap regions as pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']


            # write the output into text
            write_txt(fw, ovl_regions)
 
    else:
        for test_file in protocol.test():
            ovl_scores = precomputed(test_file)


            # binarize overlap scores to obtain overlap regions as pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']


            # write the output into text
            write_txt(fw, ovl_regions)
    fw.close()
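The script relies on a `write_txt` helper that is not shown above; a minimal sketch of what it might look like, assuming the expected output format is one `uri start end` line per overlap region:

def write_txt(fw, timeline):
    # hypothetical helper: dump each detected overlap region as "uri start end"
    for segment in timeline:
        fw.write(f"{timeline.uri} {segment.start:.3f} {segment.end:.3f}\n")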
Example #12
def helper_extract(current_file, file_finder=None, experiment_dir=None,
                   config_yml=None, feature_extraction=None,
                   robust=False):

    if feature_extraction is None:
        feature_extraction = init_feature_extraction(experiment_dir)

    precomputed = Precomputed(root_dir=experiment_dir)
    return process_current_file(current_file, file_finder=file_finder,
                                precomputed=precomputed,
                                feature_extraction=feature_extraction,
                                robust=robust)
Example #13
    def with_params(self,
                    sad_onset=0.7,
                    sad_offset=0.7,
                    scd_alpha=0.5,
                    scd_min_duration=1.,
                    cls_preference=-7.0,
                    cls_damping=0.8):

        # initialize speech activity detection and speaker change detection
        super().with_params(sad_onset=sad_onset,
                            sad_offset=sad_offset,
                            scd_alpha=scd_alpha,
                            scd_min_duration=scd_min_duration)

        # initialize speech turn embedding
        self.emb_ = Precomputed(self.emb)

        # initialize clustering module
        self.cls_damping = cls_damping

        self.cls_preference = cls_preference
        # NOTE cls_preference could be a multiplicative factor of a default
        # affinity value (e.g. median affinity value)
        self.cls_ = sklearn.cluster.AffinityPropagation(
            damping=cls_damping,
            preference=cls_preference,
            affinity='precomputed',
            max_iter=200,
            convergence_iter=15)

        # sklearn documentation: Preferences for each point - points with
        # larger values of preferences are more likely to be chosen as
        # exemplars. The number of exemplars, ie of clusters, is influenced by
        # the input preferences value. If the preferences are not passed as
        # arguments, they will be set to the median of the input similarities.

        # NOTE one could set the preference value of each speech turn
        # according to their duration. longer speech turns are expected to
        # have more accurate embeddings, therefore should be preferred as
        # exemplars

        return self
Example #14
    def __init__(self,
                 emb__pre,
                 cls__method='average',
                 cls__threshold=5,
                 cls__metric='cosine'):

        super(SpeakerDiarizationOnSceneHAC, self).__init__()

        # clustering hyper-parameters
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                             method=self.cls__method,
                                             threshold=self.cls__threshold)
Example #15
    def with_params(self,
                    sad_onset=0.7,
                    sad_offset=0.7,
                    scd_alpha=0.5,
                    scd_min_duration=1.,
                    cls_threshold=0.8):

        # initialize speech activity detection and speaker change detection
        super().with_params(sad_onset=sad_onset,
                            sad_offset=sad_offset,
                            scd_alpha=scd_alpha,
                            scd_min_duration=scd_min_duration)

        # initialize speech turn embedding
        self.emb_ = Precomputed(self.emb)

        # initialize clustering module
        self.cls_threshold = cls_threshold

        self.cls_ = HierarchicalPoolingClustering(metric=self.metric)

        return self
Example #16
def check(protocol_name, file_finder, experiment_dir):

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
Example #17
def xp_objective(args, **kwargs):
    import sys
    sys.path.append("/people/yin/projects/")
    from pyannote.database import get_protocol, get_annotated, FileFinder
    protocol = get_protocol('Etape.SpeakerDiarization.TV',
                            preprocessors={'audio': FileFinder()})

    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    metric = GreedyDiarizationErrorRate()

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed

    feature_extraction = Precomputed(
        '/vol/work1/bredin/feature_extraction/mfcc')
    sad_pre = '/vol/work1/yin/speech_activity_detection/shallow/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.TV.development/apply'
    scd_pre = '/vol/work1/yin/speaker_change_detection/paper/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.Debug.development/apply'
    emb_pre = '/vol/work1/yin/embedding/20180124'

    args['cls__damping'] = float(args['cls__damping'])
    args['cls__preference'] = float(args['cls__preference'])

    pipeline = speaker_diarization.SpeakerDiarizationPre(
        feature_extraction, sad_pre, scd_pre, emb_pre, **args)
    try:
        for current_file in protocol.train():
            hypothesis = pipeline(current_file, annotated=True)
            if hypothesis is None:
                return 100
            reference = current_file['annotation']
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)
    except MemoryError as error:
        return 100

    return abs(metric)
Example #18
from pyannote.core import Segment
from pyannote.audio.features import Precomputed, utils
import pandas as pd
import numpy as np
from glob import glob

# REPERE evaluation protocol
# cf. https://github.com/pyannote/pyannote-database#pyannote-database
from pyannote.database import get_protocol
#different from the files I use
protocol = get_protocol('REPERE.SpeakerDiarization.Plumcot')
precomputed = Precomputed('/vol/work1/dyab/training_set/mfcc')

train_dir = "/vol/work1/dyab/training_set/residual_local/"
output_dir_train = "/vol/work1/dyab/training_set/numpy_arrays_local_audio/"
y_labels_dir_train = "/vol/work1/dyab/training_set/numpy_arrays_local_landmarks/"

dev_dir = "/vol/work1/dyab/development_set/residual_cluster_old/"
output_dir_dev = "/vol/work1/dyab/development_set/numpy_arrays_cluster_old_audio/"
y_labels_dir_dev = "/vol/work1/dyab/development_set/numpy_arrays_cluster_old_landmarks/"

test_dir = "/vol/work1/dyab/test_set/residual/"
output_dir_test = "/vol/work1/dyab/test_set/numpy_arrays_audio/"
y_labels_dir_test = "/vol/work1/dyab/test_set/numpy_arrays_landmarks/"


def generate_audio_features(dir, output_dir, y_labels_dir):

    # iterate over all files of the protocol's test set
    for current_file in protocol.test():
        print(current_file['uri'])
Example #19
import numpy as np
import matplotlib.pyplot as plt
# AMI protocol
from pyannote.database import get_protocol
protocol = get_protocol('Test.SpeakerDiarization.MixHeadset')

from pyannote.database import get_annotated

# precomputed scores
from pyannote.audio.features import Precomputed
precomputed = Precomputed('./precomputed/scd')

from pyannote.metrics.diarization import DiarizationPurityCoverageFMeasure
metric = DiarizationPurityCoverageFMeasure()

from pyannote.metrics.segmentation import SegmentationPurityCoverageFMeasure
metric = SegmentationPurityCoverageFMeasure()

# peak detection
min_duration = 1.0
from pyannote.audio.signal import Peak
# alpha / min_duration are tunable parameters (and should be tuned for better performance)
# we use log_scale = True because of the final log-softmax in the StackedRNN model

alphas = np.linspace(0, 1, 20)

purity_list = []
coverage_list = []

for alpha in alphas:
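    # NOTE the snippet is truncated here. What follows is a hedged sketch of a
    # possible loop body, not the original continuation: purity and coverage
    # are accumulated with the dedicated segmentation metrics, and protocol
    # files are assumed to carry an 'annotation' reference.
    from pyannote.metrics.segmentation import SegmentationPurity, SegmentationCoverage
    purity, coverage = SegmentationPurity(), SegmentationCoverage()
    peak = Peak(alpha=alpha, min_duration=min_duration, log_scale=True)
    for test_file in protocol.test():
        scd_scores = precomputed(test_file)
        hypothesis = peak.apply(scd_scores, dimension=1).to_annotation()
        reference = test_file['annotation']
        purity(reference, hypothesis)
        coverage(reference, hypothesis)
    purity_list.append(abs(purity))
    coverage_list.append(abs(coverage))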
Example #20
def apply_pretrained(validate_dir: Path,
                     protocol_name: str,
                     subset: Optional[str] = "test",
                     duration: Optional[float] = None,
                     step: float = 0.25,
                     device: Optional[torch.device] = None,
                     batch_size: int = 32,
                     pretrained: Optional[str] = None,
                     Pipeline: type = None,
                     **kwargs):
    """Apply pre-trained model

    Parameters
    ----------
    validate_dir : Path
    protocol_name : `str`
    subset : 'train' | 'development' | 'test', optional
        Defaults to 'test'.
    duration : `float`, optional
    step : `float`, optional
    device : `torch.device`, optional
    batch_size : `int`, optional
    pretrained : `str`, optional
    Pipeline : `type`
    """

    if pretrained is None:
        pretrained = Pretrained(validate_dir=validate_dir,
                                duration=duration,
                                step=step,
                                batch_size=batch_size,
                                device=device)
        output_dir = validate_dir / 'apply' / f'{pretrained.epoch_:04d}'
    else:

        if pretrained in torch.hub.list('pyannote/pyannote-audio'):
            output_dir = validate_dir / pretrained
        else:
            output_dir = validate_dir

        pretrained = Wrapper(pretrained,
                             duration=duration,
                             step=step,
                             batch_size=batch_size,
                             device=device)

    params = {}
    try:
        params['classes'] = pretrained.classes
    except AttributeError as e:
        pass
    try:
        params['dimension'] = pretrained.dimension
    except AttributeError as e:
        pass

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=pretrained.sliding_window,
                              **params)

    # file generator
    protocol = get_protocol(protocol_name,
                            progress=True,
                            preprocessors=pretrained.preprocessors_)

    for current_file in getattr(protocol, subset)():
        fX = pretrained(current_file)
        precomputed.dump(current_file, fX)

    # do not proceed with the full pipeline
    # when there is no such thing for current task
    if Pipeline is None:
        return

    # do not proceed with the full pipeline when its parameters cannot be loaded.
    # this might happen when applying a model that has not been validated yet
    try:
        pipeline_params = pretrained.pipeline_params_
    except AttributeError as e:
        return

    # instantiate pipeline
    pipeline = Pipeline(scores=output_dir)
    pipeline.instantiate(pipeline_params)

    # load pipeline metric (when available)
    try:
        metric = pipeline.get_metric()
    except NotImplementedError as e:
        metric = None

    # apply pipeline and dump output to RTTM files
    output_rttm = output_dir / f'{protocol_name}.{subset}.rttm'
    with open(output_rttm, 'w') as fp:
        for current_file in getattr(protocol, subset)():
            hypothesis = pipeline(current_file)
            pipeline.write_rttm(fp, hypothesis)

            # disable evaluation when no reference annotation is available
            if 'annotation' not in current_file:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            reference = current_file['annotation']
            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

    # print pipeline metric (when available)
    if metric is None:
        return

    output_eval = output_dir / f'{protocol_name}.{subset}.eval'
    with open(output_eval, 'w') as fp:
        fp.write(str(metric))
Example #21
import os
import pickle
import random
from itertools import cycle, islice

import numpy as np
import torch

from pyannote.audio.features import Precomputed

dirname = os.path.dirname(os.path.realpath(__file__))
#VCTK_DATA_DIR = os.path.join(dirname, '../../data/vctk')
VCTK_DATA_DIR = '/w/148/spoclab/data3/jixuan/SpeakerEmbedding/few_shot_learning/data/vctk'
VCTK_AUDIO_DIR = '/p/spoclab/data3/jixuan/VCTK-Corpus/wav48'
VCTK_FEATURE_DIR = '/p/spoclab/data3/jixuan/VCTK-Corpus/playground/feature-extraction'
#VCTK_AUDIO_DIR = '/h/jixuan/Documents/data/VCTK-Corpus/wav48'
#VCTK_FEATURE_DIR = '/h/jixuan/Documents/data/VCTK-Corpus/playground/feature-extraction'
OMNIGLOT_CACHE = {}
DATASET_CACHE = {}

precomputed = Precomputed(VCTK_FEATURE_DIR)


def get_feature(cfile, seg):
    return precomputed.crop(cfile, seg, mode='center', fixed=2.0)


def convert_tensor(key, d):
    d[key] = torch.from_numpy(np.array(d[key], np.float32, copy=False))
    return d


def convert_cuda(key, d):
    if hasattr(d[key], 'cuda'):
        d[key] = d[key].cuda()
    return d
Example #22
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    precomputed : str
        Path to precomputed SAD scores.
    """
    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()
        self.precomputed = precomputed

        self.precomputed_ = Precomputed(self.precomputed)
        self.has_overlap_ = self.precomputed_.dimension() == 3

        self.with_params(**kwargs)

    def get_tune_space(self):

        space = {
            'speech_onset': chocolate.uniform(0., 1.),
            'speech_offset': chocolate.uniform(0., 1.),
            'speech_min_duration_on': chocolate.uniform(0., 2.),
            'speech_min_duration_off': chocolate.uniform(0., 2.),
            'speech_pad_onset': chocolate.uniform(-1., 1.),
            'speech_pad_offset': chocolate.uniform(-1., 1.)
        }

        if self.has_overlap_:
            space.update({
                'overlap_onset': chocolate.uniform(0., 1.),
                'overlap_offset': chocolate.uniform(0., 1.),
                'overlap_min_duration_on': chocolate.uniform(0., 2.),
                'overlap_min_duration_off': chocolate.uniform(0., 2.),
                'overlap_pad_onset': chocolate.uniform(-1., 1.),
                'overlap_pad_offset': chocolate.uniform(-1., 1.)
            })

        return space

    def get_tune_metric(self):
        return DetectionErrorRate()

    def with_params(self, **params):

        # initialize speech/non-speech binarizer
        speech_params = {
            '_'.join(param.split('_')[1:]): value
            for param, value in params.items() if param.startswith('speech_')
        }
        self.speech_binarize_ = Binarize(**speech_params)

        # initialize overlap binarizer
        if self.has_overlap_:
            overlap_params = {
                '_'.join(param.split('_')[1:]): value
                for param, value in params.items()
                if param.startswith('overlap_')
            }
            self.overlap_binarize_ = Binarize(**overlap_params)

        return self

    def apply(self, current_file):

        # extract precomputed scores
        precomputed = self.precomputed_(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(precomputed.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(precomputed.data) if self.log_scale_ \
               else precomputed.data

        # speech vs. non-speech
        speech_prob = SlidingWindowFeature(1. - data[:, 0],
                                           precomputed.sliding_window)
        speech = self.speech_binarize_.apply(speech_prob)

        if self.has_overlap_:

            # overlap vs. non-overlap
            overlap_prob = SlidingWindowFeature(data[:, 2],
                                                precomputed.sliding_window)
            overlap = self.overlap_binarize_.apply(overlap_prob)

            # overlap speech can only happen in speech regions
            overlap = overlap.crop(speech)
        else:
            # empty timeline
            overlap = Timeline()

        speech = speech.to_annotation(generator='string')
        overlap = overlap.to_annotation(generator='int')
        hypothesis = speech.update(overlap)

        return hypothesis
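A hedged usage sketch for this pipeline; the precomputed-scores directory and protocol name below are placeholders:

from pyannote.database import get_protocol

# '/path/to/precomputed/sad' stands for a directory of precomputed SAD scores
pipeline = SpeechActivityDetection(precomputed='/path/to/precomputed/sad',
                                   speech_onset=0.7, speech_offset=0.7)

protocol = get_protocol('AMI.SpeakerDiarization.MixHeadset')
for test_file in protocol.test():
    speech = pipeline.apply(test_file)  # pyannote.core.Annotation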
Example #23
    def apply(self, protocol_name, output_dir, step=None, internal=False):

        # load best performing model
        with open(self.validate_txt_, 'r') as fp:
            eers = SortedDict(np.loadtxt(fp))
        best_epoch = int(eers.iloc[np.argmin(eers.values())])
        embedding = SequenceEmbeddingAutograd.load(self.train_dir_, best_epoch)

        # guess sequence duration from path (.../3.2+0.8/...)
        directory = basename(dirname(self.experiment_dir))
        duration, _, _, _ = self._directory_to_params(directory)
        if step is None:
            step = 0.5 * duration

        # initialize embedding extraction
        batch_size = self.approach_.batch_size
        extraction = Extraction(embedding,
                                self.feature_extraction_,
                                duration,
                                step=step,
                                batch_size=batch_size,
                                internal=internal)
        sliding_window = extraction.sliding_window
        dimension = extraction.dimension

        # create metadata file at root that contains
        # sliding window and dimension information
        path = Precomputed.get_config_path(output_dir)
        mkdir_p(dirname(path))
        f = h5py.File(path)
        f.attrs['start'] = sliding_window.start
        f.attrs['duration'] = sliding_window.duration
        f.attrs['step'] = sliding_window.step
        f.attrs['dimension'] = dimension
        f.close()

        # file generator
        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)

        for subset in ['development', 'test', 'train']:

            try:
                file_generator = getattr(protocol, subset)()
                first_item = next(file_generator)
            except NotImplementedError as e:
                continue

            file_generator = getattr(protocol, subset)()

            for current_file in file_generator:

                fX = extraction.apply(current_file)

                path = Precomputed.get_path(output_dir, current_file)
                mkdir_p(dirname(path))

                f = h5py.File(path)
                f.attrs['start'] = sliding_window.start
                f.attrs['duration'] = sliding_window.duration
                f.attrs['step'] = sliding_window.step
                f.attrs['dimension'] = dimension
                f.create_dataset('features', data=fX.data)
                f.close()
Example #24
    def __init__(self, wrappable: Wrappable, **params):
        super().__init__()

        from pyannote.audio.features import Pretrained
        from pyannote.audio.features import Precomputed
        from pyannote.audio.features import FeatureExtraction
        from pyannote.audio.features import RawAudio

        scorer = None
        msg = ""

        # corner case: a {wrappable: params} dict carries extra parameters
        if isinstance(wrappable, dict):
            wrappable, custom_params = dict(wrappable).popitem()
            params.update(**custom_params)

        # If `wrappable` already complies with the `FeatureExtraction` API, it
        # is kept unchanged. This includes instances of any `FeatureExtraction`
        # subclass, `RawAudio` instances, `Precomputed` instances, and
        # `Pretrained` instances.
        if isinstance(wrappable,
                      (FeatureExtraction, RawAudio, Pretrained, Precomputed)):
            scorer = wrappable

        elif Path(wrappable).is_dir():
            directory = Path(wrappable)

            # If `wrappable` is a `Path` to a directory containing precomputed
            # features or scores, wrap the corresponding `Precomputed` instance
            try:
                scorer = Precomputed(root_dir=directory)
            except Exception as e:
                scorer = None

            # If `wrappable` is a `Path` to a validation directory,
            # wrap the corresponding `Pretrained` instance
            if scorer is None:
                try:
                    scorer = Pretrained(validate_dir=directory, **params)
                except Exception as e:
                    scorer = None

            if scorer is None:
                msg = (f'"{wrappable}" directory does not seem to be the path '
                       f"to precomputed features nor the path to a model "
                       f"validation step.")

        # If `wrappable` is a `Path` to a pretrained model checkpoint,
        # wrap the corresponding `Pretrained` instance
        elif Path(wrappable).is_file():
            checkpoint = Path(wrappable)

            try:
                validate_dir = checkpoint.parents[1] / "validate" / "fake"
                epoch = int(checkpoint.stem)
                scorer = Pretrained(validate_dir=validate_dir,
                                    epoch=epoch,
                                    **params)
            except Exception as e:
                msg = (f'"{wrappable}" file does not seem to be the path '
                       f"to a pretrained model checkpoint.")
                scorer = None

        elif isinstance(wrappable, Text):

            # If `wrappable` is a `Text` starting with '@' such as '@key',
            # it means that one should read the "key" key of protocol files
            if wrappable.startswith("@"):
                key = wrappable[1:]

                scorer = partial(_use_existing_key, key)
                # scorer = lambda current_file: current_file[key]

            # If `wrappable` is a `Text` containing the name of an existing
            # `torch.hub` model, wrap the corresponding `Pretrained`.
            else:
                try:
                    import torch

                    scorer = torch.hub.load("pyannote/pyannote-audio",
                                            wrappable, **params)
                    if not isinstance(scorer, Pretrained):
                        msg = (
                            f'"{wrappable}" exists on torch.hub but does not '
                            f"return a `Pretrained` model instance.")
                        scorer = None

                except Exception as e:
                    msg = (f"Could not load {wrappable} model from torch.hub. "
                           f"The following exception was raised:\n{e}")
                    scorer = None

        # raise an informative error when something went wrong
        if scorer is None:
            raise ValueError(msg)

        self.scorer_ = scorer
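Assuming the snippet above is pyannote.audio's `Wrapper` class, a hedged sketch of the kinds of `wrappable` values the branches accept; the paths and model name are placeholders:

from pyannote.audio.features.wrapper import Wrapper

scorer = Wrapper('/path/to/precomputed/scores')  # directory of precomputed features/scores
scorer = Wrapper('@sad_scores')                  # read the "sad_scores" key of protocol files
scorer = Wrapper('sad_ami')                      # torch.hub entry point (placeholder name)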
Example #25
    res['scores'] = pscores
    return res


if __name__ == '__main__':
    arguments = docopt(__doc__, version='Speaker-spotting')
    # protocol
    protocol_name = arguments['<database.task.protocol>']
    embedding_path = arguments['<embedding_path>']
    protocol = get_protocol(protocol_name, progress=True)

    # subset (train, development, or test)
    subset = arguments['--subset']
    output_file = arguments['<output_file>']
    from pyannote.audio.features import Precomputed
    precomputed = Precomputed(embedding_path)

    models = {}
    enrolments = getattr(protocol,
                         '{subset}_enrolment'.format(subset=subset))()
    for current_enrolment in enrolments:
        model_id = current_enrolment.pop('model_id')
        models[model_id] = speaker_spotting_enrol(current_enrolment)
    if arguments['oracle']:
        REFERENCE = {}
        for current_file in getattr(protocol, subset)():
            uri = current_file['uri']
            if uri not in REFERENCE:
                REFERENCE[uri] = Annotation(uri=uri)
            REFERENCE[uri].update(current_file['annotation'])
Example #26
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    precomputed : str
        Path to precomputed SAD scores.
    """

    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()
        self.precomputed = precomputed

        self.precomputed_ = Precomputed(self.precomputed)
        self.has_overlap_ = self.precomputed_.dimension() == 3

        self.with_params(**kwargs)

    def get_tune_space(self):

        space = {
            'speech_onset': chocolate.uniform(0., 1.),
            'speech_offset': chocolate.uniform(0., 1.),
            'speech_min_duration_on': chocolate.uniform(0., 2.),
            'speech_min_duration_off': chocolate.uniform(0., 2.),
            'speech_pad_onset': chocolate.uniform(-1., 1.),
            'speech_pad_offset': chocolate.uniform(-1., 1.)
        }

        if self.has_overlap_:
            space.update({
                'overlap_onset': chocolate.uniform(0., 1.),
                'overlap_offset': chocolate.uniform(0., 1.),
                'overlap_min_duration_on': chocolate.uniform(0., 2.),
                'overlap_min_duration_off': chocolate.uniform(0., 2.),
                'overlap_pad_onset': chocolate.uniform(-1., 1.),
                'overlap_pad_offset': chocolate.uniform(-1., 1.)
            })

        return space

    def get_tune_metric(self):
        return DetectionErrorRate()

    def with_params(self, **params):

        # initialize speech/non-speech binarizer
        speech_params = {
            '_'.join(param.split('_')[1:]): value
            for param, value in params.items()
            if param.startswith('speech_')}
        self.speech_binarize_ = Binarize(**speech_params)

        # initialize overlap binarizer
        if self.has_overlap_:
            overlap_params = {
                '_'.join(param.split('_')[1:]): value
                for param, value in params.items()
                if param.startswith('overlap_')}
            self.overlap_binarize_ = Binarize(**overlap_params)

        return self

    def apply(self, current_file):

        # extract precomputed scores
        precomputed = self.precomputed_(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(precomputed.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(precomputed.data) if self.log_scale_ \
               else precomputed.data

        # speech vs. non-speech
        speech_prob = SlidingWindowFeature(
            1. - data[:, 0],
            precomputed.sliding_window)
        speech = self.speech_binarize_.apply(speech_prob)

        if self.has_overlap_:

            # overlap vs. non-overlap
            overlap_prob = SlidingWindowFeature(
                data[:, 2], precomputed.sliding_window)
            overlap = self.overlap_binarize_.apply(overlap_prob)

            # overlap speech can only happen in speech regions
            overlap = overlap.crop(speech)
        else:
            # empty timeline
            overlap = Timeline()

        speech = speech.to_annotation(generator='string')
        overlap = overlap.to_annotation(generator='int')
        hypothesis = speech.update(overlap)

        return hypothesis
Example #27
# coding: utf-8
import sys
sys.path.append("../")
import clustering
import numpy as np

from pyannote.audio.features import Precomputed
precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings')


from pyannote.database import get_protocol, FileFinder
protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True)

from pyannote.core import Annotation, Segment, Timeline

# enrolment consists in summing all relevant embeddings
def speaker_spotting_enrol(current_enrolment):
    enrol_with = current_enrolment['enrol_with']
    embeddings = precomputed(current_enrolment)
    return np.sum(embeddings.crop(enrol_with), axis=0, keepdims=True)    

models = {}
for current_enrolment in protocol.test_enrolment():
    model_id = current_enrolment.pop('model_id')
    models[model_id] = speaker_spotting_enrol(current_enrolment)

REFERENCE = {}
for current_file in protocol.test():
    uri = current_file['uri']
    if uri not in REFERENCE:
        REFERENCE[uri] = Annotation(uri=uri)
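    # NOTE the snippet is truncated here; by analogy with example #25, the loop
    # body presumably continues with (an assumption, not the original code):
    REFERENCE[uri].update(current_file['annotation'])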
Example #28
class VCTKLoader:
    def __init__(self, split='', n_support=0, n_query=0, n_way=0, if_cuda=False):
        self.precomputed = Precomputed(VCTK_FEATURE_DIR)
        self.split = split
        self.n_support = n_support
        self.n_query = n_query
        self.n_way = n_way
        # dataset is loaded lazily in `__iter__` (see below)
        self.dataset = None
        self.transforms = None
        self.if_cuda = if_cuda


    def shuffle_dataset(self):
        num_label = len(self.dataset['class'])
        num_data = len(self.dataset['data'][0]) # x: num_labels * num_samples_per_label

        # num_label * num_data
        data_index = np.tile(np.arange(num_data), (num_label,1))
        label_index = np.tile(np.arange(num_label).reshape(-1, 1), (1,num_data))
        label_index = np.expand_dims(label_index, axis=2)
        data_index = np.expand_dims(data_index, axis=2)
        data_label_idx = np.concatenate((data_index, label_index), axis=2)

        # shuffle rows (labels)
        np.random.shuffle(data_label_idx)
        # shuffle data index for each row (for each label)
        for dl in data_label_idx:
            np.random.shuffle(dl)

        index_batches = sub_matrix(data_label_idx, self.n_support+self.n_query, self.n_way)
        # optional: shuffle batches
        np.random.shuffle(index_batches)
        return index_batches

    def __iter__(self):
        if self.dataset is None:
            self.dataset = self.load_dataset(from_disk=True)[self.split]
            transforms = [partial(batch_from_index, self.dataset['data']), partial(convert_tensor, 'data')]
            if self.if_cuda:
                transforms.append(CudaTransform())
            self.transforms = compose(transforms)
        index_batches = self.shuffle_dataset()
        batches = TransformDataset(ListDataset(index_batches), self.transforms)

        print(f"\nSize of batches: {len(batches)}")
        for batch in batches:
            batch['n_way'] = self.n_way
            batch['n_support'] = self.n_support
            batch['n_query'] = self.n_query
            yield batch


    def get_feature(self, cfile, seg):
        return self.precomputed.crop(cfile, seg, mode='center', fixed=2.0)
        # return precomputed(cfile).crop(seg, mode='center', fixed=2.0)

    def load_speaker_file(self, protocol_name='SpeakerEmbedding.All', from_disk=False):
        database = VCTK()
        protocol = database.get_protocol(protocol_name.split('.')[0], protocol_name.split('.')[1])

        speaker_file = { 'train': {}, 'val': {}, 'test':{}, 'unseen':{}}
        if from_disk:
            print('Loading speaker_file from disk...')
            vctk_file_name = os.path.join(VCTK_DATA_DIR, 'vctk_speaker_file')
            if not os.path.isfile(vctk_file_name):
                raise ValueError(f'{vctk_file_name} not found')
            else:
                with open(vctk_file_name, 'rb') as vctk_file:
                    speaker_file = pickle.load(vctk_file)
                return speaker_file

        print('Loading unseen set...')
        for current_file in protocol.unseen_iter():
            speaker = current_file['uri'].split('_')[0]
            if not speaker in speaker_file['unseen']:
                speaker_file['unseen'][speaker] = []
            speaker_file['unseen'][speaker].append(current_file)

        print('Loading training set...')
        for current_file in protocol.train():
            speaker = current_file['uri'].split('_')[0]
            if not speaker in speaker_file['train']:
                speaker_file['train'][speaker] = []
            speaker_file['train'][speaker].append(current_file)

        print('Loading test set...')
        for current_file in protocol.test():
            speaker = current_file['uri'].split('_')[0]
            if not speaker in speaker_file['test']:
                speaker_file['test'][speaker] = []
            speaker_file['test'][speaker].append(current_file)

        print('Loading development set...')
        for current_file in protocol.development():
            speaker = current_file['uri'].split('_')[0]
            if not speaker in speaker_file['val']:
                speaker_file['val'][speaker] = []
            speaker_file['val'][speaker].append(current_file)


        with open(os.path.join(VCTK_DATA_DIR, 'vctk_speaker_file'), 'wb') as vctk_file:
            pickle.dump(speaker_file, vctk_file,  -1)
        return speaker_file


    def load_speaker_segments(self, seg_dur=2.0, overlap_ratio=0.25, from_disk=False):
        '''
        2 seconds segments, with overlapping ratio = 0.25
        |----||----|
           |----|
        '''
        spk_seg = { 'train': {}, 'val': {}, 'test':{}, 'unseen':{}}
        if from_disk:
            print('Loading speaker_segments from disk...')
            vctk_file_name = os.path.join(VCTK_DATA_DIR, 'vctk_speaker_segments')
            if os.path.isfile(vctk_file_name):
                with open(vctk_file_name, 'rb') as vctk_file:
                    spk_seg = pickle.load(vctk_file)
                return spk_seg
            else:
                raise ValueError(f'{vctk_file_name} not found')

        def fetch_spk_seg(speaker_file):
            speaker_seg = {}
            for spk, sfiles in speaker_file.items():
                speaker_seg[spk] = []

                for sfile in sfiles:
                    duration = sfile['annotated'].duration()
                    if duration < seg_dur:
                        continue
                    half_seg = seg_dur / 2
                    for mid in np.arange(half_seg, duration-half_seg, seg_dur*(1-overlap_ratio)):
                        speaker_seg[spk].append(
                            (Segment(mid-half_seg, mid+half_seg), sfile)
                        )
            return speaker_seg

        spk_file = self.load_speaker_file(from_disk=from_disk)
        for sub in ['train', 'val', 'test', 'unseen']:
            spk_file[sub] = fetch_spk_seg(spk_file[sub])

        with open(os.path.join(VCTK_DATA_DIR, 'vctk_speaker_segments'), 'wb') as vctk_file:
            pickle.dump(spk_file, vctk_file,  -1)
        return spk_file

    def load_dataset(self, from_disk=False):
        print('Loading dataset...')
        dataset = { 'train': {}, 'val': {}, 'test':{}, 'unseen':{}}
        if from_disk:
            vctk_file_name = os.path.join(VCTK_DATA_DIR, 'vctk_datasets')
            if os.path.isfile(vctk_file_name):
                with open(vctk_file_name, 'rb') as vctk_file:
                    dataset = pickle.load(vctk_file)
                    return  dataset
            else:
                raise ValueError(f'{vctk_file_name} not found')

        spk_seg = self.load_speaker_segments(from_disk=from_disk)
        with open(os.path.join(VCTK_DATA_DIR, 'vctk_datasets'), 'wb') as vctk_file:
            for sub in dataset.keys():
                speaker_segments = spk_seg[sub]
                seg_count = []
                for spk, seg in speaker_segments.items():
                    seg_count.append(len(seg))
                max_count = max(seg_count)
                for spk in speaker_segments.keys():
                    speaker_segments[spk] = list(islice(cycle(speaker_segments[spk]), max_count))
                y_labels = speaker_segments.keys()
                dataset[sub] = {
                    'class': list(y_labels),
                    'data': [speaker_segments[label] for label in y_labels]
                }
            pickle.dump(dataset, vctk_file,  -1)
        return dataset

    # load MFCC features into memory
    # takes too much memory, not recommended
    def load_features(self, from_disk=False):
        feature_dataset = {'train': {}, 'val': {}, 'test': {}, 'unseen':{}}
        if from_disk:
            vctk_file_name = os.path.join(VCTK_DATA_DIR, 'vctk_feature_datasets')
            if os.path.isfile(vctk_file_name):
                with open(vctk_file_name, 'rb') as vctk_file:
                    feature_dataset = pickle.load(vctk_file)
                    return feature_dataset
            else:
                raise ValueError(f'{vctk_file_name} not found')

        dataset = self.load_dataset(from_disk=True)
        with open(os.path.join(VCTK_DATA_DIR, 'vctk_feature_datasets'), 'wb') as vctk_file:
            for sub in feature_dataset:
                print(f'Loading feature: {sub}')
                subset = dataset[sub]
                for spk, seg_list in subset.items():
                    print(f'Speaker: {spk}')
                    feature_list = []
                    for seg, cfile in seg_list:
                        feature_list.append(self.get_feature(cfile, seg))
                    feature_dataset[sub][spk] = np.array(feature_list)
            pickle.dump(feature_dataset, vctk_file, -1)
        return feature_dataset

    # load data for same/different experiments
    def load_same_diff_data(self, from_disk=False):
        exp_dataset = {'train': {},'val': {}, 'test': {}, 'unseen': {}}
        n_pair = 40
        n_pair_unseen = 100
        print(f'Loading same/diff data, #pair: {n_pair}, #pair_unseen: {n_pair_unseen}')
        file_name = os.path.join(VCTK_DATA_DIR, f'same_diff_exp_norepeat_{n_pair}_{n_pair_unseen}')

        if from_disk:
            if os.path.isfile(file_name):
                with open(file_name, 'rb') as dfile:
                    exp_dataset = pickle.load(dfile)
                    return exp_dataset
            else:
                raise ValueError(f'{file_name} not found, generate first?')

        def gen_same_diff(data, labels, first_n=-1, n_same_pair=20):
            same_pairs = []
            diff_pairs = []
            for spk, spk_data in zip(labels, data):
                first_n = len(spk_data) # if first_n == -1 or first_n > len(spk_data) else first_n
                # get same pairs
                ind = list(range(first_n))
                n = 0
                same_pair_ind = []
                ind_his = set()
                while n < n_same_pair:
                    i1 = random.choice(ind)
                    ind.remove(i1)
                    i2 = random.choice(ind)
                    if (not (i1, i2) in ind_his) and (not (i2, i1) in ind_his):
                        ind_his.add((i1, i2))
                        same_pair_ind.append((i1, i2))
                        n += 1
                    else:
                        print('skip repeated pair')
                    ind = list(range(first_n))

                spk_same_pairs = [(spk_data[ind[0]], spk_data[ind[1]]) for ind in same_pair_ind]
                same_pairs.extend(spk_same_pairs)

            # get different pairs
            labels_ind = list(range(len(labels)))
            n = 0
            pair_his = set()
            while n < n_same_pair * len(labels):
                s1 = random.choice(labels_ind)
                ind1 = random.choice(list(range(len(data[s1]))))
                labels_ind.remove(s1)
                s2 = random.choice(labels_ind)
                ind2 = random.choice(list(range(len(data[s2]))))

                pair1 = (s1, ind1)
                pair2 = (s2, ind2)
                if (not (pair1, pair2) in pair_his ) and (not (pair2, pair1) in pair_his):
                    pair_his.add((pair1, pair2))
                    diff_pairs.append((data[s1][ind1], data[s2][ind2]))
                    n += 1
                else:
                    print('skip repeated pair')
                labels_ind = list(range(len(labels)))
            print(len(same_pairs), len(diff_pairs))
            return same_pairs, diff_pairs

        print("Loading dataset from disk, instead of generating from scratch")
        dataset = self.load_dataset(from_disk=True)
        for subset in ['train', 'val', 'test', 'unseen']:
            first_n = -1 #  if subset == 'unseen' else 200
            npair = n_pair_unseen if subset == 'unseen' else n_pair
            data = dataset[subset]['data']
            labels = dataset[subset]['class']
            same_pairs, diff_pairs = gen_same_diff(data, labels, first_n=first_n, n_same_pair=npair)
            assert len(same_pairs) == len(diff_pairs)
            assert len(same_pairs) == len(labels) * npair
            exp_dataset[subset] = {
                'same': same_pairs,
                'diff': diff_pairs
            }
        with open(os.path.join(VCTK_DATA_DIR, file_name), 'wb') as dfile:
            pickle.dump(exp_dataset, dfile, -1)
        return exp_dataset
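A hedged sketch of how `VCTKLoader` might be iterated; the split name and episode sizes are illustrative, not taken from the original source:

loader = VCTKLoader(split='train', n_support=5, n_query=5, n_way=10, if_cuda=False)
for batch in loader:
    # each batch is expected to carry 'data' plus the episode sizes set above
    data, n_way = batch['data'], batch['n_way']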