Example #1
def gecko(args):
    hypotheses_path = args['<hypotheses_path>']
    uri = args['<uri>']
    colors = get_colors(uri)
    distances = {}
    if Path(hypotheses_path).exists():
        hypotheses = load_rttm(hypotheses_path)
        hypothesis = hypotheses[uri]
    else:  # protocol
        protocol = get_protocol(args['<hypotheses_path>'])
        reference = get_file(protocol, uri)
        hypothesis = reference['annotation']
        annotated = get_annotated(reference)
    hypotheses_path = Path(hypotheses_path)
    protocol = args['--database.task.protocol']
    features = None
    if protocol:
        protocol = get_protocol(protocol)
        embeddings = args['--embeddings']
        reference, features = get_file(protocol, uri, embeddings=embeddings)
        if args['--map']:
            print(f"mapping {uri} with {protocol}")
            diarizationErrorRate = DiarizationErrorRate()
            annotated = get_annotated(reference)
            optimal_mapping = diarizationErrorRate.optimal_mapping(
                reference['annotation'], hypothesis, annotated)
            hypothesis = hypothesis.rename_labels(mapping=optimal_mapping)

    hypothesis = update_labels(hypothesis, distances)  # tag unsure clusters

    distances_per_speaker = get_distances_per_speaker(
        features, hypothesis) if features else {}

    if args['--tag_na']:
        whole_file = Segment(0., annotated.segments_boundaries_[-1])
        not_annotated = annotated.gaps(whole_file).to_annotation(na())  # na() is a label generator defined elsewhere
        hypothesis = hypothesis.crop(annotated).update(not_annotated)

    gecko_json = annotation_to_GeckoJSON(hypothesis, distances_per_speaker,
                                         colors)

    if hypotheses_path.exists():
        dir_path = hypotheses_path.parent
    else:
        dir_path = Path(".")

    json_path = os.path.join(dir_path, f'{uri}.json')
    with open(json_path, 'w') as file:
        json.dump(gecko_json, file)
    print(f"succefully dumped {json_path}")
Example #2
    def __init__(self,
                 protocol=None,
                 subset='train',
                 db_yml=None,
                 snr_min=5,
                 snr_max=20):
        super().__init__()

        self.protocol = protocol
        self.subset = subset
        self.db_yml = db_yml

        self.snr_min = snr_min
        self.snr_max = snr_max

        # returns gaps in annotation as pyannote.core.Timeline instance
        get_gaps = lambda f: f['annotation'].get_timeline().gaps(
            support=get_annotated(f))

        if isinstance(protocol, str):
            preprocessors = {
                'audio': FileFinder(config_yml=db_yml),
                'duration': get_audio_duration,
                'gaps': get_gaps
            }
            protocol = get_protocol(self.protocol, preprocessors=preprocessors)
        else:
            protocol.preprocessors['gaps'] = get_gaps

        self.files_ = list(getattr(protocol, self.subset)())
Example #3
def update_distances(args):
    """Loads user annotation from json path, converts it to pyannote `Annotation`
    using regions timings.

    From the annotation uri and precomputed embeddings, it computes the
    in-cluster distances between every speech turns

    Dumps the updated (with correct distances) JSON file to a timestamped file.
    """
    json_path = Path(args['<json_path>'])
    uri = args['<uri>']
    with open(json_path, 'r') as file:
        gecko_json = json.load(file)
    hypothesis, _, _, _ = gecko_JSON_to_Annotation(gecko_json, uri, 'speaker')

    colors = get_colors(uri)

    # NOTE: `embeddings` is not read from `args` here; it is presumably a
    # module-level path to the precomputed embeddings
    precomputed = Precomputed(embeddings)
    protocol = args['<database.task.protocol>']
    protocol = get_protocol(protocol)
    for reference in getattr(protocol, 'test')():
        if reference['uri'] == uri:
            features = precomputed(reference)
            break
    distances_per_speaker = get_distances_per_speaker(features, hypothesis)
    gecko_json = annotation_to_GeckoJSON(hypothesis, distances_per_speaker,
                                         colors)
    name = f"{json_path.stem}.{TIMESTAMP}.json"
    updated_path = Path(json_path.parent, name)
    with open(updated_path, 'w') as file:
        json.dump(gecko_json, file)
    print(f"succefully dumped {updated_path}")
Example #4
    def validate_init(self, protocol_name, subset='development'):
        """Initialize validation data

        Parameters
        ----------
        protocol_name : `str`
        subset : {'train', 'development', 'test'}
            Defaults to 'development'.

        Returns
        -------
        validation_data : object
            Validation data.

        """

        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)
        files = getattr(protocol, subset)()

        # convert lazy ProtocolFile to regular dict for multiprocessing
        files = [dict(file) for file in files]

        if isinstance(self.feature_extraction_, (Precomputed, RawAudio)):
            return files

        validation_data = []
        for current_file in tqdm(files, desc='Feature extraction'):
            current_file['features'] = self.feature_extraction_(current_file)
            validation_data.append(current_file)

        return validation_data
Example #5
    def _validate_init_turn(self, protocol_name, subset='development'):

        np.random.seed(1337)

        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)

        batch_generator = SpeechTurnSubSegmentGenerator(
            self.feature_extraction_, self.duration, per_label=10, per_turn=5)
        batch = next(batch_generator(protocol, subset=subset))

        X = np.stack(batch['X'])
        y = np.stack(batch['y'])
        z = np.stack(batch['z'])

        # collapse per-subsegment labels into one label per speech turn:
        # z 0 0 0 1 1 1 2 2 2 2 3 3 3 3
        # y A A A A A A B B B B B B B B
        # becomes (one entry per turn 0, 1, 2, 3)
        # y A A B B
        yz = np.vstack([y, z]).T
        y = []
        for _, yz_ in itertools.groupby(yz, lambda t: t[1]):
            yz_ = np.stack(yz_)
            y.append(yz_[0, 0])
        y = np.array(y).reshape((-1, 1))

        # precompute same/different groundtruth
        y = pdist(y, metric='equal')

        return {'X': X, 'y': y, 'z': z}
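
A self-contained toy run of the label-collapsing step above (pure numpy and
itertools, no pyannote needed):

import itertools
import numpy as np

z = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])
y = np.array(list('AAAAAABBBBBBBB'))
yz = np.vstack([y, z.astype(str)]).T
labels = [np.stack(group)[0, 0]
          for _, group in itertools.groupby(yz, lambda t: t[1])]
print(labels)  # ['A', 'A', 'B', 'B'] -- one label per speech turn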
Example #6
    def __init__(self,
                 protocol=None,
                 subset: Subset = "train",
                 snr_min=5,
                 snr_max=20):
        super().__init__()

        self.protocol = protocol
        self.subset = subset
        self.snr_min = snr_min
        self.snr_max = snr_max

        # returns gaps in annotation as pyannote.core.Timeline instance
        get_gaps = (lambda f: f["annotation"].get_timeline().gaps(
            support=get_annotated(f)))

        if isinstance(protocol, str):
            preprocessors = {
                "audio": FileFinder(),
                "duration": get_audio_duration,
                "gaps": get_gaps,
            }
            protocol = get_protocol(self.protocol, preprocessors=preprocessors)
        else:
            protocol.preprocessors["gaps"] = get_gaps

        self.files_ = list(getattr(protocol, self.subset)())
Example #7
    def _validate_init_turn(self, protocol_name, subset='development'):

        np.random.seed(1337)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        batch_generator = SpeechTurnSubSegmentGenerator(
            self.feature_extraction_, self.duration,
            per_label=10, per_turn=5)
        batch = next(batch_generator(protocol, subset=subset))

        X = np.stack(batch['X'])
        y = np.stack(batch['y'])
        z = np.stack(batch['z'])

        # collapse per-subsegment labels into one label per speech turn:
        # z 0 0 0 1 1 1 2 2 2 2 3 3 3 3
        # y A A A A A A B B B B B B B B
        # becomes (one entry per turn 0, 1, 2, 3)
        # y A A B B
        yz = np.vstack([y, z]).T
        y = []
        for _, yz_ in itertools.groupby(yz, lambda t: t[1]):
            yz_ = np.stack(yz_)
            y.append(yz_[0, 0])
        y = np.array(y).reshape((-1, 1))

        # precompute same/different groundtruth
        y = pdist(y, metric='equal')

        return {'X': X, 'y': y, 'z': z}
Example #8
def main(args):
    protocol_name = args['<database.task.protocol>']
    subset = args['--set'] if args['--set'] else "train"
    filter_unk = args['--filter_unk']
    crop = float(args['--crop']) if args['--crop'] else None
    hist = args['--hist']
    verbose = args['--verbose']
    save = args['--save']

    protocol = get_protocol(protocol_name)
    print(f"getting stats from {protocol_name}.{set}...")
    stats = protocol.stats(set)
    print_stats(stats)
    if filter_unk:
        values = [
            value for label, value in stats['labels'].items()
            if '#unknown#' not in label
        ]
    else:
        values = list(stats['labels'].values())
    print(f"n_speaking_speakers: {np.array(values).nonzero()[0].shape[0]}")
    print("quartiles:")
    print(quartiles(values))

    print("deciles:")
    print(deciles(values))

    plot_speech_duration(values, protocol_name, subset, hist, crop, save)
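
The quartiles() and deciles() helpers are not shown; a minimal sketch of what
they might look like, assuming they simply wrap numpy percentiles:

import numpy as np

def quartiles(values):
    # hypothetical helper: min / Q1 / median / Q3 / max of speech durations
    return np.percentile(values, [0, 25, 50, 75, 100])

def deciles(values):
    # hypothetical helper: 10th through 90th percentiles
    return np.percentile(values, np.arange(10, 100, 10))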
Example #9
def info(protocol: str):
    """Print protocol detailed information"""

    p = get_protocol(protocol)

    if isinstance(p, SpeakerDiarizationProtocol):
        subsets = ["train", "development", "test"]
        skip_annotation = False
        skip_annotated = False
    elif isinstance(p, CollectionProtocol):
        subsets = ["files"]
        skip_annotation = True
        skip_annotated = True
    else:
        typer.echo(
            "Only collections and speaker diarization protocols are supported."
        )
        raise typer.Exit(code=1)

    for subset in subsets:

        num_files = 0
        speakers = set()
        duration = 0.0
        speech = 0.0

        def iterate():
            try:
                for file in getattr(p, subset)():
                    yield file
            except (AttributeError, NotImplementedError):
                return

        for file in iterate():
            num_files += 1

            if not skip_annotation:
                annotation = file["annotation"]
                speakers.update(annotation.labels())
                speech += annotation.get_timeline().support().duration()

            if not skip_annotated:
                annotated = file["annotated"]
                duration += annotated.duration()

        if num_files > 0:
            typer.secho(f"{subset}",
                        fg=typer.colors.BRIGHT_GREEN,
                        underline=True,
                        bold=True)
            typer.echo(f"   {num_files} files")
            if not skip_annotated:
                typer.echo(f"   {duration_to_str(duration)} annotated")

            if not skip_annotation:
                typer.echo(
                    f"   {duration_to_str(speech)} of speech ({100 * speech / duration:.0f}%)"
                )
                typer.echo(f"   {len(speakers)} speakers")
Example #10
def extract(protocol_name, file_finder, experiment_dir, robust=False, parallel=False):

    protocol = get_protocol(protocol_name)

    # load configuration file
    config_yml = experiment_dir + "/config.yml"
    with open(config_yml, "r") as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    FeatureExtraction = get_class_by_name(
        config["feature_extraction"]["name"],
        default_module_name="pyannote.audio.features",
    )
    feature_extraction = FeatureExtraction(
        **config["feature_extraction"].get("params", {})
    )

    sliding_window = feature_extraction.sliding_window
    dimension = feature_extraction.dimension

    # create metadata file at root that contains
    # sliding window and dimension information

    precomputed = Precomputed(
        root_dir=experiment_dir, sliding_window=sliding_window, dimension=dimension
    )

    if parallel:

        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            config_yml=config_yml,
            robust=robust,
        )

        n_jobs = cpu_count()
        pool = Pool(n_jobs)
        imap = pool.imap

    else:

        feature_extraction = init_feature_extraction(experiment_dir)
        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            feature_extraction=feature_extraction,
            robust=robust,
        )
        imap = map

    for result in imap(extract_one, protocol.files()):
        if result is None:
            continue
        print(result)
Example #11
def main():
    usage = "%prog [options] database, raw_score_path"
    desc = "Write the output of the binary overlap detector into test based on a threshold"
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    parser.add_option("-t", "--onset", action="store", type="float", help="Onset Threshold", default=0.70)
    parser.add_option("-f", "--offset", action="store", type="float", help="Offset Threshold", default=0.70)
    parser.add_option("-d", "--dev", action="store_true", help="Print output based on development set", default=False)
    parser.add_option("-o", "--outputfile", action="store", type="string", help="Output file", default="./overlap.txt")
    (opt, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Incorrect number of arguments")
    database, raw_score_path = args

    # get test file of protocol
    protocol = get_protocol(database)

    # load precomputed overlap scores as pyannote.core.SlidingWindowFeature
    precomputed = Precomputed(raw_score_path)
    # initialize binarizer
    # onset / offset are tunable parameters (and should be tuned for better
    # performance). we use log_scale=True because of the final log-softmax in
    # the StackedRNN model
    binarize = Binarize(onset=opt.onset, offset=opt.offset, log_scale=True)

    fw = open(opt.outputfile, 'wt')

    if opt.dev:
        for test_file in protocol.development():
            ovl_scores = precomputed(test_file)


            # binarize overlap scores to obtain overlap regions as pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']


            # write the output into text
            write_txt(fw, ovl_regions)
 
    else:
        for test_file in protocol.test():
            ovl_scores = precomputed(test_file)


            # binarize overlap scores to obtain overlap regions as pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']


            # write the output into text
            write_txt(fw, ovl_regions)
    fw.close()
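
The write_txt() helper is not shown; a plausible sketch, assuming it writes
one "<uri> <start> <end>" line per segment of the pyannote.core.Timeline:

def write_txt(fw, regions):
    # `regions` is a Timeline whose `uri` was set by the caller
    for segment in regions:
        fw.write(f"{regions.uri} {segment.start:.3f} {segment.end:.3f}\n")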
Example #12
    def validate_init(self, protocol_name, subset='development'):

        protocol = get_protocol(protocol_name)

        if isinstance(
                protocol,
            (SpeakerVerificationProtocol, SpeakerDiarizationProtocol)):
            return

        msg = ('Only SpeakerVerification or SpeakerDiarization tasks are '
               'supported in "validation" mode.')
        raise ValueError(msg)
Example #13
    def __init__(self,
                 batch_size: int,
                 segment_size_millis: int,
                 segments_per_speaker: int = 1):
        self.sample_rate = 16000
        self.batch_size = batch_size
        self.segments_per_speaker = segments_per_speaker
        self.segment_size_s = segment_size_millis / 1000
        self.nfeat = self.sample_rate * segment_size_millis // 1000
        self.config = self._create_config(self.segment_size_s)
        self.protocol = get_protocol(self.config.protocol_name,
                                     preprocessors=self.config.preprocessors)
        self.train_gen, self.dev_gen, self.test_gen = None, None, None
        print(f"[Segment Size: {self.segment_size_s}s]")
        print(f"[Network Input Size: {self.nfeat}]")
Example #14
    def __init__(self, collection: Optional[NoiseCollection] = None):

        if collection is None:
            collection = "MUSAN.Collection.BackgroundNoise"

        if not isinstance(collection, (list, tuple)):
            collection = [collection]

        self.collection = collection

        self.files_ = []
        preprocessors = {'audio': FileFinder(), 'duration': get_audio_duration}
        for collection in self.collection:
            protocol = get_protocol(collection, preprocessors=preprocessors)
            self.files_.extend(protocol.files())
Example #15
def speakers(args):
    hypotheses_path = args['<hypotheses_path>']
    uri = args['<uri>']
    if Path(hypotheses_path).exists():
        hypotheses = load_rttm(hypotheses_path)
        hypothesis = hypotheses[uri]
    else:  # protocol
        distances = {}
        protocol = get_protocol(args['<hypotheses_path>'])
        reference = get_file(protocol, uri)
        hypothesis = reference['annotation']
        annotated = get_annotated(reference)
    print(uri)
    print(f"Number of speakers: {len(hypothesis.labels())}")
    print(f"Chart:\n{hypothesis.chart()}")
Example #16
    def _validation_set(self, protocol_name, subset='development'):
        # this generator is hacked to generate y_true
        # (which is stored in its internal preprocessed_ attribute)
        batch_generator = SpeechActivityDetectionBatchGenerator(
            self.feature_extraction_)
        batch_generator.cache_preprocessed_ = True

        # iterate over each test file and generate y_true
        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)
        file_generator = getattr(protocol, subset)()
        for current_file in file_generator:
            identifier = get_unique_identifier(current_file)
            batch_generator.preprocess(current_file, identifier=identifier)

        return batch_generator.preprocessed_['y']
Example #17
def tune_binarizer(app, epoch, protocol_name, subset='development'):
    """Tune binarizer

    Parameters
    ----------
    app : SpeechActivityDetection
    epoch : int
        Epoch number.
    protocol_name : str
        E.g. 'Etape.SpeakerDiarization.TV'
    subset : {'train', 'development', 'test'}, optional
        Defaults to 'development'.

    Returns
    -------
    params : dict
        See Binarize.tune
    metric : float
        Best achieved detection error rate
    """

    # initialize protocol
    protocol = get_protocol(protocol_name,
                            progress=False,
                            preprocessors=app.preprocessors_)

    # load model for epoch 'epoch'
    sequence_labeling = SequenceLabeling.from_disk(app.train_dir_, epoch)

    # initialize sequence labeling
    duration = app.config_['sequences']['duration']
    step = app.config_['sequences']['step']
    aggregation = SequenceLabelingAggregation(sequence_labeling,
                                              app.feature_extraction_,
                                              duration=duration,
                                              step=step)
    aggregation.cache_preprocessed_ = False

    # tune Binarize thresholds (onset & offset)
    # with respect to detection error rate
    binarize_params, metric = Binarize.tune(getattr(protocol, subset)(),
                                            aggregation.apply,
                                            get_metric=DetectionErrorRate,
                                            dimension=1)

    return binarize_params, metric
Example #18
    def _validate_init_segment(self, protocol_name, subset='development'):

        np.random.seed(1337)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        batch_generator = SpeechSegmentGenerator(
            self.feature_extraction_, per_label=10, duration=self.duration)
        batch = next(batch_generator(protocol, subset=subset))

        X = np.stack(batch['X'])
        y = np.stack(batch['y']).reshape((-1, 1))

        # precompute same/different groundtruth
        y = pdist(y, metric='equal')
        return {'X': X, 'y': y}
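
pdist(y, metric='equal') (from pyannote, assumed) returns a condensed vector of
same/different labels over all pairs; a toy equivalent using scipy:

import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist

y = np.array([[0], [0], [1]])
same = scipy_pdist(y, metric=lambda a, b: float(a[0] == b[0]))
print(same)  # [1. 0. 0.] -> pair (0,1) is same speaker, the others are not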
Example #19
    def train(self, protocol_name, subset='train', restart=None, epochs=1000):

        train_dir = self.TRAIN_DIR.format(
            experiment_dir=self.experiment_dir,
            protocol=protocol_name,
            subset=subset)

        protocol = get_protocol(protocol_name, progress=True,
                                preprocessors=self.preprocessors_)

        self.task_.fit(
            self.model_, self.feature_extraction_,
            protocol, subset=subset, restart=restart, epochs=epochs,
            get_optimizer=self.get_optimizer_,
            get_scheduler=self.get_scheduler_,
            learning_rate=self.learning_rate_,
            log_dir=train_dir, device=self.device)
Example #20
    def __init__(self, collection=None, snr_min=5, snr_max=20):
        super().__init__()

        if collection is None:
            collection = 'MUSAN.Collection.BackgroundNoise'
        if not isinstance(collection, (list, tuple)):
            collection = [collection]
        self.collection = collection

        self.snr_min = snr_min
        self.snr_max = snr_max

        # load noise database
        self.files_ = []
        preprocessors = {'audio': FileFinder(), 'duration': get_audio_duration}
        for collection in self.collection:
            protocol = get_protocol(collection, preprocessors=preprocessors)
            self.files_.extend(protocol.files())
Example #21
    def _validate_init_segment(self, protocol_name, subset='development'):

        np.random.seed(1337)

        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)

        batch_generator = SpeechSegmentGenerator(self.feature_extraction_,
                                                 per_label=10,
                                                 duration=self.duration)
        batch = next(batch_generator(protocol, subset=subset))

        X = np.stack(batch['X'])
        y = np.stack(batch['y']).reshape((-1, 1))

        # precompute same/different groundtruth
        y = pdist(y, metric='equal')
        return {'X': X, 'y': y}
Example #22
    def train(self, protocol_name, subset='train'):

        train_dir = self.TRAIN_DIR.format(experiment_dir=self.experiment_dir,
                                          protocol=protocol_name,
                                          subset=subset)

        # sequence batch generator
        batch_size = self.config_['sequences'].get('batch_size', 8192)
        duration = self.config_['sequences']['duration']
        step = self.config_['sequences']['step']
        batch_generator = SpeechActivityDetectionBatchGenerator(
            self.feature_extraction_,
            duration=duration,
            step=step,
            batch_size=batch_size)
        batch_generator.cache_preprocessed_ = self.cache_preprocessed_

        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)

        # total train duration
        train_total = protocol.stats(subset)['annotated']
        # number of batches per epoch
        steps_per_epoch = int(np.ceil((train_total / step) / batch_size))

        # input shape (n_frames, n_features)
        input_shape = batch_generator.shape

        # generator that loops infinitely over all training files
        train_files = getattr(protocol, subset)()
        generator = batch_generator(train_files, infinite=True)

        labeling = SequenceLabeling()
        labeling.fit(input_shape,
                     self.architecture_,
                     generator,
                     steps_per_epoch,
                     1000,
                     optimizer=SSMORMS3(),
                     log_dir=train_dir)

        return labeling
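
A worked example of the steps_per_epoch arithmetic above (numbers made up):

import numpy as np
train_total, step, batch_size = 3600.0, 0.1, 8192  # seconds, seconds, sequences
steps_per_epoch = int(np.ceil((train_total / step) / batch_size))
print(steps_per_epoch)  # ceil(36000 / 8192) = 5 batches per epoch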
Example #23
    def validate_epoch(self, epoch, validation_data, protocol=None, **kwargs):

        _protocol = get_protocol(protocol)

        if isinstance(_protocol, SpeakerVerificationProtocol):
            return self._validate_epoch_verification(epoch,
                                                     validation_data,
                                                     protocol=protocol,
                                                     **kwargs)

        elif isinstance(_protocol, SpeakerDiarizationProtocol):
            return self._validate_epoch_diarization(epoch,
                                                    validation_data,
                                                    protocol=protocol,
                                                    **kwargs)

        else:
            msg = ("Only SpeakerVerification or SpeakerDiarization tasks are"
                   'supported in "validation" mode.')
            raise ValueError(msg)
Example #24
    def train(self, protocol_name, subset='train', restart=None, epochs=1000):

        train_dir = self.TRAIN_DIR.format(experiment_dir=self.experiment_dir,
                                          protocol=protocol_name,
                                          subset=subset)

        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)

        self.task_.fit(self.model_,
                       self.feature_extraction_,
                       protocol,
                       subset=subset,
                       restart=restart,
                       epochs=epochs,
                       get_optimizer=self.get_optimizer_,
                       get_scheduler=self.get_scheduler_,
                       learning_rate=self.learning_rate_,
                       log_dir=train_dir,
                       device=self.device)
Example #25
    def apply(self, protocol_name, output_dir, step=None):

        model = self.model_.to(self.device)
        model.eval()

        duration = self.duration
        if step is None:
            step = 0.25 * duration

        # do not use memmap as this would lead to too many open files
        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               duration=duration,
                                               step=step,
                                               batch_size=self.batch_size,
                                               device=self.device)
        sliding_window = sequence_embedding.sliding_window
        dimension = sequence_embedding.dimension

        # create metadata file at root that contains
        # sliding window and dimension information
        precomputed = Precomputed(root_dir=output_dir,
                                  sliding_window=sliding_window,
                                  dimension=dimension)

        # file generator
        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)

        for current_file in FileFinder.protocol_file_iter(protocol,
                                                          extra_keys=['audio'
                                                                      ]):

            fX = sequence_embedding.apply(current_file)
            precomputed.dump(current_file, fX)
Example #26
def check(protocol_name, file_finder, experiment_dir):

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
Example #27
def check(protocol_name, file_finder, experiment_dir):

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
Example #28
    def apply(self, protocol_name, output_dir, step=None):

        model = self.model_.to(self.device)
        model.eval()

        duration = self.task_.duration
        if step is None:
            step = 0.25 * duration

        # do not use memmap as this would lead to too many open files
        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=step, batch_size=self.batch_size,
            source='audio', device=self.device)

        sliding_window = sequence_labeling.sliding_window
        n_classes = self.task_.n_classes

        # create metadata file at root that contains
        # sliding window and dimension information
        precomputed = Precomputed(
            root_dir=output_dir,
            sliding_window=sliding_window,
            dimension=n_classes)

        # file generator
        protocol = get_protocol(protocol_name, progress=True,
                                preprocessors=self.preprocessors_)

        for current_file in FileFinder.protocol_file_iter(
            protocol, extra_keys=['audio']):

            fX = sequence_labeling.apply(current_file)
            precomputed.dump(current_file, fX)
Example #29
        def fun(threshold):

            binarizer = Binarize(onset=threshold,
                                 offset=threshold,
                                 log_scale=False)

            protocol = get_protocol(protocol_name, progress=False,
                                    preprocessors=self.preprocessors_)

            metric = DetectionErrorRate()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            file_generator = getattr(protocol, subset)()
            for current_file in file_generator:

                uri = get_unique_identifier(current_file)
                hypothesis = binarizer.apply(
                    predictions[uri], dimension=0).to_annotation()
                reference = current_file['annotation']
                uem = get_annotated(current_file)
                _ = metric(reference, hypothesis, uem=uem)

            return abs(metric)
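
Objectives shaped like fun() are minimized elsewhere in these examples with a
bounded scalar search (see the last example):

import scipy.optimize
res = scipy.optimize.minimize_scalar(
    fun, bounds=(0., 1.), method='bounded', options={'maxiter': 10})
best_threshold, best_der = res.x, res.fun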
Example #30
    def eval(self, model, partition: str = 'development'):
        model.eval()
        sequence_embedding = SequenceEmbedding(
            model=model,
            feature_extraction=self.config.feature_extraction,
            duration=self.config.duration,
            step=.5 * self.config.duration,
            batch_size=self.batch_size,
            device=common.DEVICE)
        protocol = get_protocol(self.config.protocol_name,
                                progress=False,
                                preprocessors=self.config.preprocessors)

        y_true, y_pred, cache = [], [], {}

        for trial in getattr(protocol, f"{partition}_trial")():

            # Compute embeddings
            emb1 = self._file_embedding(trial['file1'], sequence_embedding,
                                        cache)
            emb2 = self._file_embedding(trial['file2'], sequence_embedding,
                                        cache)

            # Compare embeddings
            dist = cdist(emb1, emb2,
                         metric=self.distance.to_sklearn_metric())[0, 0]

            y_pred.append(dist)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true),
                                 np.array(y_pred),
                                 distances=True)

        # Returning 1-eer because the evaluator keeps track of the highest metric value
        return 1 - eer, y_pred, y_true
Example #31
    def validate_init(self, protocol_name: Text, subset: Subset = "development"):
        """Initialize validation data

        Parameters
        ----------
        protocol_name : `str`
        subset : {'train', 'development', 'test'}
            Defaults to 'development'.

        Returns
        -------
        validation_data : object
            Validation data.

        """

        preprocessors = self.preprocessors_
        if "audio" not in preprocessors:
            preprocessors["audio"] = FileFinder()
        if "duration" not in preprocessors:
            preprocessors["duration"] = get_audio_duration
        protocol = get_protocol(protocol_name, preprocessors=preprocessors)
        files = getattr(protocol, subset)()

        # convert lazy ProtocolFile to regular dict for multiprocessing
        files = [dict(file) for file in files]

        if isinstance(self.feature_extraction_, (Precomputed, RawAudio)):
            return files

        validation_data = []
        for current_file in tqdm(files, desc="Feature extraction"):
            current_file["features"] = self.feature_extraction_(current_file)
            validation_data.append(current_file)

        return validation_data
Example #32
    def apply(self, protocol_name, output_dir):

        # file generator
        protocol = get_protocol(protocol_name, progress=True,
                                preprocessors=self.preprocessors_)

        mkdir_p(output_dir)
        path = Path(output_dir) / f'{protocol_name}.txt'

        with open(path, mode='w') as fp:

            for current_file in FileFinder.protocol_file_iter(
                protocol, extra_keys=['audio']):

                uri = get_unique_identifier(current_file)
                hypothesis = self.pipeline_.apply(current_file)

                if isinstance(hypothesis, Timeline):
                    for s in hypothesis:
                        fp.write(f'{uri} {s.start:.3f} {s.end:.3f}\n')
                    continue

                for s, t, l in hypothesis.itertracks(yield_label=True):
                    fp.write(f'{uri} {s.start:.3f} {s.end:.3f} {t} {l}\n')
Example #33
def xp_objective(args, **kwargs):
    import sys
    sys.path.append("/people/yin/projects/")
    from pyannote.database import get_protocol, get_annotated, FileFinder
    protocol = get_protocol('Etape.SpeakerDiarization.TV',
                            preprocessors={'audio': FileFinder()})

    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    metric = GreedyDiarizationErrorRate()

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed

    feature_extraction = Precomputed(
        '/vol/work1/bredin/feature_extraction/mfcc')
    sad_pre = '/vol/work1/yin/speech_activity_detection/shallow/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.TV.development/apply'
    scd_pre = '/vol/work1/yin/speaker_change_detection/paper/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.Debug.development/apply'
    emb_pre = '/vol/work1/yin/embedding/20180124'

    args['cls__damping'] = float(args['cls__damping'])
    args['cls__preference'] = float(args['cls__preference'])

    pipeline = speaker_diarization.SpeakerDiarizationPre(
        feature_extraction, sad_pre, scd_pre, emb_pre, **args)
    try:
        for current_file in protocol.train():
            hypothesis = pipeline(current_file, annotated=True)
            if hypothesis is None:
                return 100
            reference = current_file['annotation']
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)
    except MemoryError as error:
        return 100

    return abs(metric)
Example #34
# coding: utf-8
import sys
sys.path.append("../")
import clustering
import numpy as np

from pyannote.audio.features import Precomputed
precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings')


from pyannote.database import get_protocol, FileFinder
protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True)

from pyannote.core import Annotation,Segment, Timeline

# enrolment consists in summing all relevant embeddings
def speaker_spotting_enrol(current_enrolment):
    enrol_with = current_enrolment['enrol_with']
    embeddings = precomputed(current_enrolment)
    return np.sum(embeddings.crop(enrol_with), axis=0, keepdims=True)    

models = {}
for current_enrolment in protocol.test_enrolment():
    model_id = current_enrolment.pop('model_id')
    models[model_id] = speaker_spotting_enrol(current_enrolment)

REFERENCE = {}
for current_file in protocol.test():
    uri = current_file['uri']
    if uri not in REFERENCE:
        REFERENCE[uri] = Annotation(uri=uri)
Example #35
def extract(protocol_name, file_finder, experiment_dir,
            robust=False, parallel=False):

    protocol = get_protocol(protocol_name, progress=False)

    # load configuration file
    config_yml = experiment_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    sliding_window = feature_extraction.sliding_window()
    dimension = feature_extraction.dimension()

    if 'normalization' in config:
        normalization_name = config['normalization']['name']
        normalization_module = __import__('pyannote.audio.features.normalization',
                                   fromlist=[normalization_name])
        Normalization = getattr(normalization_module, normalization_name)
        normalization = Normalization(
            **config['normalization'].get('params', {}))
    else:
        normalization = None

    # create metadata file at root that contains
    # sliding window and dimension information

    precomputed = Precomputed(root_dir=experiment_dir,
                              sliding_window=sliding_window,
                              dimension=dimension)

    if parallel:

        extract_one = functools.partial(helper_extract,
                                        file_finder=file_finder,
                                        experiment_dir=experiment_dir,
                                        config_yml=config_yml,
                                        normalization=normalization,
                                        robust=robust)

        n_jobs = cpu_count()
        pool = Pool(n_jobs)
        imap = pool.imap

    else:

        feature_extraction = init_feature_extraction(experiment_dir)
        extract_one = functools.partial(helper_extract,
                                        file_finder=file_finder,
                                        experiment_dir=experiment_dir,
                                        feature_extraction=feature_extraction,
                                        normalization=normalization,
                                        robust=robust)
        imap = map


    for result in imap(extract_one, FileFinder.protocol_file_iter(
        protocol, extra_keys=['audio'])):
        if result is None:
            continue
        print(result)

Example #36
def process_trial(trial, scores):
    res = {}
    pscores = process_score(scores)
    res['uri'] = trial['uri']
    res['model_id'] = trial['model_id']
    res['scores'] = pscores
    return res


if __name__ == '__main__':
    arguments = docopt(__doc__, version='Speaker-spotting')
    # protocol
    protocol_name = arguments['<database.task.protocol>']
    protocol = get_protocol(protocol_name, progress=True)

    # subset (train, development, or test)
    subset = arguments['--subset']
    output_file = arguments['<output_file>']
    from pyannote.audio.features import Precomputed
    precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings')

    models = {}
    enrolments = getattr(protocol, '{subset}_enrolment'.format(subset=subset))()
    for current_enrolment in enrolments:
        model_id = current_enrolment.pop('model_id')
        models[model_id] = speaker_spotting_enrol(current_enrolment)
    if arguments['oracle']:
        REFERENCE = {}
        for current_file in getattr(protocol, subset)():
            # snippet truncated in the original; a plausible completion,
            # mirroring the REFERENCE dict built in a later example:
            REFERENCE[current_file['uri']] = current_file['annotation']
Example #37
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):

        target_purity = self.purity

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=.25 * duration, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        # extract predictions for all files.
        predictions = {}
        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            predictions[uri] = sequence_labeling.apply(current_file)

        # dichotomic search to find alpha that maximizes coverage
        # while having at least `target_purity`

        lower_alpha = 0.
        upper_alpha = 1.
        best_alpha = .5 * (lower_alpha + upper_alpha)
        best_coverage = 0.

        for _ in range(10):
            current_alpha = .5 * (lower_alpha + upper_alpha)
            peak = Peak(alpha=current_alpha, min_duration=0.0,
                        log_scale=model.logsoftmax)
            metric = DiarizationPurityCoverageFMeasure()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            for current_file in getattr(protocol, subset)():
                reference = current_file['annotation']
                uri = get_unique_identifier(current_file)
                hypothesis = peak.apply(predictions[uri], dimension=1)
                hypothesis = hypothesis.to_annotation()
                uem = get_annotated(current_file)
                metric(reference, hypothesis, uem=uem)

            purity, coverage, _ = metric.compute_metrics()

            if purity < target_purity:
                upper_alpha = current_alpha
            else:
                lower_alpha = current_alpha
                if coverage > best_coverage:
                    best_coverage = coverage
                    best_alpha = current_alpha

        task = 'speaker_change_detection'
        metric_name = f'{task}/coverage@{target_purity:.2f}purity'
        return {
            metric_name: {'minimize': False, 'value': best_coverage},
            f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
Example #38
    def train(self, protocol_name, subset='development', n_calls=1):

        train_dir = self.TRAIN_DIR.format(
            experiment_dir=self.experiment_dir,
            protocol=protocol_name,
            subset=subset)

        mkdir_p(train_dir)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        tune_db = f'{train_dir}/tune.db'
        params_yml = f'{train_dir}/params.yml'
        params_yml_lock = f'{train_dir}/params.yml.lock'

        pid = os.getpid()
        writer = SummaryWriter(log_dir=f"{train_dir}/{pid}")

        progress_bar = tqdm(unit='trial')
        progress_bar.set_description('Trial #1 : ...')
        progress_bar.update(0)

        iterations = self.pipeline_.tune_iter(
            tune_db, protocol, subset=subset,
            sampler=self.sampler_)

        for s, status in enumerate(iterations):

            if s+1 == n_calls:
                break

            loss = status['latest']['loss']
            writer.add_scalar(f'train/{protocol_name}.{subset}/loss/latest',
                              loss, global_step=s + 1)
            writer.add_scalars(
                f'train/{protocol_name}.{subset}/params/latest',
                status['latest']['params'], global_step=s + 1)

            if 'new_best' in status:
                _ = self.dump(status['new_best'], params_yml, params_yml_lock)
                n_trials = status['new_best']['n_trials']
                best_loss = status['new_best']['loss']
                writer.add_scalar(f'train/{protocol_name}.{subset}/loss/best',
                                  best_loss, global_step=n_trials)
                writer.add_scalars(
                    f'train/{protocol_name}.{subset}/params/best',
                    status['new_best']['params'], global_step=n_trials)

            # progress bar
            desc = f"Trial #{s+1}"
            loss = status['latest']['loss']
            if abs(loss) < 1:
                desc += f" = {100 * loss:.3f}%"
                desc += f" : Best = {100 * best_loss:.3f}% after {n_trials} trials"
            else:
                desc += f" = {loss:.3f}"
                desc += f" : Best = {best_loss:.3f} after {n_trials} trials"

            progress_bar.set_description(desc=desc)
            progress_bar.update(1)

        best = self.pipeline_.best(tune_db)
        content = self.dump(best, params_yml, params_yml_lock)

        sep = "=" * max(len(params_yml),
                        max(len(l) for l in content.split('\n')))
        print(f"\n{sep}\n{params_yml}\n{sep}\n{content}{sep}")
        print(f"Loss = {best['loss']:g} | {best['n_trials']} trials")
        print(f"{sep}")
Example #39
# coding: utf-8

# ```bash
# $ pip install pyannote.metrics==1.4.1
# $ pip install pyannote.db.odessa.ami==0.5.1
# ```

import clustering
import numpy as np

from pyannote.audio.features import Precomputed
precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings')


from pyannote.database import get_protocol, FileFinder
protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True)



# enrolment consists in summing all relevant embeddings
def speaker_spotting_enrol(current_enrolment):
    enrol_with = current_enrolment['enrol_with']
    embeddings = precomputed(current_enrolment)
    return np.sum(embeddings.crop(enrol_with), axis=0, keepdims=True)    

models = {}
for current_enrolment in protocol.development_enrolment():
    model_id = current_enrolment.pop('model_id')
    models[model_id] = speaker_spotting_enrol(current_enrolment)

REFERENCE = {current_file['uri']: current_file['annotation'] for current_file in protocol.development()}
Example #40
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):

        target_precision = self.precision

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=.25 * duration, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        predictions = {}
        references = {}

        file_generator = getattr(protocol, subset)()
        for current_file in file_generator:
            uri = get_unique_identifier(current_file)

            # build overlap reference
            reference = Timeline(uri=uri)
            annotation = current_file['annotation']
            for track1, track2 in annotation.co_iter(annotation):
                if track1 == track2:
                    continue
                reference.add(track1[0] & track2[0])
            references[uri] = reference.to_annotation()

            # extract overlap scores
            scores = sequence_labeling.apply(current_file)

            if model.logsoftmax:
                scores = SlidingWindowFeature(
                    np.exp(scores.data[:, 2]), scores.sliding_window)
            else:
                scores = SlidingWindowFeature(
                    scores.data[:, 2], scores.sliding_window)

            predictions[uri] = scores

        # dichotomic search to find threshold that maximizes recall
        # while having at least `target_precision`

        lower_alpha = 0.
        upper_alpha = 1.
        best_alpha = .5 * (lower_alpha + upper_alpha)
        best_recall = 0.

        for _ in range(10):
            current_alpha = .5 * (lower_alpha + upper_alpha)
            binarizer = Binarize(onset=current_alpha,
                                 offset=current_alpha,
                                 log_scale=False)

            precision = DetectionPrecision()
            recall = DetectionRecall()

            for current_file in getattr(protocol, subset)():
                uri = get_unique_identifier(current_file)
                reference = references[uri]
                hypothesis = binarizer.apply(predictions[uri], dimension=0)
                hypothesis = hypothesis.to_annotation()
                uem = get_annotated(current_file)
                _ = precision(reference, hypothesis, uem=uem)
                _ = recall(reference, hypothesis, uem=uem)

            if abs(precision) < target_precision:
                # precision is not high enough: try higher thresholds
                lower_alpha = current_alpha
            else:
                upper_alpha = current_alpha
                r = abs(recall)
                if r > best_recall:
                    best_recall = r
                    best_alpha = current_alpha

        task = 'overlap_speech_detection'
        metric_name = f'{task}/recall@{target_precision:.2f}precision'
        return {
            metric_name: {'minimize': False, 'value': best_recall},
            f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
Example #41
    def _validate_epoch_verification(self, epoch, protocol_name,
                                     subset='development',
                                     validation_data=None):
        """Perform a speaker verification experiment using model at `epoch`

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of speaker verification protocol
        subset : {'train', 'development', 'test'}, optional
            Name of subset.
        validation_data : provided by `validate_init`

        Returns
        -------
        metrics : dict
        """


        # load current model
        model = self.load_model(epoch).to(self.device)
        model.eval()

        # use user-provided --duration when available
        # otherwise use 'duration' used for training
        if self.duration is None:
            duration = self.task_.duration
        else:
            duration = self.duration
        min_duration = None

        # if 'duration' is still None, it means that
        # network was trained with variable lengths
        if duration is None:
            duration = self.task_.max_duration
            min_duration = self.task_.min_duration

        step = .5 * duration

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_, duration=duration,
            step=step, min_duration=min_duration,
            batch_size=self.batch_size, device=self.device)

        metrics = {}
        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        enrolment_models, enrolment_khashes = {}, {}
        enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
        for i, enrolment in enumerate(enrolments):
            data = sequence_embedding.apply(enrolment,
                                            crop=enrolment['enrol_with'])
            model_id = enrolment['model_id']
            model = np.mean(np.stack(data), axis=0, keepdims=True)
            enrolment_models[model_id] = model

            # in some specific speaker verification protocols,
            # enrolment data may be used later as trial data.
            # therefore, we cache information about enrolment data
            # to speed things up by reusing the enrolment as trial
            h = hash((get_unique_identifier(enrolment),
                      tuple(enrolment['enrol_with'])))
            enrolment_khashes[h] = model_id

        trial_models = {}
        trials = getattr(protocol, '{0}_trial'.format(subset))()
        y_true, y_pred = [], []
        for i, trial in enumerate(trials):
            model_id = trial['model_id']

            h = hash((get_unique_identifier(trial),
                      tuple(trial['try_with'])))

            # re-use enrolment model whenever possible
            if h in enrolment_khashes:
                model = enrolment_models[enrolment_khashes[h]]

            # re-use trial model whenever possible
            elif h in trial_models:
                model = trial_models[h]

            else:
                data = sequence_embedding.apply(trial, crop=trial['try_with'])
                model = np.mean(data, axis=0, keepdims=True)
                # cache trial model for later re-use
                trial_models[h] = model

            distance = cdist(enrolment_models[model_id], model,
                             metric=self.metric)[0, 0]
            y_pred.append(distance)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                                 distances=True)
        metrics['EER'] = {'minimize': True, 'value': eer}

        return metrics
Example #42
def main():
    arguments = docopt(__doc__, version='Evaluation')

    collar = float(arguments['--collar'])
    skip_overlap = arguments['--skip-overlap']
    tolerance = float(arguments['--tolerance'])

    # protocol
    protocol_name = arguments['<database.task.protocol>']

    preprocessors = dict()
    if arguments['overlap']:
        if skip_overlap:
            msg = ('Option --skip-overlap is not supported '
                   'when evaluating overlapped speech detection.')
            sys.exit(msg)
        preprocessors = {'annotation': to_overlap}

    protocol = get_protocol(protocol_name,
                            progress=True,
                            preprocessors=preprocessors)

    # subset (train, development, or test)
    subset = arguments['--subset']

    if arguments['spotting']:

        hypothesis_json = arguments['<hypothesis.json>']
        with open(hypothesis_json, mode='r') as fp:
            hypotheses = json.load(fp)

        output_prefix = hypothesis_json[:-5]

        latencies = [float(l) for l in arguments['--latency']]

        filters = arguments['--filter']
        if filters:
            from sympy import sympify, lambdify, symbols
            speech = symbols('speech')
            filter_funcs = [
                lambdify([speech], sympify(expression))
                for expression in filters
            ]
            filter_func = lambda speech: \
                any(~func(speech) for func in filter_funcs)
        else:
            filter_func = None

        spotting(protocol,
                 subset,
                 latencies,
                 hypotheses,
                 output_prefix,
                 filter_func=filter_func)

        sys.exit(0)

    hypothesis_rttm = arguments['<hypothesis.rttm>']

    try:
        hypotheses = load_rttm(hypothesis_rttm)

    except FileNotFoundError:
        msg = f'Could not find file {hypothesis_rttm}.'
        sys.exit(msg)

    except:
        msg = (f'Failed to load {hypothesis_rttm}, please check its format '
               f'(only RTTM files are supported).')
        sys.exit(msg)

    if arguments['detection']:
        detection(protocol,
                  subset,
                  hypotheses,
                  collar=collar,
                  skip_overlap=skip_overlap)

    if arguments['overlap']:
        detection(protocol,
                  subset,
                  hypotheses,
                  collar=collar,
                  skip_overlap=skip_overlap)

    if arguments['segmentation']:
        segmentation(protocol, subset, hypotheses, tolerance=tolerance)

    if arguments['diarization']:
        greedy = arguments['--greedy']
        diarization(protocol,
                    subset,
                    hypotheses,
                    greedy=greedy,
                    collar=collar,
                    skip_overlap=skip_overlap)

    if arguments['identification']:
        identification(protocol,
                       subset,
                       hypotheses,
                       collar=collar,
                       skip_overlap=skip_overlap)
Example #43
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=.25 * duration, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        metric = DetectionErrorRate()

        predictions = {}

        file_generator = getattr(protocol, subset)()
        for current_file in file_generator:
            uri = get_unique_identifier(current_file)
            scores = sequence_labeling.apply(current_file)

            if model.logsoftmax:
                scores = SlidingWindowFeature(
                    1. - np.exp(scores.data[:, 0]),
                    scores.sliding_window)
            else:
                scores = SlidingWindowFeature(
                    1. - scores.data[:, 0],
                    scores.sliding_window)

            predictions[uri] = scores

        def fun(threshold):

            binarizer = Binarize(onset=threshold,
                                 offset=threshold,
                                 log_scale=False)

            protocol = get_protocol(protocol_name, progress=False,
                                    preprocessors=self.preprocessors_)

            metric = DetectionErrorRate()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            file_generator = getattr(protocol, subset)()
            for current_file in file_generator:

                uri = get_unique_identifier(current_file)
                hypothesis = binarizer.apply(
                    predictions[uri], dimension=0).to_annotation()
                reference = current_file['annotation']
                uem = get_annotated(current_file)
                _ = metric(reference, hypothesis, uem=uem)

            return abs(metric)

        res = scipy.optimize.minimize_scalar(
            fun, bounds=(0., 1.), method='bounded', options={'maxiter': 10})

        return {
            'speech_activity_detection/error': {'minimize': True,
                                                'value': res.fun},
            'speech_activity_detection/threshold': {'minimize': 'NA',
                                                    'value': res.x}}