예제 #1
0
def protocol(
    database: str = typer.Option(
        "",
        "--database",
        "-d",
        metavar="DATABASE",
        help="Filter protocols by DATABASE.",
        case_sensitive=False,
    ),
    task: Task = typer.Option(
        "Any",
        "--task",
        "-t",
        help="Filter protocols by TASK.",
        case_sensitive=False,
    ),
):
    """Print list of protocols"""
    # NOTE(review): the one-line docstring is kept verbatim; presumably typer
    # displays it as the command help text -- confirm before rewording it.
    #
    # Output: one "<database>.<task>.<protocol>" line per protocol found.

    # An empty --database value means "no filter": walk every known database.
    selected_databases = get_databases() if database == "" else [database]

    for db_name in selected_databases:
        handle: Database = get_database(db_name)

        # "Any" means "no filter": ask the database which tasks it provides.
        if task == "Any":
            candidate_tasks = handle.get_tasks()
        else:
            candidate_tasks = [task]

        for candidate in candidate_tasks:
            try:
                protocol_names = handle.get_protocols(candidate)
            except KeyError:
                # get_protocols() raised KeyError for this task: skip it
                # instead of aborting the whole listing.
                continue

            for protocol_name in protocol_names:
                typer.echo(
                    "{}.{}.{}".format(db_name, candidate, protocol_name))
예제 #2
0
def task(
    database: str = typer.Option(
        "",
        "--database",
        "-d",
        metavar="DATABASE",
        help="Filter tasks by DATABASE.",
        case_sensitive=False,
    ),
):
    """Print list of tasks"""
    # NOTE(review): the one-line docstring is kept verbatim; presumably typer
    # displays it as the command help text -- confirm before rewording it.

    if database != "":
        # a database was requested: only list the tasks it provides
        handle: Database = get_database(database)
        task_names = handle.get_tasks()
    else:
        # no filter: list tasks returned by the global get_tasks()
        task_names = get_tasks()

    # one task per output line
    for task_name in task_names:
        typer.echo("{}".format(task_name))
예제 #3
0
    min_duration = float(tokens[0]) if len(tokens) == 2 else None
    duration = float(tokens[0]) if len(tokens) == 1 else float(tokens[1])
    return duration, min_duration, step, heterogeneous


# Script entry point.
# NOTE(review): this fragment is truncated -- the remainder of the 'train'
# branch (and any other sub-command) is not visible here.
if __name__ == '__main__':

    # parse the command line against the module docstring (docopt usage text)
    arguments = docopt(__doc__, version='Speaker embedding')

    # expand a leading '~' in the --database path, and register a FileFinder
    # built from it as the 'wav' preprocessor handed to get_database() below
    db_yml = os.path.expanduser(arguments['--database'])
    preprocessors = {'wav': FileFinder(db_yml)}

    # resolve the 'Database.Task.Protocol' positional argument into a protocol
    # NOTE(review): this tests for the key being present, not for a non-None
    # value -- confirm this is what is intended with docopt.
    if '<database.task.protocol>' in arguments:
        protocol = arguments['<database.task.protocol>']
        # exactly three dot-separated parts are expected (ValueError otherwise)
        database_name, task_name, protocol_name = protocol.split('.')
        database = get_database(database_name, preprocessors=preprocessors)
        # 'protocol' is rebound from the identifier string to the protocol
        # object returned by the database
        protocol = database.get_protocol(task_name, protocol_name, progress=True)

    subset = arguments['--subset']

    # 'train' sub-command
    if arguments['train']:
        experiment_dir = arguments['<experiment_dir>']

        # default subset for training when --subset is not given
        if subset is None:
            subset = 'train'

        # --duration is converted unconditionally
        duration = float(arguments['--duration'])

        # --min-duration is optional: convert it only when provided
        min_duration = arguments['--min-duration']
        if min_duration is not None:
            min_duration = float(min_duration)
def train(dataset, medium_template, config_yml):
    """Train a two-class (speech activity detection) sequence labeling model.

    Parameters
    ----------
    dataset : str
        Dot-separated 'database.task.protocol.subset' identifier.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file. Its parent directory is used as
        working directory; logs go to '<workdir>/<dataset>'.

    Raises
    ------
    ValueError
        If `dataset` does not contain exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no method named after the requested subset.
    KeyError
        If a required configuration entry is missing.
    """

    # load configuration file
    # safe_load() is used because the configuration only holds plain data and
    # yaml.load() without an explicit Loader is rejected by recent PyYAML.
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch
    # (os.path.join keeps the path relative when workdir is empty, instead of
    # silently pointing at the filesystem root)
    log_dir = os.path.join(workdir, dataset)

    # -- DATASET --
    db_name, task_name, protocol_name, subset = dataset.split('.')
    database = get_database(db_name, medium_template=medium_template)
    protocol = database.get_protocol(task_name, protocol_name)

    if not hasattr(protocol, subset):
        msg = 'Protocol "{db}.{task}.{protocol}" has no "{subset}" subset.'
        raise NotImplementedError(msg.format(
            db=db_name, task=task_name, protocol=protocol_name,
            subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- NETWORK STRUCTURE --
    network_config = config['network']
    # internal model structure
    lstm = network_config['lstm']
    dense = network_config['dense']
    # bi-directional
    bidirectional = network_config['bidirectional']

    # -- TRAINING --
    training_config = config['training']
    # number training set hours (speech + non speech) to use in each epoch
    # FIXME -- update ETAPE so that we can query this information directly
    hours_per_epoch = training_config['hours_per_epoch']
    # overlap ratio between each window
    overlap = training_config['overlap']
    # batch size
    batch_size = training_config['batch_size']
    # number of epochs
    nb_epoch = training_config['nb_epoch']
    # optimizer
    optimizer = training_config['optimizer']

    # labeling
    n_classes = 2
    design_model = StackedLSTM(n_classes=n_classes,
                               lstm=lstm,
                               bidirectional=bidirectional,
                               dense=dense)

    labeling = SequenceLabeling(design_model,
                                optimizer=optimizer,
                                log_dir=log_dir)

    # segment generator for training
    # (step is the hop between consecutive windows)
    step = duration * (1. - overlap)
    batch_generator = SpeechActivityDetectionBatchGenerator(
        feature_extractor,
        duration=duration,
        normalize=normalize,
        step=step,
        batch_size=batch_size)

    # log loss and accuracy during training and
    # keep track of best models for both metrics
    log = [('train', 'loss'), ('train', 'accuracy')]
    callback = LoggingCallback(log_dir=log_dir, log=log)

    # number of samples per epoch, rounded up to a whole number of batches
    samples_per_epoch = batch_size * int(
        np.ceil((3600 * hours_per_epoch / step) / batch_size))

    # input shape (n_frames, n_features)
    input_shape = batch_generator.get_shape()

    generator = batch_generator(file_generator, infinite=True)

    labeling.fit(input_shape,
                 generator,
                 samples_per_epoch,
                 nb_epoch,
                 callbacks=[callback])
def test(dataset, medium_template, config_yml, weights_h5, output_dir):
    """Evaluate a trained sequence labeling model on one protocol subset.

    For every file of the subset, predictions are aggregated, binarized and
    compared to the reference annotation. One line of metrics per file, plus a
    final corpus-level line, is written to
    '<output_dir>/eval.<dataset>.<subset>.txt'; each binarized hypothesis is
    also dumped to '<output_dir>/<uri>.json'.

    Parameters
    ----------
    dataset : str
        Dot-separated 'database.task.protocol.subset' identifier.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file.
    weights_h5 : str
        Path to the model weights. 'architecture.yml' is looked up two
        directory levels above this file.
    output_dir : str
        Directory receiving the evaluation report and hypotheses.

    Raises
    ------
    ValueError
        If `dataset` does not contain exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no method named after the requested subset.
    KeyError
        If a required configuration entry is missing.
    """

    # load configuration file
    # safe_load() is used because the configuration only holds plain data and
    # yaml.load() without an explicit Loader is rejected by recent PyYAML.
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # this is where model architecture was saved
    # (os.path.join keeps the path relative when the grand-parent directory
    # is empty, instead of pointing at '/architecture.yml')
    architecture_yml = os.path.join(
        os.path.dirname(os.path.dirname(weights_h5)), 'architecture.yml')

    # -- DATASET --
    db_name, task_name, protocol_name, subset = dataset.split('.')
    database = get_database(db_name, medium_template=medium_template)
    protocol = database.get_protocol(task_name, protocol_name)

    if not hasattr(protocol, subset):
        msg = 'Protocol "{db}.{task}.{protocol}" has no "{subset}" subset.'
        raise NotImplementedError(msg.format(
            db=db_name, task=task_name, protocol=protocol_name,
            subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- TESTING --
    testing_config = config['testing']
    # overlap ratio between each window
    overlap = testing_config['overlap']
    step = duration * (1. - overlap)

    # prediction smoothing
    # The thresholds configured under testing.binarize are actually used
    # (they were previously read but replaced by hard-coded 0.5 values).
    onset = testing_config['binarize']['onset']
    offset = testing_config['binarize']['offset']
    binarizer = Binarize(onset=onset, offset=offset)

    sequence_labeling = SequenceLabeling.from_disk(architecture_yml,
                                                   weights_h5)

    aggregation = SequenceLabelingAggregation(sequence_labeling,
                                              feature_extractor,
                                              normalize=normalize,
                                              duration=duration,
                                              step=step)

    # the same collar is shared by all four metrics
    collar = 0.500
    error_rate = DetectionErrorRate(collar=collar)
    accuracy = DetectionAccuracy(collar=collar)
    precision = DetectionPrecision(collar=collar)
    recall = DetectionRecall(collar=collar)

    LINE = '{uri} {e:.3f} {a:.3f} {p:.3f} {r:.3f} {f:.3f}\n'

    report_path = '{output_dir}/eval.{dataset}.{subset}.txt'.format(
        output_dir=output_dir, dataset=dataset, subset=subset)

    with open(report_path, 'w') as fp:

        header = '# uri error accuracy precision recall f_measure\n'
        fp.write(header)
        # flush after every line so that progress is visible in the report
        fp.flush()

        for current_file in file_generator:

            uri = current_file['uri']
            wav = current_file['medium']['wav']
            annotated = current_file['annotated']
            annotation = current_file['annotation']

            predictions = aggregation.apply(wav)
            hypothesis = binarizer.apply(predictions, dimension=1)

            # per-file metrics, restricted to the annotated regions
            e = error_rate(annotation, hypothesis, uem=annotated)
            a = accuracy(annotation, hypothesis, uem=annotated)
            p = precision(annotation, hypothesis, uem=annotated)
            r = recall(annotation, hypothesis, uem=annotated)
            f = f_measure(p, r)

            line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
            fp.write(line)
            fp.flush()

            # dedicated name: do not rebind the path of the open report file
            hypothesis_path = '{output_dir}/{uri}.json'.format(
                output_dir=output_dir, uri=uri)
            dump_to(hypothesis, hypothesis_path)

        # average on whole corpus
        # NOTE(review): abs(metric) presumably returns the value accumulated
        # over all processed files -- confirm against the metric classes.
        uri = '{dataset}.{subset}'.format(dataset=dataset, subset=subset)
        e = abs(error_rate)
        a = abs(accuracy)
        p = abs(precision)
        r = abs(recall)
        f = f_measure(p, r)
        line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
        fp.write(line)
        fp.flush()
예제 #6
0
def train(dataset, medium_template, config_yml):
    """Train a TristouNet sequence embedding with a triplet loss.

    Parameters
    ----------
    dataset : str
        Dot-separated 'database.task.protocol.subset' identifier.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file. Its parent directory is used as
        working directory; logs go to '<workdir>/<dataset>'.

    Raises
    ------
    ValueError
        If `dataset` does not contain exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no method named after the requested subset.
    KeyError
        If a required configuration entry is missing.
    """

    # load configuration file
    # safe_load() is used because the configuration only holds plain data and
    # yaml.load() without an explicit Loader is rejected by recent PyYAML.
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch
    # (os.path.join keeps the path relative when workdir is empty, instead of
    # silently pointing at the filesystem root)
    log_dir = os.path.join(workdir, dataset)

    # -- DATASET --
    db_name, task_name, protocol_name, subset = dataset.split('.')
    database = get_database(db_name, medium_template=medium_template)
    protocol = database.get_protocol(task_name, protocol_name)

    if not hasattr(protocol, subset):
        msg = 'Protocol "{db}.{task}.{protocol}" has no "{subset}" subset.'
        raise NotImplementedError(msg.format(
            db=db_name, task=task_name, protocol=protocol_name,
            subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- NETWORK STRUCTURE --
    network_config = config['network']
    # internal model structure
    output_dim = network_config['output_dim']
    lstm = network_config['lstm']
    # pooling is the only optional entry; it defaults to 'last'
    pooling = network_config.get('pooling', 'last')
    dense = network_config['dense']
    # bi-directional
    bidirectional = network_config['bidirectional']
    space = network_config['space']

    # -- TRAINING --
    training_config = config['training']
    # batch size
    batch_size = training_config['batch_size']
    # number of epochs
    nb_epoch = training_config['nb_epoch']
    # optimizer
    optimizer = training_config['optimizer']

    # -- TRIPLET LOSS --
    triplet_config = training_config['triplet_loss']
    margin = triplet_config['margin']
    per_fold = triplet_config['per_fold']
    per_label = triplet_config['per_label']
    overlap = triplet_config['overlap']

    # embedding
    get_embedding = TristouNet(lstm=lstm,
                               bidirectional=bidirectional,
                               pooling=pooling,
                               dense=dense,
                               output_dim=output_dim,
                               space=space)

    loss = TripletLoss(get_embedding, margin=margin)

    embedding = SequenceEmbedding(loss=loss,
                                  optimizer=optimizer,
                                  log_dir=log_dir)

    # triplet generator for training
    generator = TripletBatchGenerator(feature_extractor,
                                      file_generator,
                                      embedding,
                                      margin=margin,
                                      duration=duration,
                                      overlap=overlap,
                                      normalize=normalize,
                                      per_fold=per_fold,
                                      per_label=per_label,
                                      batch_size=batch_size)

    # log loss during training and keep track of best model
    log = [('train', 'loss')]
    callback = LoggingCallback(log_dir=log_dir,
                               log=log,
                               get_model=loss.get_embedding)

    # estimated number of triplets per epoch
    # (rounded down to a multiple of batch_size)
    samples_per_epoch = per_label * (per_label - 1) * generator.n_labels
    samples_per_epoch = samples_per_epoch - (samples_per_epoch % batch_size)

    # input shape (n_samples, n_features)
    input_shape = generator.get_shape()

    embedding.fit(input_shape,
                  generator,
                  samples_per_epoch,
                  nb_epoch,
                  callbacks=[callback])
예제 #7
0
def generate_test(dataset, medium_template, config):
    """Build a fixed evaluation set of sequences and same/different labels.

    Parameters
    ----------
    dataset : str
        Dot-separated 'database.task.protocol.subset' identifier.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config : dict
        Already-loaded configuration; the 'feature_extraction' and 'testing'
        sections are read.

    Returns
    -------
    X
        Selected sequences ('per_label' draws, with replacement, per label).
    y_true
        Condensed pairwise boolean vector over the selected sequences:
        True when both sequences of a pair carry the same label.

    Raises
    ------
    NotImplementedError
        If the protocol has no method named after the requested subset.
    """

    # -- DATASET --
    db_name, task_name, protocol_name, subset = dataset.split('.')
    database = get_database(db_name, medium_template=medium_template)
    protocol = database.get_protocol(task_name, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError('')

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- TESTING --
    testing_config = config['testing']
    overlap = testing_config['overlap']
    per_label = testing_config['per_label']
    # read (so that a missing entry still raises KeyError) but not used
    # below: the generator is always created with batch_size=-1
    batch_size = testing_config['batch_size']

    batch_generator = LabeledFixedDurationSequencesBatchGenerator(
        feature_extractor,
        duration=duration,
        normalize=normalize,
        step=(1 - overlap) * duration,
        batch_size=-1)

    # gather every (sequences, labels) batch, then stack them
    batches = [(sequences, labels)
               for sequences, labels in batch_generator(file_generator)]
    X = np.vstack([sequences for sequences, _ in batches])
    y = np.hstack([labels for _, labels in batches])

    # replace labels by their integer index in the sorted unique label set
    unique, y, _counts = np.unique(
        y, return_inverse=True, return_counts=True)

    # randomly (but deterministically) select 'per_label' samples from each
    # class, and only compute positive vs. negative distances for those
    # samples: this should ensure all speakers have the same weights
    np.random.seed(1337)

    # for each label in turn, draw (with replacement) 'per_label' indices
    # among the sequences carrying that label; the concatenation is the list
    # of all sequences to be used for later triplet selection
    per_label_indices = [
        np.random.choice(np.where(y == label)[0],
                         size=per_label,
                         replace=True)
        for label in range(len(unique))
    ]
    indices = np.hstack(per_label_indices)

    # selected sequences
    X = X[indices]

    # their pairwise similarity: integer labels are at Chebyshev distance
    # strictly below 1 if and only if they are equal
    selected_labels = y[indices, np.newaxis]
    y_true = pdist(selected_labels, metric='chebyshev') < 1

    return X, y_true
예제 #8
0
from pyannote.database import get_database
from pyannote.database import get_protocol
from pyannote.database.protocol import CollectionProtocol
from pyannote.database.protocol import Protocol
from pyannote.database.protocol import SpeakerDiarizationProtocol
from pyannote.database.protocol import SpeakerVerificationProtocol

# Sanity checks of the pyannote.database discovery API against a database
# named "MyDatabase".
# NOTE(review): get_databases and get_tasks are not among the imports visible
# above -- presumably imported earlier in the original file; confirm.
# NOTE(review): this fragment appears truncated after the last statement.
assert "MyDatabase" in get_databases()

# the four task names must be listed globally...
tasks = get_tasks()
assert "Collection" in tasks
assert "Protocol" in tasks
assert "SpeakerDiarization" in tasks
assert "SpeakerVerification" in tasks

# ...and also by the database itself
database = get_database("MyDatabase")
tasks = database.get_tasks()
assert "Collection" in tasks
assert "Protocol" in tasks
assert "SpeakerDiarization" in tasks
assert "SpeakerVerification" in tasks

# each task must expose the expected protocol name
assert "MyCollection" in database.get_protocols("Collection")
assert "MyProtocol" in database.get_protocols("Protocol")
assert "MySpeakerDiarization" in database.get_protocols("SpeakerDiarization")
assert "MySpeakerVerification" in database.get_protocols("SpeakerVerification")

# protocols can be obtained from their full 'Database.Task.Protocol' name
collection = get_protocol("MyDatabase.Collection.MyCollection")
assert isinstance(collection, CollectionProtocol)

protocol = get_protocol("MyDatabase.Protocol.MyProtocol")
예제 #9
0
def _write_window_attrs(h5_file, sliding_window, dimension):
    """Store sliding window parameters and feature dimension as attributes."""
    h5_file.attrs['start'] = sliding_window.start
    h5_file.attrs['duration'] = sliding_window.duration
    h5_file.attrs['step'] = sliding_window.step
    h5_file.attrs['dimension'] = dimension


def extract(database_name,
            task_name,
            protocol_name,
            preprocessors,
            experiment_dir,
            robust=False):
    """Extract features for every file of a protocol and save them to disk.

    The feature extractor is instantiated from the 'feature_extraction'
    section of '<experiment_dir>/config.yml'. Files whose output path already
    exists are skipped.

    Parameters
    ----------
    database_name : str
        Name passed to `get_database`.
    task_name : str
        Either 'SpeakerDiarization' or 'SpeakerRecognition'.
    protocol_name : str
        Name passed to `database.get_protocol`.
    preprocessors : dict
        Forwarded as-is to `get_database(..., preprocessors=...)`.
    experiment_dir : str
        Directory containing 'config.yml'; output files are located through
        `Precomputed.get_config_path` and `Precomputed.get_path`.
    robust : bool, optional
        When True, a file whose extraction fails, returns None or contains
        NaNs only triggers a warning and is skipped. When False (default),
        `PyannoteFeatureExtractionError` is raised instead.

    Raises
    ------
    ValueError
        If `task_name` is not one of the two supported tasks.
    PyannoteFeatureExtractionError
        If extraction fails for a file and `robust` is False.
    """

    database = get_database(database_name, preprocessors=preprocessors)
    protocol = database.get_protocol(task_name, protocol_name, progress=True)

    if task_name == 'SpeakerDiarization':
        items = itertools.chain(protocol.train(), protocol.development(),
                                protocol.test())

    elif task_name == 'SpeakerRecognition':
        items = itertools.chain(protocol.train(yield_name=False),
                                protocol.development_enroll(yield_name=False),
                                protocol.development_test(yield_name=False),
                                protocol.test_enroll(yield_name=False),
                                protocol.test_test(yield_name=False))

    else:
        # fail early with a clear message, before anything is written to
        # disk, instead of an UnboundLocalError once iteration starts
        msg = 'Unsupported task "{task_name}".'
        raise ValueError(msg.format(task_name=task_name))

    # load configuration file
    # safe_load() is used because the configuration only holds plain data and
    # yaml.load() without an explicit Loader is rejected by recent PyYAML.
    config_yml = os.path.join(experiment_dir, 'config.yml')
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # look up the feature extraction class by name in pyannote.audio.features
    feature_extraction_name = config['feature_extraction']['name']
    features_module = __import__('pyannote.audio.features',
                                 fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features_module, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    sliding_window = feature_extraction.sliding_window()
    dimension = feature_extraction.dimension()

    # create metadata file at root that contains
    # sliding window and dimension information
    # (mode='a' is explicit: h5py >= 3.0 opens files read-only by default)
    path = Precomputed.get_config_path(experiment_dir)
    with h5py.File(path, mode='a') as f:
        _write_window_attrs(f, sliding_window, dimension)

    for item in items:

        uri = get_unique_identifier(item)
        path = Precomputed.get_path(experiment_dir, item)

        # features were already extracted for this file
        if os.path.exists(path):
            continue

        try:
            # NOTE item contains the 'channel' key
            features = feature_extraction(item)
        except PyannoteFeatureExtractionError:
            if robust:
                msg = 'Feature extraction failed for file "{uri}".'
                msg = msg.format(uri=uri)
                warnings.warn(msg)
                continue
            # bare raise keeps the original traceback
            raise

        if features is None:
            msg = 'Feature extraction returned None for file "{uri}".'
            msg = msg.format(uri=uri)
            if not robust:
                raise PyannoteFeatureExtractionError(msg)
            warnings.warn(msg)
            continue

        data = features.data

        if np.any(np.isnan(data)):
            msg = 'Feature extraction returned NaNs for file "{uri}".'
            msg = msg.format(uri=uri)
            if not robust:
                raise PyannoteFeatureExtractionError(msg)
            warnings.warn(msg)
            continue

        # create parent directory
        mkdir_p(os.path.dirname(path))

        # the context manager closes the file even if writing fails
        with h5py.File(path, mode='a') as f:
            _write_window_attrs(f, sliding_window, dimension)
            f.create_dataset('features', data=data)