Example #1
def train(protocol, duration, experiment_dir, train_dir, subset='train',
          min_duration=None, step=None, heterogeneous=False,
          cache=False, robust=False, parallel=False):
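    """Load the experiment configuration, then build the preprocessors,
    feature extraction, architecture and glue (loss wiring) by name, and
    train the sequence embedding on the requested subset."""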

    # -- TRAINING --
    nb_epoch = 1000
    optimizer = SSMORMS3()
    batch_size = 8192

    # load configuration file
    config_yml = experiment_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- PREPROCESSORS --
    for key, preprocessor in config.get('preprocessors', {}).items():
        preprocessor_name = preprocessor['name']
        preprocessor_params = preprocessor.get('params', {})
        preprocessors = __import__('pyannote.audio.preprocessors',
                                   fromlist=[preprocessor_name])
        Preprocessor = getattr(preprocessors, preprocessor_name)
        protocol.preprocessors[key] = Preprocessor(**preprocessor_params)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- ARCHITECTURE --
    architecture_name = config['architecture']['name']
    models = __import__('pyannote.audio.embedding.models',
                        fromlist=[architecture_name])
    Architecture = getattr(models, architecture_name)
    architecture = Architecture(
        **config['architecture'].get('params', {}))

    # -- GLUE --
    glue_name = config['glue']['name']
    glues = __import__('pyannote.audio.embedding',
                       fromlist=[glue_name])
    Glue = getattr(glues, glue_name)
    glue = Glue(feature_extraction,
                duration=duration,
                step=step,
                min_duration=min_duration,
                heterogeneous=heterogeneous,
                cache=cache,
                robust=robust,
                **config['glue'].get('params', {}))

    # actual training
    embedding = SequenceEmbedding(glue=glue)
    embedding.fit(architecture, protocol, nb_epoch, train=subset,
                  optimizer=optimizer, batch_size=batch_size,
                  log_dir=train_dir, max_q_size=1 if parallel else 0)
Example #2
    def on_train_begin(self, logs={}):
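        # Rebuild the current embedding (architecture + weights) as a fresh
        # SequenceEmbedding and attach it to the batch generator, so that
        # sampling can use the model in its state before training starts.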
        current_embedding = self.extract_embedding(self.model)
        architecture = model_from_yaml(current_embedding.to_yaml(),
                                       custom_objects=CUSTOM_OBJECTS)
        current_weights = current_embedding.get_weights()

        from pyannote.audio.embedding.base import SequenceEmbedding
        sequence_embedding = SequenceEmbedding()

        sequence_embedding.embedding_ = architecture
        sequence_embedding.embedding_.set_weights(current_weights)
        setattr(self.generator, self.name, sequence_embedding)
Example #3
    def objective_function(parameters, beta=1.0):
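        # Load the embedding trained up to `epoch`, embed the test sequences,
        # and return the lowest (1 - F-score) over all ROC thresholds; the
        # corresponding threshold is memoized in the enclosing `alphas` dict.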

        epoch = parameters[0]

        weights_h5 = WEIGHTS_H5.format(epoch=epoch)
        sequence_embedding = SequenceEmbedding.from_disk(
            architecture_yml, weights_h5)

        fX = sequence_embedding.transform(X, batch_size=batch_size)

        # compute distance between every pair of sequences
        y_distance = pdist(fX, metric=distance)

        # compute same/different groundtruth
        y_true = pdist(y, metric='chebyshev') < 1

        # false positive / true positive
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(
            y_true, -y_distance, pos_label=True, drop_intermediate=True)

        fnr = 1. - tpr
        far = fpr

        thresholds = -thresholds
        fscore = 1. - f_measure(1. - fnr, 1. - far, beta=beta)

        i = np.nanargmin(fscore)
        alphas[epoch] = float(thresholds[i])
        return fscore[i]
Example #4
def tune(dataset, medium_template, config_yml, weights_dir, output_dir):
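    """Evaluate every saved epoch of a trained embedding on `dataset`:
    compute pairwise distances, plot DET curves and distance distributions,
    and log the equal error rate (EER) of each epoch to eer.txt."""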

    # load configuration file
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    X, y_true = generate_test(dataset, medium_template, config)

    # this is where model architecture was saved
    architecture_yml = os.path.dirname(weights_dir) + '/architecture.yml'

    output_dir = output_dir + '/' + dataset

    # create output directory (it may already exist)
    os.makedirs(output_dir, exist_ok=True)

    nb_epoch = config['training']['nb_epoch']
    WEIGHTS_H5 = weights_dir + '/{epoch:04d}.h5'

    LINE = '{epoch:04d} {eer:.6f}\n'
    PATH = output_dir + '/eer.txt'
    with open(PATH, 'w') as fp:

        for epoch in range(nb_epoch):

            # load model for this epoch
            weights_h5 = WEIGHTS_H5.format(epoch=epoch)
            if not os.path.isfile(weights_h5):
                continue

            sequence_embedding = SequenceEmbedding.from_disk(
                architecture_yml, weights_h5)

            # pairwise euclidean distances between embeddings
            batch_size = config['testing']['batch_size']
            x = sequence_embedding.transform(X,
                                             batch_size=batch_size,
                                             verbose=0)
            distances = pdist(x, metric='euclidean')
            PATH = output_dir + '/plot.{epoch:04d}'
            eer = plot_det_curve(y_true, -distances, PATH.format(epoch=epoch))

            msg = 'Epoch #{epoch:04d} | EER = {eer:.2f}%'
            print(msg.format(epoch=epoch, eer=100 * eer))

            fp.write(LINE.format(epoch=epoch, eer=eer))
            fp.flush()

            # save distribution plots after each epoch
            space = config['network']['space']
            xlim = (0, 2 if space == 'sphere' else np.sqrt(2.))
            plot_distributions(y_true,
                               distances,
                               PATH.format(epoch=epoch),
                               xlim=xlim,
                               ymax=3,
                               nbins=100)
Example #5
def test(dataset, medium_template, config_yml, weights_h5, output_dir):
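    """Evaluate a single trained model: embed the test sequences, then
    report distance distributions, precision/recall AUC and DET-curve EER."""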

    # load configuration file
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    X, y_true = generate_test(dataset, medium_template, config)

    # this is where model architecture was saved
    architecture_yml = os.path.dirname(
        os.path.dirname(weights_h5)) + '/architecture.yml'

    sequence_embedding = SequenceEmbedding.from_disk(architecture_yml,
                                                     weights_h5)

    # pairwise euclidean distances between embeddings
    batch_size = config['testing']['batch_size']
    x = sequence_embedding.transform(X, batch_size=batch_size, verbose=0)
    distances = pdist(x, metric='euclidean')

    # -- distances distributions
    space = config['network']['space']
    xlim = (0, 2 if space == 'sphere' else np.sqrt(2.))
    plot_distributions(y_true,
                       distances,
                       output_dir + '/plot',
                       xlim=xlim,
                       ymax=3,
                       nbins=100)

    # -- precision / recall curve
    auc = plot_precision_recall_curve(y_true, -distances, output_dir + '/plot')
    msg = 'AUC = {auc:.2f}%'
    print(msg.format(auc=100 * auc))

    # -- det curve
    eer = plot_det_curve(y_true, -distances, output_dir + '/plot')
    msg = 'EER = {eer:.2f}%'
    print(msg.format(eer=100 * eer))
Example #6
    def objective_function(parameters, beta=1.0):
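        # `parameters` packs the training epoch and the peak-detection
        # threshold alpha; the objective is 1 - average F-measure between
        # segmentation purity and coverage over the chosen subset.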

        epoch, alpha = parameters

        weights_h5 = WEIGHTS_H5.format(epoch=epoch)
        sequence_embedding = SequenceEmbedding.from_disk(
            architecture_yml, weights_h5)

        segmentation = Segmentation(
            sequence_embedding, feature_extraction,
            duration=duration, step=0.100)

        if epoch not in predictions:
            predictions[epoch] = {}

        purity = SegmentationPurity()
        coverage = SegmentationCoverage()

        f, n = 0., 0
        for dev_file in getattr(protocol, subset)():

            uri = get_unique_identifier(dev_file)
            reference = dev_file['annotation']
            n += 1

            if uri in predictions[epoch]:
                prediction = predictions[epoch][uri]
            else:
                prediction = segmentation.apply(dev_file)
                predictions[epoch][uri] = prediction

            peak = Peak(alpha=alpha)
            hypothesis = peak.apply(prediction)

            p = purity(reference, hypothesis)
            c = coverage(reference, hypothesis)
            f += f_measure(c, p, beta=beta)

        return 1 - (f / n)
Example #7
def embed(protocol, tune_dir, apply_dir, subset='test', step=None,
          internal=None, aggregate=False):
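    """Apply a tuned sequence embedding to a whole subset, precomputing
    embeddings to HDF5 files (one per item, plus a root metadata file)."""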

    mkdir_p(apply_dir)

    train_dir = os.path.dirname(os.path.dirname(tune_dir))

    duration, _, _, heterogeneous = \
        path_to_duration(os.path.basename(train_dir))

    config_dir = os.path.dirname(os.path.dirname(os.path.dirname(train_dir)))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- HYPER-PARAMETERS --
    tune_yml = tune_dir + '/tune.yml'
    with open(tune_yml, 'r') as fp:
        tune = yaml.load(fp, Loader=yaml.SafeLoader)

    architecture_yml = train_dir + '/architecture.yml'
    WEIGHTS_H5 = train_dir + '/weights/{epoch:04d}.h5'
    weights_h5 = WEIGHTS_H5.format(epoch=tune['epoch'])

    sequence_embedding = SequenceEmbedding.from_disk(
        architecture_yml, weights_h5)

    extraction = Extraction(sequence_embedding, feature_extraction,
                            duration=duration, step=step,
                            internal=internal, aggregate=aggregate)

    dimension = extraction.dimension
    sliding_window = extraction.sliding_window

    # create metadata file at root that contains
    # sliding window and dimension information
    path = Precomputed.get_config_path(apply_dir)
    f = h5py.File(path, mode='w')
    f.attrs['start'] = sliding_window.start
    f.attrs['duration'] = sliding_window.duration
    f.attrs['step'] = sliding_window.step
    f.attrs['dimension'] = dimension
    f.close()

    for item in getattr(protocol, subset)():

        uri = get_unique_identifier(item)
        path = Precomputed.get_path(apply_dir, item)

        extracted = extraction.apply(item)

        # create parent directory
        mkdir_p(os.path.dirname(path))

        f = h5py.File(path, mode='w')
        f.attrs['start'] = sliding_window.start
        f.attrs['duration'] = sliding_window.duration
        f.attrs['step'] = sliding_window.step
        f.attrs['dimension'] = dimension
        f.create_dataset('features', data=extracted.data)
        f.close()
Example #8
def test(protocol, tune_dir, test_dir, subset, beta=1.0):
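    """Evaluate the tuned embedding on `subset`: compute FAR/FRR curves,
    the optimal F-score operating point and the operating point at the
    tuned threshold, then write plots and a summary text file."""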

    batch_size = 32

    # create output directory (it may already exist)
    os.makedirs(test_dir, exist_ok=True)

    train_dir = os.path.dirname(os.path.dirname(tune_dir))

    # -- DURATIONS --
    duration, min_duration, step, heterogeneous = \
        path_to_duration(os.path.basename(train_dir))

    config_dir = os.path.dirname(os.path.dirname(os.path.dirname(train_dir)))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- PREPROCESSORS --
    for key, preprocessor in config.get('preprocessors', {}).items():
        preprocessor_name = preprocessor['name']
        preprocessor_params = preprocessor.get('params', {})
        preprocessors = __import__('pyannote.audio.preprocessors',
                                   fromlist=[preprocessor_name])
        Preprocessor = getattr(preprocessors, preprocessor_name)
        protocol.preprocessors[key] = Preprocessor(**preprocessor_params)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    distance = config['glue'].get('params', {}).get('distance', 'sqeuclidean')

    # -- HYPER-PARAMETERS --
    tune_yml = tune_dir + '/tune.yml'
    with open(tune_yml, 'r') as fp:
        tune = yaml.load(fp, Loader=yaml.SafeLoader)

    architecture_yml = train_dir + '/architecture.yml'
    WEIGHTS_H5 = train_dir + '/weights/{epoch:04d}.h5'
    weights_h5 = WEIGHTS_H5.format(epoch=tune['epoch'])

    sequence_embedding = SequenceEmbedding.from_disk(
        architecture_yml, weights_h5)

    X, y = generate_test(protocol, subset, feature_extraction,
                         duration, min_duration=min_duration, step=step)
    fX = sequence_embedding.transform(X, batch_size=batch_size)
    y_distance = pdist(fX, metric=distance)
    y_true = pdist(y, metric='chebyshev') < 1

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(
        y_true, -y_distance, pos_label=True, drop_intermediate=True)

    frr = 1. - tpr
    far = fpr
    thresholds = -thresholds

    eer_index = np.where(far > frr)[0][0]
    eer = .25 * (far[eer_index-1] + far[eer_index] +
                 frr[eer_index-1] + frr[eer_index])

    fscore = 1. - f_measure(1. - frr, 1. - far, beta=beta)

    opt_i = np.nanargmin(fscore)
    opt_alpha = float(thresholds[opt_i])
    opt_far = far[opt_i]
    opt_frr = frr[opt_i]
    opt_fscore = fscore[opt_i]

    alpha = tune['alpha']
    actual_i = np.searchsorted(thresholds, alpha)
    actual_far = far[actual_i]
    actual_frr = frr[actual_i]
    actual_fscore = fscore[actual_i]

    save_to = test_dir + '/' + subset
    plot_distributions(y_true, y_distance, save_to)
    # note: plot_det_curve's EER overwrites the crossing-point estimate above
    eer = plot_det_curve(y_true, -y_distance, save_to)
    plot_precision_recall_curve(y_true, -y_distance, save_to)

    with open(save_to + '.txt', 'w') as fp:
        fp.write('# cond. thresh  far     frr     fscore  eer\n')
        TEMPLATE = '{condition} {alpha:.5f} {far:.5f} {frr:.5f} {fscore:.5f} {eer:.5f}\n'
        fp.write(TEMPLATE.format(condition='optimal',
                                 alpha=opt_alpha,
                                 far=opt_far,
                                 frr=opt_frr,
                                 fscore=opt_fscore,
                                 eer=eer))
        fp.write(TEMPLATE.format(condition='actual ',
                                 alpha=alpha,
                                 far=actual_far,
                                 frr=actual_frr,
                                 fscore=actual_fscore,
                                 eer=eer))
Example #9
def validate(protocol, train_dir, validation_dir, subset='development'):
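    """Watch the training directory and, as each epoch's weights appear,
    compute and plot the EER on the validation subset (runs forever)."""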

    mkdir_p(validation_dir)

    # -- DURATIONS --
    duration, min_duration, step, heterogeneous = \
        path_to_duration(os.path.basename(train_dir))

    # -- CONFIGURATION --
    config_dir = os.path.dirname(os.path.dirname(os.path.dirname(train_dir)))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- DISTANCE --
    distance = config['glue'].get('params', {}).get('distance', 'sqeuclidean')

    # -- PREPROCESSORS --
    for key, preprocessor in config.get('preprocessors', {}).items():
        preprocessor_name = preprocessor['name']
        preprocessor_params = preprocessor.get('params', {})
        preprocessors = __import__('pyannote.audio.preprocessors',
                                   fromlist=[preprocessor_name])
        Preprocessor = getattr(preprocessors, preprocessor_name)
        protocol.preprocessors[key] = Preprocessor(**preprocessor_params)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    architecture_yml = train_dir + '/architecture.yml'
    WEIGHTS_H5 = train_dir + '/weights/{epoch:04d}.h5'

    EER_TEMPLATE = '{epoch:04d} {now} {eer:.5f}\n'
    eers = []

    path = validation_dir + '/{subset}.eer.txt'.format(subset=subset)
    with open(path, mode='w') as fp:

        epoch = 0
        while True:

            # wait until weight file is available
            weights_h5 = WEIGHTS_H5.format(epoch=epoch)
            if not os.path.isfile(weights_h5):
                time.sleep(60)
                continue

            now = datetime.datetime.now().isoformat()

            # load current model
            sequence_embedding = SequenceEmbedding.from_disk(
                architecture_yml, weights_h5)

            # if speaker recognition protocol
            if isinstance(protocol, SpeakerRecognitionProtocol):

                aggregation = SequenceEmbeddingAggregation(
                    sequence_embedding, feature_extraction,
                    duration=duration, min_duration=min_duration,
                    step=step, internal=-2, batch_size=8192)
                aggregation.cache_preprocessed_ = False

                # compute equal error rate
                _, _, _, eer = speaker_recognition_xp(
                    aggregation, protocol, subset=subset, distance=distance)

            elif isinstance(protocol, SpeakerDiarizationProtocol):

                if epoch == 0:
                    X, y = generate_test(
                        protocol, subset, feature_extraction,
                        duration, min_duration=min_duration, step=step,
                        heterogeneous=heterogeneous)

                _, _, _, eer = speaker_diarization_xp(
                    sequence_embedding, X, y, distance=distance)

            fp.write(EER_TEMPLATE.format(epoch=epoch, eer=eer, now=now))
            fp.flush()

            eers.append(eer)
            best_epoch = np.argmin(eers)
            best_value = np.min(eers)
            fig = plt.figure()
            plt.plot(eers, 'b')
            plt.plot([best_epoch], [best_value], 'bo')
            plt.plot([0, epoch], [best_value, best_value], 'k--')
            plt.grid(True)
            plt.xlabel('epoch')
            plt.ylabel('EER on {subset}'.format(subset=subset))
            TITLE = 'EER = {best_value:.5g} on {subset} @ epoch #{best_epoch:d}'
            title = TITLE.format(best_value=best_value,
                                 best_epoch=best_epoch,
                                 subset=subset)
            plt.title(title)
            plt.tight_layout()
            path = validation_dir + '/{subset}.eer.png'.format(subset=subset)
            plt.savefig(path, dpi=75)
            plt.close(fig)

            # skip to next epoch
            epoch += 1
Example #10
    def on_epoch_end(self, epoch, logs={}):
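        # At the end of each epoch: wrap the current model into a
        # SequenceEmbedding, embed all enroll/test recordings, run the
        # protocol trials, then log and plot the resulting EER.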

        # keep track of current time
        now = datetime.datetime.now().isoformat()
        prefix = self.log_dir + '/{subset}.plot.{epoch:04d}'.format(
            epoch=epoch, subset=self.subset)

        from pyannote.audio.embedding.base import SequenceEmbedding
        sequence_embedding = SequenceEmbedding()
        sequence_embedding.embedding_ = self.glue.extract_embedding(self.model)

        from pyannote.audio.embedding.aggregation import \
            SequenceEmbeddingAggregation
        aggregation = SequenceEmbeddingAggregation(
            sequence_embedding,
            self.glue.feature_extractor,
            duration=self.glue.duration,
            min_duration=self.glue.min_duration,
            step=self.glue.step,
            internal=-2)

        # TODO: pass internal as parameter
        aggregation.cache_preprocessed_ = False

        # embed enroll and test recordings

        method = '{subset}_enroll'.format(subset=self.subset)
        enroll = getattr(self.protocol, method)(yield_name=True)

        method = '{subset}_test'.format(subset=self.subset)
        test = getattr(self.protocol, method)(yield_name=True)

        fX = {}
        for name, item in itertools.chain(enroll, test):
            if name in fX:
                continue
            embeddings = aggregation.apply(item)
            fX[name] = np.sum(embeddings.data, axis=0)

        # perform trials

        method = '{subset}_keys'.format(subset=self.subset)
        keys = getattr(self.protocol, method)()

        enroll_fX = l2_normalize(np.vstack([fX[name] for name in keys.index]))
        test_fX = l2_normalize(np.vstack([fX[name] for name in keys]))

        D = cdist(enroll_fX, test_fX, metric=self.glue.distance)

        y_true = []
        y_pred = []
        key_mapping = {0: None, -1: 0, 1: 1}
        for i, _ in enumerate(keys.index):
            for j, _ in enumerate(keys):
                y = key_mapping[keys.iloc[i, j]]
                if y is None:
                    continue

                y_true.append(y)
                y_pred.append(D[i, j])

        y_true = np.array(y_true)
        y_pred = np.array(y_pred)

        # plot DET curve once every 20 epochs (and 10 first epochs)
        if (epoch < 10) or (epoch % 20 == 0):
            eer = plot_det_curve(y_true,
                                 y_pred,
                                 prefix,
                                 distances=True,
                                 dpi=75)
        else:
            _, _, _, eer = det_curve(y_true, y_pred, distances=True)

        # store equal error rate in file
        mode = 'a' if epoch else 'w'
        path = self.log_dir + '/{subset}.eer.txt'.format(subset=self.subset)
        with open(path, mode=mode) as fp:
            fp.write(self.EER_TEMPLATE_.format(epoch=epoch, eer=eer, now=now))
            fp.flush()

        # plot eer = f(epoch)
        self.eer_.append(eer)
        best_epoch = np.argmin(self.eer_)
        best_value = np.min(self.eer_)
        fig = plt.figure()
        plt.plot(self.eer_, 'b')
        plt.plot([best_epoch], [best_value], 'bo')
        plt.plot([0, epoch], [best_value, best_value], 'k--')
        plt.grid(True)
        plt.xlabel('epoch')
        plt.ylabel('EER on {subset}'.format(subset=self.subset))
        TITLE = 'EER = {best_value:.5g} on {subset} @ epoch #{best_epoch:d}'
        title = TITLE.format(best_value=best_value,
                             best_epoch=best_epoch,
                             subset=self.subset)
        plt.title(title)
        plt.tight_layout()
        path = self.log_dir + '/{subset}.eer.png'.format(subset=self.subset)
        plt.savefig(path, dpi=75)
        plt.close(fig)
Example #11
def train(dataset, medium_template, config_yml):
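    """Train a TristouNet embedding with triplet loss, using settings read
    from the YAML configuration file."""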

    # load configuration file
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch
    log_dir = workdir + '/' + dataset

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'unknown subset "{subset}"'.format(subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    # input sequence duration
    duration = config['feature_extraction']['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**config['feature_extraction']['mfcc'])
    # normalization
    normalize = config['feature_extraction']['normalize']

    # -- NETWORK STRUCTURE --
    # internal model structure
    output_dim = config['network']['output_dim']
    lstm = config['network']['lstm']
    pooling = config['network'].get('pooling', 'last')
    dense = config['network']['dense']
    # bi-directional
    bidirectional = config['network']['bidirectional']
    space = config['network']['space']

    # -- TRAINING --
    # batch size
    batch_size = config['training']['batch_size']
    # number of epochs
    nb_epoch = config['training']['nb_epoch']
    # optimizer
    optimizer = config['training']['optimizer']

    # -- TRIPLET LOSS --
    margin = config['training']['triplet_loss']['margin']
    per_fold = config['training']['triplet_loss']['per_fold']
    per_label = config['training']['triplet_loss']['per_label']
    overlap = config['training']['triplet_loss']['overlap']

    # embedding
    get_embedding = TristouNet(lstm=lstm,
                               bidirectional=bidirectional,
                               pooling=pooling,
                               dense=dense,
                               output_dim=output_dim,
                               space=space)

    loss = TripletLoss(get_embedding, margin=margin)

    embedding = SequenceEmbedding(loss=loss,
                                  optimizer=optimizer,
                                  log_dir=log_dir)

    # triplet generator for training
    generator = TripletBatchGenerator(feature_extractor,
                                      file_generator,
                                      embedding,
                                      margin=margin,
                                      duration=duration,
                                      overlap=overlap,
                                      normalize=normalize,
                                      per_fold=per_fold,
                                      per_label=per_label,
                                      batch_size=batch_size)

    # log loss during training and keep track of best model
    log = [('train', 'loss')]
    callback = LoggingCallback(log_dir=log_dir,
                               log=log,
                               get_model=loss.get_embedding)

    # estimated number of triplets per epoch
    # (rounded down to a multiple of batch_size)
    samples_per_epoch = per_label * (per_label - 1) * generator.n_labels
    samples_per_epoch = samples_per_epoch - (samples_per_epoch % batch_size)

    # input shape (n_samples, n_features)
    input_shape = generator.get_shape()

    embedding.fit(input_shape,
                  generator,
                  samples_per_epoch,
                  nb_epoch,
                  callbacks=[callback])
Example #12
def test(protocol, tune_dir, apply_dir, subset='test', beta=1.0):
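    """Apply the tuned segmentation pipeline to `subset`: save per-file soft
    scores and hard segmentations, and log purity/coverage/F-measure."""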

    os.makedirs(apply_dir)

    train_dir = os.path.dirname(os.path.dirname(os.path.dirname(tune_dir)))

    duration = float(os.path.basename(train_dir))
    config_dir = os.path.dirname(os.path.dirname(os.path.dirname(train_dir)))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- HYPER-PARAMETERS --
    tune_yml = tune_dir + '/tune.yml'
    with open(tune_yml, 'r') as fp:
        tune = yaml.load(fp, Loader=yaml.SafeLoader)

    architecture_yml = train_dir + '/architecture.yml'
    WEIGHTS_H5 = train_dir + '/weights/{epoch:04d}.h5'
    weights_h5 = WEIGHTS_H5.format(epoch=tune['epoch'])

    sequence_embedding = SequenceEmbedding.from_disk(
        architecture_yml, weights_h5)

    segmentation = Segmentation(
        sequence_embedding, feature_extraction,
        duration=duration, step=0.100)

    peak = Peak(alpha=tune['alpha'])

    HARD_JSON = apply_dir + '/{uri}.hard.json'
    SOFT_PKL = apply_dir + '/{uri}.soft.pkl'

    eval_txt = apply_dir + '/eval.txt'
    TEMPLATE = '{uri} {purity:.5f} {coverage:.5f} {f_measure:.5f}\n'
    purity = SegmentationPurity()
    coverage = SegmentationCoverage()
    fscore = []

    for test_file in getattr(protocol, subset)():

        soft = segmentation.apply(test_file)
        hard = peak.apply(soft)

        uri = get_unique_identifier(test_file)

        path = SOFT_PKL.format(uri=uri)
        mkdir_p(os.path.dirname(path))
        with open(path, 'wb') as fp:  # pickle requires binary mode
            pickle.dump(soft, fp)

        path = HARD_JSON.format(uri=uri)
        mkdir_p(os.path.dirname(path))
        with open(path, 'w') as fp:
            pyannote.core.json.dump(hard, fp)

        # skip files that come without groundtruth
        try:
            reference = test_file['annotation']
            uem = test_file['annotated']
        except KeyError:
            continue

        p = purity(reference, hard)
        c = coverage(reference, hard)
        f = f_measure(c, p, beta=beta)
        fscore.append(f)

        line = TEMPLATE.format(
            uri=uri, purity=p, coverage=c, f_measure=f)
        with open(eval_txt, 'a') as fp:
            fp.write(line)

    # abs() of a pyannote metric returns its value aggregated over all files
    p = abs(purity)
    c = abs(coverage)
    f = np.mean(fscore)
    line = TEMPLATE.format(
        uri='ALL', purity=p, coverage=c, f_measure=f)
    with open(eval_txt, 'a') as fp:
        fp.write(line)
Example #13
# NOTE: this excerpt begins mid-statement (the tail of a feature-extractor
# call ending with DD=True); the definitions of WAV_TEMPLATE, LOG_DIR,
# duration, nb_epoch and feature_extractor are not part of the excerpt.

# ETAPE database
medium_template = {'wav': WAV_TEMPLATE}
from pyannote.database import Etape
database = Etape(medium_template=medium_template)

# experimental protocol (ETAPE TV subset)
protocol = database.get_protocol('SpeakerDiarization', 'TV')

from pyannote.audio.embedding.base import SequenceEmbedding

# load pre-trained embedding
architecture_yml = LOG_DIR + '/architecture.yml'
weights_h5 = LOG_DIR + '/weights/{epoch:04d}.h5'.format(epoch=nb_epoch - 1)
embedding = SequenceEmbedding.from_disk(architecture_yml, weights_h5)

from pyannote.audio.embedding.segmentation import Segmentation
segmentation = Segmentation(embedding,
                            feature_extractor,
                            duration=duration,
                            step=0.100)

# process files from development set
# (and, while we are at it, load groundtruth for later comparison)
predictions = {}
groundtruth = {}
for test_file in protocol.development():
    uri = test_file['uri']
    groundtruth[uri] = test_file['annotation']
    wav = test_file['medium']['wav']
Example #14
database = Etape(medium_template=medium_template)

# experimental protocol (ETAPE TV subset)
protocol = database.get_protocol('SpeakerDiarization', 'TV')

# TristouNet architecture
from pyannote.audio.embedding.models import TristouNet
architecture = TristouNet()

# triplet loss
from pyannote.audio.embedding.losses import TripletLoss
margin = 0.2  # `alpha` in the paper
loss = TripletLoss(architecture, margin=margin)

from pyannote.audio.embedding.base import SequenceEmbedding
embedding = SequenceEmbedding(loss=loss, optimizer='rmsprop', log_dir=LOG_DIR)

# triplet sampling
# this might take some time as the whole corpus is loaded in memory,
# and the whole set of MFCC features sequences is precomputed
from pyannote.audio.embedding.generator import TripletBatchGenerator
per_label = 40  # `n` in the paper
batch_size = 8192
generator = TripletBatchGenerator(feature_extractor,
                                  protocol.train(),
                                  embedding,
                                  margin=margin,
                                  duration=duration,
                                  per_label=per_label,
                                  batch_size=batch_size)
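
# -- TRAINING (hedged continuation sketch, not part of the original
# excerpt) -- it follows the `embedding.fit(...)` call pattern shown in
# Example #11; `generator.get_shape()` and `generator.n_labels` are assumed
# to behave as they do there.
input_shape = generator.get_shape()

# estimated number of triplets per epoch, rounded down to a batch multiple
samples_per_epoch = per_label * (per_label - 1) * generator.n_labels
samples_per_epoch = samples_per_epoch - (samples_per_epoch % batch_size)

nb_epoch = 50  # number of epochs (arbitrary value for this sketch)
embedding.fit(input_shape, generator, samples_per_epoch, nb_epoch)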