def protocol(
    database: str = typer.Option(
        "",
        "--database",
        "-d",
        metavar="DATABASE",
        help="Filter protocols by DATABASE.",
        case_sensitive=False,
    ),
    task: Task = typer.Option(
        "Any",
        "--task",
        "-t",
        help="Filter protocols by TASK.",
        case_sensitive=False,
    ),
):
    """Print list of protocols"""
    # NOTE: the docstring above is presumably surfaced by typer as the
    # command help text, so it is deliberately left untouched.

    # an empty --database means "no filter": walk every known database
    selected_databases = [database] if database != "" else get_databases()

    for database_name in selected_databases:
        db: Database = get_database(database_name)

        # "Any" means "no filter": walk every task of this database
        if task == "Any":
            task_names = db.get_tasks()
        else:
            task_names = [task]

        for task_name in task_names:
            # databases that do not provide this task are silently skipped
            try:
                protocol_names = db.get_protocols(task_name)
            except KeyError:
                continue

            # one fully qualified "Database.Task.Protocol" name per line
            for protocol_name in protocol_names:
                typer.echo(f"{database_name}.{task_name}.{protocol_name}")
def task(
    database: str = typer.Option(
        "",
        "--database",
        "-d",
        metavar="DATABASE",
        help="Filter tasks by DATABASE.",
        case_sensitive=False,
    ),
):
    """Print list of tasks"""
    # NOTE: the docstring above is presumably surfaced by typer as the
    # command help text, so it is deliberately left untouched.

    if database != "":
        # restrict the listing to the tasks of the requested database
        db: Database = get_database(database)
        task_names = db.get_tasks()
    else:
        # no filter: list tasks across all databases
        task_names = get_tasks()

    # one task name per line
    for task_name in task_names:
        typer.echo(f"{task_name}")
min_duration = float(tokens[0]) if len(tokens) == 2 else None duration = float(tokens[0]) if len(tokens) == 1 else float(tokens[1]) return duration, min_duration, step, heterogeneous if __name__ == '__main__': arguments = docopt(__doc__, version='Speaker embedding') db_yml = os.path.expanduser(arguments['--database']) preprocessors = {'wav': FileFinder(db_yml)} if '<database.task.protocol>' in arguments: protocol = arguments['<database.task.protocol>'] database_name, task_name, protocol_name = protocol.split('.') database = get_database(database_name, preprocessors=preprocessors) protocol = database.get_protocol(task_name, protocol_name, progress=True) subset = arguments['--subset'] if arguments['train']: experiment_dir = arguments['<experiment_dir>'] if subset is None: subset = 'train' duration = float(arguments['--duration']) min_duration = arguments['--min-duration'] if min_duration is not None: min_duration = float(min_duration)
def train(dataset, medium_template, config_yml):
    """Train a speech activity detection sequence labeling model.

    Hyper-parameters are read from `config_yml`. A two-class `StackedLSTM`
    is wrapped into `SequenceLabeling` and fitted on batches produced by
    `SpeechActivityDetectionBatchGenerator` from the requested protocol
    subset.

    Parameters
    ----------
    dataset : str
        Dotted identifier made of exactly four parts:
        'database.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file. Its parent directory is used
        as working directory: logs go to '<workdir>/<dataset>'.
        Expected keys:
        feature_extraction.{duration, mfcc, normalize},
        network.{lstm, dense, bidirectional},
        training.{hours_per_epoch, overlap, batch_size, nb_epoch, optimizer}.

    Raises
    ------
    ValueError
        If `dataset` does not split into exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    KeyError
        If an expected configuration key is missing.
    """

    # load configuration file
    # (safe_load: plain `yaml.load(fp)` without a Loader raises TypeError on
    # PyYAML >= 6, and the configuration only needs plain YAML types)
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch
    # (os.path.join: when config_yml has no directory part, workdir is ''
    # and string concatenation would wrongly point to '/<dataset>')
    log_dir = os.path.join(workdir, dataset)

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        msg = 'Protocol "{dataset}" has no "{subset}" subset.'
        raise NotImplementedError(msg.format(dataset=dataset, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- NETWORK STRUCTURE --
    network_config = config['network']
    # internal model structure
    lstm = network_config['lstm']
    dense = network_config['dense']
    # bi-directional
    bidirectional = network_config['bidirectional']

    # -- TRAINING --
    training_config = config['training']
    # number training set hours (speech + non speech) to use in each epoch
    # FIXME -- update ETAPE so that we can query this information directly
    hours_per_epoch = training_config['hours_per_epoch']
    # overlap ratio between each window
    overlap = training_config['overlap']
    # batch size
    batch_size = training_config['batch_size']
    # number of epochs
    nb_epoch = training_config['nb_epoch']
    # optimizer
    optimizer = training_config['optimizer']

    # labeling
    n_classes = 2
    design_model = StackedLSTM(n_classes=n_classes,
                               lstm=lstm,
                               bidirectional=bidirectional,
                               dense=dense)

    labeling = SequenceLabeling(design_model,
                                optimizer=optimizer,
                                log_dir=log_dir)

    # segment generator for training
    step = duration * (1. - overlap)
    batch_generator = SpeechActivityDetectionBatchGenerator(
        feature_extractor, duration=duration, normalize=normalize,
        step=step, batch_size=batch_size)

    # log loss and accuracy during training and
    # keep track of best models for both metrics
    log = [('train', 'loss'), ('train', 'accuracy')]
    callback = LoggingCallback(log_dir=log_dir, log=log)

    # number of samples per epoch, rounded up to a whole number of batches
    n_batches = int(np.ceil((3600 * hours_per_epoch / step) / batch_size))
    samples_per_epoch = batch_size * n_batches

    # input shape (n_frames, n_features)
    input_shape = batch_generator.get_shape()

    generator = batch_generator(file_generator, infinite=True)

    labeling.fit(input_shape, generator, samples_per_epoch, nb_epoch,
                 callbacks=[callback])
def test(dataset, medium_template, config_yml, weights_h5, output_dir):
    """Evaluate a trained speech activity detection model on a protocol subset.

    For every file of the subset, predictions are aggregated, binarized and
    compared to the reference annotation. One line per file (followed by one
    line for the whole corpus) is written to
    '<output_dir>/eval.<dataset>.<subset>.txt', and each hypothesis is dumped
    to '<output_dir>/<uri>.json'.

    Parameters
    ----------
    dataset : str
        Dotted identifier made of exactly four parts:
        'database.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file. Expected keys:
        feature_extraction.{duration, mfcc, normalize},
        testing.overlap, testing.binarize.{onset, offset}.
    weights_h5 : str
        Path to the weights file. 'architecture.yml' is looked up two
        directory levels above it.
    output_dir : str
        Directory where evaluation results and hypotheses are written.

    Raises
    ------
    ValueError
        If `dataset` does not split into exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    KeyError
        If an expected configuration key is missing.
    """

    # load configuration file
    # (safe_load: plain `yaml.load(fp)` without a Loader raises TypeError on
    # PyYAML >= 6, and the configuration only needs plain YAML types)
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # this is where model architecture was saved
    # (os.path.join: when the grand-parent directory is '', string
    # concatenation would wrongly point to '/architecture.yml')
    architecture_yml = os.path.join(
        os.path.dirname(os.path.dirname(weights_h5)), 'architecture.yml')

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        msg = 'Protocol "{dataset}" has no "{subset}" subset.'
        raise NotImplementedError(msg.format(dataset=dataset, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- TESTING --
    testing_config = config['testing']
    # overlap ratio between each window
    overlap = testing_config['overlap']
    step = duration * (1. - overlap)

    # prediction smoothing
    # (the configured thresholds are now actually used: they used to be read
    # and then ignored in favor of hard-coded 0.5 / 0.5)
    onset = testing_config['binarize']['onset']
    offset = testing_config['binarize']['offset']
    binarizer = Binarize(onset=onset, offset=offset)

    sequence_labeling = SequenceLabeling.from_disk(
        architecture_yml, weights_h5)

    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extractor, normalize=normalize,
        duration=duration, step=step)

    # the same collar is shared by all four metrics
    collar = 0.500
    error_rate = DetectionErrorRate(collar=collar)
    accuracy = DetectionAccuracy(collar=collar)
    precision = DetectionPrecision(collar=collar)
    recall = DetectionRecall(collar=collar)

    LINE = '{uri} {e:.3f} {a:.3f} {p:.3f} {r:.3f} {f:.3f}\n'

    # NOTE: `dataset` already ends with the subset name, so it appears twice
    # in the file name; kept as-is for backward compatibility.
    eval_txt = '{output_dir}/eval.{dataset}.{subset}.txt'.format(
        output_dir=output_dir, dataset=dataset, subset=subset)

    with open(eval_txt, 'w') as fp:

        header = '# uri error accuracy precision recall f_measure\n'
        fp.write(header)
        # flush after every line so that progress can be followed on disk
        fp.flush()

        for current_file in file_generator:

            uri = current_file['uri']
            wav = current_file['medium']['wav']
            annotated = current_file['annotated']
            annotation = current_file['annotation']

            predictions = aggregation.apply(wav)
            hypothesis = binarizer.apply(predictions, dimension=1)

            # calling a metric returns the value for this file (and the
            # metric objects are queried again below for the corpus average)
            e = error_rate(annotation, hypothesis, uem=annotated)
            a = accuracy(annotation, hypothesis, uem=annotated)
            p = precision(annotation, hypothesis, uem=annotated)
            r = recall(annotation, hypothesis, uem=annotated)
            f = f_measure(p, r)

            line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
            fp.write(line)
            fp.flush()

            hypothesis_json = '{output_dir}/{uri}.json'.format(
                output_dir=output_dir, uri=uri)
            dump_to(hypothesis, hypothesis_json)

        # average on whole corpus
        uri = '{dataset}.{subset}'.format(dataset=dataset, subset=subset)
        e = abs(error_rate)
        a = abs(accuracy)
        p = abs(precision)
        r = abs(recall)
        f = f_measure(p, r)
        line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
        fp.write(line)
        fp.flush()
def train(dataset, medium_template, config_yml):
    """Train a `TristouNet` sequence embedding with triplet loss.

    Hyper-parameters are read from `config_yml`. The embedding is wrapped
    into `SequenceEmbedding` and fitted on triplets produced by
    `TripletBatchGenerator` from the requested protocol subset.

    Parameters
    ----------
    dataset : str
        Dotted identifier made of exactly four parts:
        'database.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file. Its parent directory is used
        as working directory: logs go to '<workdir>/<dataset>'.
        Expected keys:
        feature_extraction.{duration, mfcc, normalize},
        network.{output_dim, lstm, dense, bidirectional, space} and the
        optional network.pooling (defaults to 'last'),
        training.{batch_size, nb_epoch, optimizer},
        training.triplet_loss.{margin, per_fold, per_label, overlap}.

    Raises
    ------
    ValueError
        If `dataset` does not split into exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    KeyError
        If an expected configuration key is missing.
    """

    # load configuration file
    # (safe_load: plain `yaml.load(fp)` without a Loader raises TypeError on
    # PyYAML >= 6, and the configuration only needs plain YAML types)
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch
    # (os.path.join: when config_yml has no directory part, workdir is ''
    # and string concatenation would wrongly point to '/<dataset>')
    log_dir = os.path.join(workdir, dataset)

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        msg = 'Protocol "{dataset}" has no "{subset}" subset.'
        raise NotImplementedError(msg.format(dataset=dataset, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- NETWORK STRUCTURE --
    network_config = config['network']
    # internal model structure
    output_dim = network_config['output_dim']
    lstm = network_config['lstm']
    pooling = network_config.get('pooling', 'last')
    dense = network_config['dense']
    # bi-directional
    bidirectional = network_config['bidirectional']
    space = network_config['space']

    # -- TRAINING --
    training_config = config['training']
    # batch size
    batch_size = training_config['batch_size']
    # number of epochs
    nb_epoch = training_config['nb_epoch']
    # optimizer
    optimizer = training_config['optimizer']

    # -- TRIPLET LOSS --
    triplet_config = training_config['triplet_loss']
    margin = triplet_config['margin']
    per_fold = triplet_config['per_fold']
    per_label = triplet_config['per_label']
    overlap = triplet_config['overlap']

    # embedding
    get_embedding = TristouNet(
        lstm=lstm, bidirectional=bidirectional, pooling=pooling,
        dense=dense, output_dim=output_dim, space=space)

    loss = TripletLoss(get_embedding, margin=margin)

    embedding = SequenceEmbedding(
        loss=loss, optimizer=optimizer, log_dir=log_dir)

    # triplet generator for training
    generator = TripletBatchGenerator(
        feature_extractor, file_generator, embedding,
        margin=margin, duration=duration, overlap=overlap,
        normalize=normalize, per_fold=per_fold, per_label=per_label,
        batch_size=batch_size)

    # log loss during training and keep track of best model
    log = [('train', 'loss')]
    callback = LoggingCallback(log_dir=log_dir, log=log,
                               get_model=loss.get_embedding)

    # estimated number of triplets per epoch
    # (rounded DOWN to a multiple of batch_size)
    samples_per_epoch = per_label * (per_label - 1) * generator.n_labels
    samples_per_epoch -= samples_per_epoch % batch_size

    # input shape (n_samples, n_features)
    input_shape = generator.get_shape()

    embedding.fit(input_shape, generator, samples_per_epoch, nb_epoch,
                  callbacks=[callback])
def generate_test(dataset, medium_template, config):
    """Build a fixed evaluation set of sequences and pairwise same-label flags.

    Every labeled fixed-duration sequence of the requested protocol subset
    is extracted, then `per_label` sequences are drawn (with replacement)
    for each label using a fixed random seed.

    Parameters
    ----------
    dataset : str
        Dotted identifier made of exactly four parts:
        'database.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config : dict
        Already-loaded configuration. Keys read:
        feature_extraction.{duration, mfcc, normalize},
        testing.{overlap, per_label}.

    Returns
    -------
    X : numpy array
        Selected sequences (`per_label` per label, stacked along first axis).
    y_true : numpy array of bool
        Condensed pairwise vector (as returned by `pdist`): True when both
        sequences of the pair share the same label.

    Raises
    ------
    ValueError
        If `dataset` does not split into exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    KeyError
        If an expected configuration key is missing.

    Notes
    -----
    Calls `np.random.seed(1337)`, i.e. re-seeds NumPy's *global* random
    state as a side effect.
    """

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        msg = 'Protocol "{dataset}" has no "{subset}" subset.'
        raise NotImplementedError(msg.format(dataset=dataset, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    overlap = config['testing']['overlap']
    per_label = config['testing']['per_label']

    # config['testing']['batch_size'] is intentionally not read: the
    # generator below is always created with batch_size=-1
    batch_generator = LabeledFixedDurationSequencesBatchGenerator(
        feature_extractor, duration=duration, normalize=normalize,
        step=(1 - overlap) * duration, batch_size=-1)

    X, y = [], []
    for sequences, labels in batch_generator(file_generator):
        X.append(sequences)
        y.append(labels)
    X = np.vstack(X)
    y = np.hstack(y)

    # replace labels by their integer index in the sorted list of labels
    unique, y = np.unique(y, return_inverse=True)

    # randomly (but deterministically) select 'per_label' samples from each
    # class and only compute positive vs. negative distances for those
    # samples: this should ensure all speakers have the same weights
    np.random.seed(1337)

    # indices of all sequences to be used for later triplet selection
    # (sampling is done with replacement, label after label, so that the
    # sequence of random draws is the same from one call to the next)
    indices = []
    for label in range(len(unique)):
        candidates = np.where(y == label)[0]
        indices.append(
            np.random.choice(candidates, size=per_label, replace=True))

    # turn indices into a 1-dimensional numpy array.
    indices = np.hstack(indices)

    # selected sequences
    X = X[indices]

    # their pairwise similarity: labels are integers, so a Chebyshev
    # distance strictly below 1 means "same label"
    y_true = pdist(y[indices, np.newaxis], metric='chebyshev') < 1

    return X, y_true
# Smoke test of the pyannote.database registry: checks that a custom
# database called "MyDatabase", its tasks and its protocols can be discovered
# and loaded.
from pyannote.database import get_database
from pyannote.database import get_protocol
from pyannote.database.protocol import CollectionProtocol
from pyannote.database.protocol import Protocol
from pyannote.database.protocol import SpeakerDiarizationProtocol
from pyannote.database.protocol import SpeakerVerificationProtocol

# NOTE(review): get_databases and get_tasks are used below but are not
# imported in the visible part of this file -- presumably imported earlier;
# confirm.

# the custom database must be registered
assert "MyDatabase" in get_databases()

# all four tasks must be listed globally...
tasks = get_tasks()
assert "Collection" in tasks
assert "Protocol" in tasks
assert "SpeakerDiarization" in tasks
assert "SpeakerVerification" in tasks

# ... and by the database itself
database = get_database("MyDatabase")
tasks = database.get_tasks()
assert "Collection" in tasks
assert "Protocol" in tasks
assert "SpeakerDiarization" in tasks
assert "SpeakerVerification" in tasks

# each task must expose its protocol
assert "MyCollection" in database.get_protocols("Collection")
assert "MyProtocol" in database.get_protocols("Protocol")
assert "MySpeakerDiarization" in database.get_protocols("SpeakerDiarization")
assert "MySpeakerVerification" in database.get_protocols("SpeakerVerification")

# protocols are loaded from their fully qualified "Database.Task.Protocol"
# name and must be instances of the class matching their task
collection = get_protocol("MyDatabase.Collection.MyCollection")
assert isinstance(collection, CollectionProtocol)

protocol = get_protocol("MyDatabase.Protocol.MyProtocol")
def _write_precomputed(path, sliding_window, dimension, data=None):
    """Write sliding window and dimension attributes (and `data`, if any) to HDF5 file `path`."""
    # explicit mode 'a' (read/write, create if missing): it used to be the
    # h5py default, but h5py >= 3 opens files read-only when mode is omitted.
    # the context manager closes the file even when a write fails.
    with h5py.File(path, mode='a') as f:
        f.attrs['start'] = sliding_window.start
        f.attrs['duration'] = sliding_window.duration
        f.attrs['step'] = sliding_window.step
        f.attrs['dimension'] = dimension
        if data is not None:
            f.create_dataset('features', data=data)


def extract(database_name, task_name, protocol_name, preprocessors,
            experiment_dir, robust=False):
    """Precompute features for every file of a protocol.

    The feature extractor is instantiated from
    '<experiment_dir>/config.yml' (class `feature_extraction.name` of
    `pyannote.audio.features`, with optional `feature_extraction.params`).
    Features are stored in one HDF5 file per item, at the location given by
    `Precomputed.get_path`. Items whose file already exists are skipped.

    Parameters
    ----------
    database_name, task_name, protocol_name : str
        Identify the protocol. `task_name` must be 'SpeakerDiarization'
        or 'SpeakerRecognition'.
    preprocessors : dict
        Forwarded as-is to `get_database(..., preprocessors=...)`.
    experiment_dir : str
        Directory containing 'config.yml' and receiving the output files.
    robust : bool, optional
        When True, a file whose feature extraction fails (exception, None
        result or NaN values) triggers a warning and is skipped.
        Defaults to False, in which case the error is raised.

    Raises
    ------
    NotImplementedError
        If `task_name` is not supported.
    PyannoteFeatureExtractionError
        If feature extraction fails for a file and `robust` is False.
    """

    database = get_database(database_name, preprocessors=preprocessors)
    protocol = database.get_protocol(task_name, protocol_name, progress=True)

    if task_name == 'SpeakerDiarization':
        items = itertools.chain(protocol.train(),
                                protocol.development(),
                                protocol.test())

    elif task_name == 'SpeakerRecognition':
        items = itertools.chain(protocol.train(yield_name=False),
                                protocol.development_enroll(yield_name=False),
                                protocol.development_test(yield_name=False),
                                protocol.test_enroll(yield_name=False),
                                protocol.test_test(yield_name=False))

    else:
        # fail early with an explicit message (this used to surface later
        # as a NameError on `items`)
        msg = 'Feature extraction is not supported for task "{task}".'
        raise NotImplementedError(msg.format(task=task_name))

    # load configuration file
    # (safe_load: plain `yaml.load(fp)` without a Loader raises TypeError on
    # PyYAML >= 6, and the configuration only needs plain YAML types)
    config_yml = os.path.join(experiment_dir, 'config.yml')
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # instantiate the feature extractor named in the configuration
    feature_extraction_name = config['feature_extraction']['name']
    features_module = __import__('pyannote.audio.features',
                                 fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features_module, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    sliding_window = feature_extraction.sliding_window()
    dimension = feature_extraction.dimension()

    # create metadata file at root that contains
    # sliding window and dimension information
    _write_precomputed(Precomputed.get_config_path(experiment_dir),
                       sliding_window, dimension)

    for item in items:

        uri = get_unique_identifier(item)
        path = Precomputed.get_path(experiment_dir, item)

        # features were already extracted for this item
        if os.path.exists(path):
            continue

        try:
            # NOTE item contains the 'channel' key
            features = feature_extraction(item)
        except PyannoteFeatureExtractionError:
            if not robust:
                raise
            msg = 'Feature extraction failed for file "{uri}".'
            warnings.warn(msg.format(uri=uri))
            continue

        if features is None:
            msg = 'Feature extraction returned None for file "{uri}".'
            msg = msg.format(uri=uri)
            if not robust:
                raise PyannoteFeatureExtractionError(msg)
            warnings.warn(msg)
            continue

        data = features.data

        if np.any(np.isnan(data)):
            msg = 'Feature extraction returned NaNs for file "{uri}".'
            msg = msg.format(uri=uri)
            if not robust:
                raise PyannoteFeatureExtractionError(msg)
            warnings.warn(msg)
            continue

        # create parent directory
        mkdir_p(os.path.dirname(path))

        _write_precomputed(path, sliding_window, dimension, data=data)