def save(self, path):
    """ Save embedding set as pickled file.

    Args:
        path (string_types): output path
    """
    mkdir_p(os.path.dirname(path))
    with open(path, 'wb') as f:
        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
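# Illustrative usage sketch (not part of the class): assuming `embedding_set`
# is an initialized EmbeddingSet, save() creates the parent directory and
# pickles the whole object, so it can be restored later with a plain
# pickle.load. The path below is a hypothetical placeholder.
def _example_save_and_reload(embedding_set, path='/tmp/embeddings/file1.pkl'):
    import pickle
    embedding_set.save(path)
    with open(path, 'rb') as f:
        return pickle.load(f)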
def install_scripts(directory):
    """ Call cmd commands to install extra software/repositories.

    Args:
        directory (str): path
    """
    if KALDI_ROOT_PATH is None or not os.path.isdir(KALDI_ROOT_PATH):
        raise ValueError('Please, set path to correct kaldi installation.')
    nnet_copy_binary = os.path.join(KALDI_ROOT_PATH, 'src', 'nnet3bin', 'nnet3-copy')
    if not os.path.isfile(nnet_copy_binary):
        raise ValueError('nnet3-copy binary not found in `{}`.'.format(
            os.path.dirname(nnet_copy_binary)))
    copy_matrix_binary = os.path.join(KALDI_ROOT_PATH, 'src', 'bin', 'copy-matrix')
    if not os.path.isfile(copy_matrix_binary):
        raise ValueError('copy-matrix binary not found in `{}`.'.format(
            os.path.dirname(copy_matrix_binary)))

    mkdir_p(XVEC_MODELS_DIR)
    with tempfile.NamedTemporaryFile() as f:
        urllib.request.urlretrieve(
            'http://kaldi-asr.org/models/0003_sre16_v2_1a.tar.gz', f.name)
        tar = tarfile.open(f.name, 'r:gz')
        tar.extractall(XVEC_MODELS_DIR)
        tar.close()

    # replace input of the last layer, so we can easily extract xvectors
    nnet_raw_path = os.path.join(XVEC_MODELS_DIR, '0003_sre16_v2_1a', 'exp',
                                 'xvector_nnet_1a', 'final.raw')
    old_line = 'output-node name=output input=output.log-softmax objective=linear'
    new_line = 'output-node name=output input=tdnn6.affine objective=linear'
    check_call([
        'sed', '-i', '-e', 's@{}@{}@g'.format(old_line, new_line), nnet_raw_path
    ])

    # convert LDA matrix to text format
    lda_path = os.path.join(os.path.dirname(nnet_raw_path), '..',
                            'xvectors_sre_combined', 'transform.mat')
    check_call([
        copy_matrix_binary, '--binary=false', lda_path,
        lda_path.replace('.mat', '.txt')
    ])
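# Illustrative usage sketch: install_scripts() assumes the module-level
# KALDI_ROOT_PATH points at a compiled Kaldi tree (with nnet3-copy and
# copy-matrix built) and downloads the SRE16 x-vector model into
# XVEC_MODELS_DIR before patching it. The directory argument below is a
# hypothetical placeholder.
def _example_install():
    if KALDI_ROOT_PATH is None:
        raise RuntimeError('Set KALDI_ROOT_PATH before installing models.')
    install_scripts(directory=XVEC_MODELS_DIR)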
def extract_embeddings(self):
    """ Extract normalization embeddings using averaging.

    Returns:
        np.array: one averaged embedding vector per normalization speaker
    """
    speakers_dict, fns = {}, []
    with open(self.norm_list) as f:
        for line in f:
            if len(line.split()) > 1:  # number of speakers is defined
                line = line.split()[0]
            else:
                line = line.replace(os.linesep, '')
            fns.append(line)

    speakers_dict = process_files(
        fns,
        speakers_dict=speakers_dict,
        features_extractor=self.features_extractor,
        embedding_extractor=self.embedding_extractor,
        audio_dir=self.audio_dir,
        wav_suffix=self.wav_suffix,
        in_rttm_dir=self.in_rttm_dir,
        rttm_suffix=self.rttm_suffix,
        min_length=self.min_length,
        n_jobs=self.n_jobs)
    assert len(speakers_dict) == len(fns)  # all are the same
    merged_speakers_dict = speakers_dict[0]

    if self.out_emb_dir:
        for speaker in merged_speakers_dict:
            out_path = os.path.join(self.out_emb_dir, f'{speaker}.pkl')
            mkdir_p(os.path.dirname(out_path))
            with open(out_path, 'wb') as f:
                pickle.dump(merged_speakers_dict[speaker], f, pickle.HIGHEST_PROTOCOL)

    for speaker in merged_speakers_dict:
        merged_speakers_dict[speaker] = np.mean(merged_speakers_dict[speaker], axis=0)

    return np.array(list(merged_speakers_dict.values()))
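# Illustrative usage sketch: assuming `norm` is the initialized normalization
# object that owns this method (norm_list, extractors and directories already
# set), the call below yields one averaged embedding per normalization
# speaker; a global mean is then just another average over that array.
def _example_norm_embeddings(norm):
    speaker_means = norm.extract_embeddings()   # shape: (n_speakers, emb_dim)
    global_mean = speaker_means.mean(axis=0)    # mean over all speakers
    return speaker_means, global_mean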
def dump_rttm(self, scores, out_dir):
    """ Dump rttm files to output directory. This function requires initialized embeddings.

    Args:
        scores (Dict): dictionary containing scores
        out_dir (string_types): path to output directory
    """
    for embedding_set in self.embeddings:
        if len(embedding_set) > 0:
            name = embedding_set.name
            reg_name = re.sub('/.*', '', embedding_set.name)
            mkdir_p(os.path.join(out_dir, os.path.dirname(name)))
            with open(os.path.join(out_dir, name + '.rttm'), 'w') as f:
                for i, ivec in enumerate(embedding_set.embeddings):
                    start, end = ivec.window_start, ivec.window_end
                    # pick the speaker with the highest score for this window
                    idx = np.argmax(scores[name].T[i])
                    # window times are in ms; RTTM expects seconds
                    f.write('SPEAKER {} 1 {} {} <NA> <NA> {}_spkr_{} <NA>\n'.format(
                        reg_name, float(start / 1000.0),
                        float((end - start) / 1000.0), reg_name, idx))
        else:
            logger.warning('No embedding to dump in {}.'.format(embedding_set.name))
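# Illustrative usage sketch: assuming `diar` is an initialized Diarization
# instance and `scores` is the dictionary returned by its scoring step, each
# embedding window is labelled with the highest-scoring speaker index and
# written as one RTTM SPEAKER line. The output directory below is a
# placeholder.
def _example_dump_rttm(diar, scores):
    diar.dump_rttm(scores, out_dir='/tmp/rttm_out')
    # each written line has the form:
    # SPEAKER <name> 1 <start_s> <duration_s> <NA> <NA> <name>_spkr_<idx> <NA>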
def process_file(wav_dir, vad_dir, out_dir, file_name, features_extractor,
                 embedding_extractor, min_size, max_size, overlap, tolerance,
                 wav_suffix='.wav', vad_suffix='.lab.gz'):
    """ Process single audio file.

    Args:
        wav_dir (str): directory with wav files
        vad_dir (str): directory with vad files
        out_dir (str): output directory
        file_name (str): name of the file
        features_extractor (Any): initialized object for feature extraction
        embedding_extractor (Any): initialized object for embedding extraction
        min_size (int): minimal size of window in ms
        max_size (int): maximal size of window in ms
        overlap (int): size of window overlap in ms
        tolerance (int): accept given number of frames as speech even when it is marked as silence
        wav_suffix (str): suffix of wav files
        vad_suffix (str): suffix of vad files

    Returns:
        EmbeddingSet
    """
    logger.info('Processing file {}.'.format(file_name.split()[0]))
    num_speakers = None
    if len(file_name.split()) > 1:  # number of speakers is defined
        file_name, num_speakers = file_name.split()[0], int(file_name.split()[1])
    wav_dir, vad_dir = os.path.abspath(wav_dir), os.path.abspath(vad_dir)
    if out_dir:
        out_dir = os.path.abspath(out_dir)

    # extract features
    features = features_extractor.audio2features(
        os.path.join(wav_dir, f'{file_name}{wav_suffix}'))

    # load voice activity detection from file
    vad, _, _ = get_vad(f'{os.path.join(vad_dir, file_name)}{vad_suffix}',
                        features.shape[0])

    # parse segments and split features
    features_dict = {}
    for seg in get_segments(vad, max_size, tolerance):
        seg_start, seg_end = seg
        start, end = get_time_from_frames(seg_start), get_time_from_frames(seg_end)
        if start >= overlap:
            seg_start = get_frames_from_time(start - overlap)
        if seg_start > features.shape[0] - 1 or seg_end > features.shape[0] - 1:
            logger.warning(
                f'Frames are not aligned: got {features.shape[0]} frames, '
                f'but segment ends at frame {seg_end}.')
            seg_end = features.shape[0]
        features_dict[(start, end)] = features[seg_start:seg_end]

    # extract embedding for each segment
    embedding_set = extract_embeddings(features_dict, embedding_extractor)
    embedding_set.name = file_name
    embedding_set.num_speakers = num_speakers

    # save embeddings if required
    if out_dir is not None:
        mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        embedding_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))
    return embedding_set
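# Illustrative usage sketch: assuming already initialized feature and
# embedding extractors, the call below segments one recording into
# overlapping windows and returns an EmbeddingSet. All paths, names and
# window sizes are hypothetical placeholders; the trailing `2` in file_name
# is the optional speaker count parsed above.
def _example_process_file(features_extractor, embedding_extractor):
    return process_file(
        wav_dir='data/wav', vad_dir='data/vad', out_dir='exp/embeddings',
        file_name='meeting_01 2',
        features_extractor=features_extractor,
        embedding_extractor=embedding_extractor,
        min_size=1000, max_size=2000,   # window bounds in ms
        overlap=500, tolerance=10)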
else:
    mean = norm.mean

# run diarization
diar = Diarization(args.input_list, embeddings, embeddings_mean=mean, lda=lda,
                   use_l2_norm=use_l2_norm, plda=plda, norm=norm)
result = diar.score_embeddings(args.min_window_size, args.max_num_speakers, args.mode)

if args.mode == 'diarization':
    if args.in_rttm_dir:
        diar.evaluate(scores=result, in_rttm_dir=args.in_rttm_dir,
                      collar_size=0.25, evaluate_overlaps=False)
    if args.out_rttm_dir is not None:
        diar.dump_rttm(result, args.out_rttm_dir)
else:
    if args.out_clusters_dir:
        for name in result:
            mkdir_p(os.path.join(args.out_clusters_dir, os.path.dirname(name)))
            np.save(os.path.join(args.out_clusters_dir, name), result[name])
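# Illustrative sketch: in clustering mode the result dictionary maps each file
# name to the array saved above with np.save, so a stored output can be read
# back as shown below. The directory and file name are placeholders.
def _example_load_clusters(out_clusters_dir='exp/clusters', name='meeting_01'):
    return np.load(os.path.join(out_clusters_dir, '{}.npy'.format(name)))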
def process_file(wav_dir, vad_dir, out_dir, file_name, features_extractor,
                 embedding_extractor, max_size, tolerance,
                 wav_suffix='.wav', vad_suffix='.lab.gz'):
    """ Process single audio file.

    Args:
        wav_dir (str): directory with wav files
        vad_dir (str): directory with vad files
        out_dir (str): output directory
        file_name (str): name of the file
        features_extractor (Any): initialized object for feature extraction
        embedding_extractor (Any): initialized object for embedding extraction
        max_size (int): maximal size of window in ms
        tolerance (int): accept given number of frames as speech even when it is marked as silence
        wav_suffix (str): suffix of wav files
        vad_suffix (str): suffix of vad files

    Returns:
        EmbeddingSet
    """
    logger.info('Processing file {}.'.format(file_name.split()[0]))
    num_speakers = None
    if len(file_name.split()) > 1:  # number of speakers is defined
        file_name, num_speakers = file_name.split()[0], int(file_name.split()[1])
    wav_dir, vad_dir = os.path.abspath(wav_dir), os.path.abspath(vad_dir)
    if out_dir:
        out_dir = os.path.abspath(out_dir)

    # extract features
    _, features = features_extractor.audio2features(
        os.path.join(wav_dir, '{}{}'.format(file_name, wav_suffix)))

    # load voice activity detection from file
    vad, _, _ = get_vad('{}{}'.format(os.path.join(vad_dir, file_name), vad_suffix),
                        features.shape[0])

    # parse segments and split features
    features_dict = {}
    for seg in get_segments(vad, max_size, tolerance):
        start, end = get_num_segments(seg[0]), get_num_segments(seg[1])
        if seg[0] > features.shape[0] - 1 or seg[1] > features.shape[0] - 1:
            raise ValueError('Unexpected features dimensionality - check VAD input or audio.')
        features_dict['{}_{}'.format(start, end)] = features[seg[0]:seg[1]]

    # extract embedding for each segment
    embedding_set = extract_embeddings(features_dict, embedding_extractor)
    embedding_set.name = file_name
    embedding_set.num_speakers = num_speakers

    # save embeddings if required
    if out_dir is not None:
        mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        embedding_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))
    return embedding_set
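# Illustrative sketch: the returned EmbeddingSet exposes the attributes used
# elsewhere in this module (.name, .num_speakers and .embeddings, whose items
# carry window_start/window_end in milliseconds); the helper below only
# prints the segmentation that was embedded.
def _example_print_windows(embedding_set):
    print(embedding_set.name, embedding_set.num_speakers)
    for emb in embedding_set.embeddings:
        print(emb.window_start, emb.window_end)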