def extract_features_for_subset(language, subset, feat_type, output_fn):
    """
    Extract features for the subset in this language.

    The speakers of the subset are read from `../data/<subset>_spk.list`,
    their shorten audio is converted to wav under `wav/<language>/<subset>`,
    raw features are extracted from that directory, normalised per speaker
    and written to `output_fn` as a compressed NumPy archive.

    Parameters
    ----------
    language : str
        Language identifier; passed on to `read_speakers` and
        `shorten_to_wav` and used as a directory name under `wav`.
    subset : str
        Name of the subset; selects the speaker list and the wav directory.
    feat_type : str
        Either "mfcc" or "fbank".
    output_fn : str
        Destination handed to `np.savez_compressed`.

    Raises
    ------
    ValueError
        If `feat_type` is neither "mfcc" nor "fbank".
    """
    # Validate before doing any work: the audio conversion below is costly,
    # and an `assert` would be stripped when Python runs with -O.
    if feat_type not in ("mfcc", "fbank"):
        raise ValueError("invalid feature type: {!r}".format(feat_type))

    # Get speakers for subset
    speakers_fn = path.join("..", "data", subset + "_spk.list")
    print("Reading:", speakers_fn)
    speakers = read_speakers(speakers_fn, language)

    # Convert shorten audio to wav
    wav_dir = path.join("wav", language, subset)
    # exist_ok avoids the check-then-create race of a separate isdir() test
    os.makedirs(wav_dir, exist_ok=True)
    print("Converting shn audio to wav:")
    shorten_to_wav(language, speakers, wav_dir)

    # Extract raw features
    print("Extracting features:")
    if feat_type == "mfcc":
        feat_dict = features.extract_mfcc_dir(wav_dir)
    else:
        feat_dict = features.extract_fbank_dir(wav_dir)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
def extract_features(feat_type, output_fn):
    """
    Extract specified features.

    Raw features are extracted from `xitsonga_datadir`, re-keyed with
    `utils.uttlabel_to_uttkey`, restricted to the voice active regions read
    from `../data/xitsonga.wrd`, normalised per speaker and written to
    `output_fn` as a compressed NumPy archive.

    Parameters
    ----------
    feat_type : str
        Either "mfcc" or "fbank".
    output_fn : str
        Destination handed to `np.savez_compressed`.

    Raises
    ------
    ValueError
        If `feat_type` is neither "mfcc" nor "fbank".
    """
    # Explicit check instead of `assert`, which is stripped under -O and
    # would leave the feature dictionary undefined further down.
    if feat_type not in ("mfcc", "fbank"):
        raise ValueError("invalid feature type: {!r}".format(feat_type))

    # Raw features
    if feat_type == "mfcc":
        feat_dict_wavkey = features.extract_mfcc_dir(xitsonga_datadir)
    else:
        feat_dict_wavkey = features.extract_fbank_dir(xitsonga_datadir)

    # Map the wav-derived keys to utterance keys
    feat_dict = {
        utils.uttlabel_to_uttkey(wav_key): feats
        for wav_key, feats in feat_dict_wavkey.items()
    }

    # Read voice activity regions
    fa_fn = path.join("..", "data", "xitsonga.wrd")
    print("Reading:", fa_fn)
    vad_dict = utils.read_vad_from_fa(fa_fn)

    # Only keep voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
def extract_features_for_subset(subset, feat_type, output_fn):
    """
    Extract specified features for a subset.

    The speakers of the subset are read from
    `../data/buckeye_<subset>_speakers.list` (one per line). Features are
    extracted per speaker from `<buckeye_datadir>/<speaker>`, restricted to
    the voice active regions read from `../data/buckeye_english.wrd`,
    normalised per speaker and written to `output_fn` as a compressed NumPy
    archive.

    Parameters
    ----------
    subset : str
        Name of the subset; selects the speaker list.
    feat_type : str
        Either "mfcc" or "fbank".
    output_fn : str
        Destination handed to `np.savez_compressed`.

    Raises
    ------
    ValueError
        If `feat_type` is neither "mfcc" nor "fbank".
    """
    # Validate once, up front. An `assert` inside the speaker loop is
    # stripped under -O and is never reached when the speaker list is empty.
    if feat_type not in ("mfcc", "fbank"):
        raise ValueError("invalid feature type: {!r}".format(feat_type))

    # Speakers for subset
    speaker_fn = path.join(
        "..", "data", "buckeye_" + subset + "_speakers.list"
    )
    print("Reading:", speaker_fn)
    speakers = set()
    with open(speaker_fn) as f:
        for line in f:
            speaker = line.strip()
            # Skip blank lines: an empty speaker would point the extractor
            # at the data directory itself.
            if speaker:
                speakers.add(speaker)
    sorted_speakers = sorted(speakers)
    print("Speakers:", ", ".join(sorted_speakers))

    # Pick the extractor once rather than re-testing it for every speaker
    if feat_type == "mfcc":
        extract_dir = features.extract_mfcc_dir
    else:
        extract_dir = features.extract_fbank_dir

    # Raw features
    feat_dict = {}
    print("Extracting features per speaker:")
    for speaker in sorted_speakers:
        speaker_feat_dict = extract_dir(path.join(buckeye_datadir, speaker))
        for wav_key, feats in speaker_feat_dict.items():
            # The first three characters of the wav key are replaced by
            # "<speaker>_" (presumably they are the speaker id -- confirm).
            feat_dict[speaker + "_" + wav_key[3:]] = feats

    # Read voice activity regions
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    print("Reading:", fa_fn)
    vad_dict = utils.read_vad_from_fa(fa_fn)

    # Only keep voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
def main():
    """
    Extract MFCCs for a Buckeye subset and write them to disk.

    The subset is taken from the parsed command-line arguments
    (`args.subset`). Speakers are read from
    `../data/buckeye_<subset>_speakers.list`, MFCCs are extracted per speaker
    from `<buckeye_datadir>/<speaker>`, restricted to the voice active
    regions read from `../data/buckeye_english.wrd`, normalised per speaker
    and written to `mfcc/buckeye/<subset>.dd.npz`.
    """
    args = check_argv()

    print(datetime.now())

    # Speaker set for the indicated subset
    speaker_fn = path.join(
        "..", "data", "buckeye_" + args.subset + "_speakers.list"
    )
    print("Reading:", speaker_fn)
    speakers = set()
    with open(speaker_fn) as f:
        for line in f:
            speaker = line.strip()
            # Skip blank lines: an empty speaker would point the extractor
            # at the data directory itself.
            if speaker:
                speakers.add(speaker)
    sorted_speakers = sorted(speakers)
    print("Speakers:", sorted_speakers)

    # Raw MFCCs
    feat_dict = {}
    print("Extracting features per speaker:")
    for speaker in sorted_speakers:
        speaker_feat_dict = features.extract_mfcc_dir(
            path.join(buckeye_datadir, speaker)
        )
        for wav_key, feats in speaker_feat_dict.items():
            # The first three characters of the wav key are replaced by
            # "<speaker>_" (presumably they are the speaker id -- confirm).
            feat_dict[speaker + "_" + wav_key[3:]] = feats

    # Read voice activity regions
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    print("Reading:", fa_fn)
    vad_dict = read_vad_from_fa(fa_fn)

    # Only keep voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    output_dir = path.join("mfcc", "buckeye")
    # exist_ok avoids the check-then-create race of a separate isdir() test
    os.makedirs(output_dir, exist_ok=True)
    output_fn = path.join(output_dir, args.subset + ".dd.npz")
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
def main():
    """
    Extract MFCCs for a ZeroSpeech 2019 dataset subset and write them to disk.

    The dataset and subset come from the parsed command-line arguments. For
    the "train" subset the `unit` and `voice` sub-directories are both
    processed and merged into one dictionary; for any other subset the subset
    directory itself is processed. The features are then restricted to the
    voice active regions listed in the dataset's `vads.txt`, normalised per
    speaker and saved to `mfcc/<dataset>/<subset>.dd.npz`. The current time
    is printed at the start and at the end.
    """
    args = check_argv()

    print(datetime.now())

    # Raw features
    subset_dir = path.join(zerospeech2019_datadir, args.dataset, args.subset)
    if args.subset != "train":
        print("Extracting test features:")
        feat_dict = features.extract_mfcc_dir(subset_dir)
    else:
        print("Extracting unit discovery features:")
        feat_dict = features.extract_mfcc_dir(path.join(subset_dir, "unit"))
        print("Extracting target voice features:")
        voice_feat_dict = features.extract_mfcc_dir(
            path.join(subset_dir, "voice")
        )
        feat_dict.update(voice_feat_dict)

    # Voice activity regions for the whole dataset
    vad_dict = read_vad(
        path.join(zerospeech2019_datadir, args.dataset, "vads.txt")
    )

    # Drop everything outside the voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write the archive, creating the output directory when it is missing
    target_dir = path.join("mfcc", args.dataset)
    if not path.isdir(target_dir):
        os.makedirs(target_dir)
    output_fn = path.join(target_dir, args.subset + ".dd.npz")
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)

    print(datetime.now())