예제 #1
0
def extract_features_for_subset(language, subset, feat_type, output_fn):
    """
    Extract features for the subset in this language.

    The speaker list for `subset` is read, the shorten audio of those
    speakers is converted to wav under ``wav/<language>/<subset>``, features
    are extracted from that directory, per-speaker mean and variance
    normalisation is applied, and the result is written to `output_fn` as a
    compressed NumPy archive.

    The `feat_type` parameter can be "mfcc" or "fbank". Any other value
    raises `ValueError` before any file is read or audio is converted.
    """

    # Validate first: a plain `assert` is removed under `python -O`, and
    # failing only after the audio conversion wastes the whole conversion.
    if feat_type not in ("mfcc", "fbank"):
        raise ValueError("invalid feature type: " + repr(feat_type))

    # Get speakers for subset
    speakers_fn = path.join("..", "data", subset + "_spk.list")
    print("Reading:", speakers_fn)
    speakers = read_speakers(speakers_fn, language)

    # Convert shorten audio to wav
    wav_dir = path.join("wav", language, subset)
    # `exist_ok` avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(wav_dir, exist_ok=True)
    print("Converting shn audio to wav:")
    shorten_to_wav(language, speakers, wav_dir)

    # Extract raw features
    print("Extracting features:")
    if feat_type == "mfcc":
        feat_dict = features.extract_mfcc_dir(wav_dir)
    else:  # "fbank", guaranteed by the validation above
        feat_dict = features.extract_fbank_dir(wav_dir)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
예제 #2
0
def extract_features(feat_type, output_fn):
    """
    Extract specified features.

    Features are extracted from `xitsonga_datadir`, re-keyed with
    `utils.uttlabel_to_uttkey`, restricted to the voice-active regions read
    from ``../data/xitsonga.wrd``, normalised per speaker (mean and
    variance) and written to `output_fn` as a compressed NumPy archive.

    The `feat_type` parameter can be "mfcc" or "fbank". Any other value
    raises `ValueError`.
    """

    # Explicit validation: an `assert` would be stripped under `python -O`
    # and the function would then fail later with an unrelated NameError.
    if feat_type not in ("mfcc", "fbank"):
        raise ValueError("invalid feature type: " + repr(feat_type))

    # Raw features
    if feat_type == "mfcc":
        feat_dict_wavkey = features.extract_mfcc_dir(xitsonga_datadir)
    else:  # "fbank", guaranteed by the validation above
        feat_dict_wavkey = features.extract_fbank_dir(xitsonga_datadir)

    # Re-key the features from wav labels to utterance keys
    feat_dict = {
        utils.uttlabel_to_uttkey(wav_key): feat_dict_wavkey[wav_key]
        for wav_key in feat_dict_wavkey
    }

    # Read voice activity regions
    fa_fn = path.join("..", "data", "xitsonga.wrd")
    print("Reading:", fa_fn)
    vad_dict = utils.read_vad_from_fa(fa_fn)

    # Only keep voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
예제 #3
0
def extract_features_for_subset(subset, feat_type, output_fn):
    """
    Extract specified features for a subset.

    The speakers of `subset` are read from
    ``../data/buckeye_<subset>_speakers.list`` (one speaker per line; blank
    lines are ignored). Features are extracted per speaker from
    ``<buckeye_datadir>/<speaker>``, restricted to the voice-active regions
    read from ``../data/buckeye_english.wrd``, normalised per speaker (mean
    and variance) and written to `output_fn` as a compressed NumPy archive.

    The `feat_type` parameter can be "mfcc" or "fbank". Any other value
    raises `ValueError` before any file is read.
    """

    # Validate once, up front. The original check sat inside the speaker
    # loop as an `assert`, so it vanished under `python -O` and never fired
    # at all for an empty speaker list.
    if feat_type not in ("mfcc", "fbank"):
        raise ValueError("invalid feature type: " + repr(feat_type))

    # Speakers for subset
    speaker_fn = path.join("..", "data",
                           "buckeye_" + subset + "_speakers.list")
    print("Reading:", speaker_fn)
    with open(speaker_fn) as f:
        # Skip blank lines: an empty speaker name would make the loop below
        # extract features from the dataset root directory itself.
        speakers = {name for name in map(str.strip, f) if name}
    print("Speakers:", ", ".join(sorted(speakers)))

    # The extractor does not depend on the speaker, so pick it once
    if feat_type == "mfcc":
        extract_dir = features.extract_mfcc_dir
    else:  # "fbank", guaranteed by the validation above
        extract_dir = features.extract_fbank_dir

    # Raw features
    feat_dict = {}
    print("Extracting features per speaker:")
    for speaker in sorted(speakers):
        speaker_feat_dict = extract_dir(path.join(buckeye_datadir, speaker))
        for wav_key in speaker_feat_dict:
            # Replace the first three characters of the wav key with the
            # speaker identifier
            feat_dict[speaker + "_" + wav_key[3:]] = speaker_feat_dict[wav_key]

    # Read voice activity regions
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    print("Reading:", fa_fn)
    vad_dict = utils.read_vad_from_fa(fa_fn)

    # Only keep voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
예제 #4
0
def main():
    """
    Extract MFCCs for the Buckeye subset given on the command line.

    The speakers of ``args.subset`` are read from
    ``../data/buckeye_<subset>_speakers.list`` (blank lines are ignored).
    MFCCs are extracted per speaker, restricted to the voice-active regions
    read from ``../data/buckeye_english.wrd``, normalised per speaker (mean
    and variance) and written to ``mfcc/buckeye/<subset>.dd.npz``.
    """
    args = check_argv()

    print(datetime.now())

    # Speaker set for the indicated subset
    speaker_fn = path.join("..", "data",
                           "buckeye_" + args.subset + "_speakers.list")
    print("Reading:", speaker_fn)
    with open(speaker_fn) as f:
        # Skip blank lines: an empty speaker name would make the loop below
        # extract features from the dataset root directory itself.
        speakers = {name for name in map(str.strip, f) if name}
    print("Speakers:", sorted(speakers))

    # Raw MFCCs
    feat_dict = {}
    print("Extracting features per speaker:")
    for speaker in sorted(speakers):
        speaker_feat_dict = features.extract_mfcc_dir(
            path.join(buckeye_datadir, speaker))
        for wav_key in speaker_feat_dict:
            # Replace the first three characters of the wav key with the
            # speaker identifier
            feat_dict[speaker + "_" + wav_key[3:]] = speaker_feat_dict[wav_key]

    # Read voice activity regions
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    print("Reading:", fa_fn)
    vad_dict = read_vad_from_fa(fa_fn)

    # Only keep voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    output_dir = path.join("mfcc", "buckeye")
    # `exist_ok` avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    output_fn = path.join(output_dir, args.subset + ".dd.npz")
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)
예제 #5
0
def main():
    """
    Extract MFCCs for a ZeroSpeech 2019 dataset subset.

    For the "train" subset, features are extracted from both the ``unit``
    and ``voice`` sub-directories and merged into one dictionary; for any
    other subset they are extracted from the subset directory directly. The
    features are restricted to the voice-active regions read from the
    dataset's ``vads.txt``, normalised per speaker (mean and variance) and
    written to ``mfcc/<dataset>/<subset>.dd.npz``. A timestamp is printed
    before and after processing.
    """
    args = check_argv()

    print(datetime.now())

    # Raw MFCCs
    data_dir = path.join(zerospeech2019_datadir, args.dataset, args.subset)
    if args.subset == "train":
        print("Extracting unit discovery features:")
        feat_dict = features.extract_mfcc_dir(path.join(data_dir, "unit"))
        print("Extracting target voice features:")
        feat_dict.update(
            features.extract_mfcc_dir(path.join(data_dir, "voice"))
            )
    else:
        print("Extracting test features:")
        feat_dict = features.extract_mfcc_dir(data_dir)

    # Read voice activity regions
    vad_fn = path.join(zerospeech2019_datadir, args.dataset, "vads.txt")
    vad_dict = read_vad(vad_fn)

    # Only keep voice active regions
    print("Extracting VAD regions:")
    feat_dict = features.extract_vad(feat_dict, vad_dict)

    # Perform per speaker mean and variance normalisation
    print("Per speaker mean and variance normalisation:")
    feat_dict = features.speaker_mvn(feat_dict)

    # Write output
    output_dir = path.join("mfcc", args.dataset)
    # `exist_ok` avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    output_fn = path.join(output_dir, args.subset + ".dd.npz")
    print("Writing:", output_fn)
    np.savez_compressed(output_fn, **feat_dict)

    print(datetime.now())