예제 #1
0
def make_sre08_data(data_root, data_train_loc, data_test_loc):
    """Assemble the SRE 2008 utterance listing from its model and trial keys.

    Every model-key row whose file is found on disk is listed. After that,
    every ``target`` trial whose file is found and whose model appeared in the
    model key is listed too; such a file is removed from the lookup table once
    used, so it is listed by at most one trial.

    Args:
        data_root: Base directory both locations are joined onto.
        data_train_loc: First location scanned for audio files.
        data_test_loc: Second location scanned for audio files; the key files
            are also read from below this location.

    Returns:
        ``np.vstack`` of five parallel rows: utterance index, file location,
        channel, speaker id and ``sph2pipe`` read command.
    """
    print('Making sre2008 lists...')
    train_dir = join_path(data_root, data_train_loc)
    test_dir = join_path(data_root, data_test_loc)

    model_key = join_path(test_dir, 'data/keys/NIST_SRE08_KEYS.v0.1/model-keys/NIST_SRE08_short2.model.key')
    trials_key = join_path(test_dir, 'data/keys/NIST_SRE08_KEYS.v0.1/trial-keys/NIST_SRE08_short2-short3.trial.key')

    train_files = get_file_list_as_dict(train_dir)
    test_files = get_file_list_as_dict(test_dir)
    # On a name clash the entry from the test location wins.
    available = {**train_files, **test_files}

    rows = []  # one (index, location, channel, speaker, read command) per utterance
    model_to_speaker = dict()

    with open(model_key, 'r') as key_file:
        next(key_file, None)  # the first line of the key is not parsed
        for raw in key_file:
            fields = re.split('[,:]+', raw.strip())
            model = fields[0]
            utt = fields[2]
            side = 1 if fields[3] == 'a' else 2
            # The mapping is recorded even when the audio file is missing.
            model_to_speaker[model] = fields[4]
            try:
                wav_path = available[utt]
            except KeyError:
                continue
            speaker = 'sre2008_' + fields[4]
            rows.append((
                '{}-sre2008_{}_ch{}'.format(speaker, utt, side),
                wav_path,
                side,
                speaker,
                'sph2pipe -f wav -p -c {} {}'.format(side, wav_path),
            ))

    with open(trials_key, 'r') as key_file:
        next(key_file, None)  # the first line of the key is not parsed
        for raw in key_file:
            fields = re.split('[,]+', raw.strip())
            model = fields[0]
            utt = fields[1]
            side = 1 if fields[2] == 'a' else 2
            trial_type = fields[3]
            try:
                wav_path = available[utt]
                speaker = 'sre2008_' + model_to_speaker[model]
            except KeyError:
                continue
            if trial_type != 'target':
                continue
            rows.append((
                '{}-sre2008_{}_ch{}'.format(speaker, utt, side),
                wav_path,
                side,
                speaker,
                'sph2pipe -f wav -p -c {} {}'.format(side, wav_path),
            ))
            # A file consumed by a target trial is not listed again.
            del available[utt]

    print('Made {:d} files from sre2008.'.format(len(rows)))
    return np.vstack([[row[col] for row in rows] for col in range(5)])
예제 #2
0
def make_swbd_cellular(data_root, data_loc, cellular=1):
    """Assemble the utterance listing for a Switchboard Cellular release.

    Each line of the call-statistics table names a file and two speakers. When
    the file is found on disk, two entries are produced: channel 1 for the
    first speaker and channel 2 for the second. A file is used at most once.

    Args:
        data_root: Base directory the location is joined onto.
        data_loc: Location of the corpus relative to ``data_root``.
        cellular: Release number; ``1`` reads the table from ``doc/``, any
            other value from ``docs/``.

    Returns:
        ``np.vstack`` of five parallel rows: utterance index, file location,
        channel, speaker id and ``sph2pipe`` read command.
    """
    print('Making swbd cellular {} lists...'.format(cellular))
    corpus_dir = join_path(data_root, data_loc)

    excluded = [40019, 45024, 40022]
    doc_suffix = '' if cellular == 1 else 's'
    stats_key = join_path(corpus_dir, 'doc{}/swb_callstats.tbl'.format(doc_suffix))
    swbd_type = 'swbd_c{:d}_'.format(cellular)

    available = get_file_list_as_dict(corpus_dir)

    # Drop the excluded recordings; ones that are absent are simply ignored.
    for number in excluded:
        available.pop('sw_' + str(number), None)

    rows = []  # one (index, location, channel, speaker, read command) per utterance
    with open(stats_key, 'r') as table:
        for raw in table:
            fields = re.split('[,]+', raw.strip())
            call = fields[0]
            speakers = ('sw_' + fields[1], 'sw_' + fields[2])
            lookup = 'sw_' + str(call)
            try:
                wav_path = available[lookup]
            except KeyError:
                continue
            for side, speaker in enumerate(speakers, start=1):
                rows.append((
                    '{}-{}{}_ch{}'.format(speaker, swbd_type, call, side),
                    wav_path,
                    side,
                    speaker,
                    'sph2pipe -f wav -p -c {} {}'.format(side, wav_path),
                ))
            # Each recording contributes its two sides only once.
            del available[lookup]

    print('Made {:d} files swbd cellular {}.'.format(len(rows), cellular))
    return np.vstack([[row[col] for row in rows] for col in range(5)])
예제 #3
0
def make_mixer6_mic(data_root, data_loc):
    """Assemble the utterance listing for the Mixer 6 microphone recordings.

    For each session in ``docs/mx6_ivcomponents.csv`` (except the excluded
    one) and each selected microphone channel, an entry is produced when the
    matching ``.flac`` file is found. The read command trims the audio to the
    interval given by columns 7 and 8 of the table.

    Args:
        data_root: Base directory the location is joined onto.
        data_loc: Location of the corpus relative to ``data_root``.

    Returns:
        ``np.vstack`` of five parallel rows: utterance index, file location,
        channel (always 1), speaker id and ``sox`` read command.
    """
    print('Making mixer6 mic lists...')
    corpus_dir = join_path(data_root, data_loc)
    flac_dir = join_path(corpus_dir, 'data/pcm_flac')

    excluded_sessions = ['20091208_091618_HRM_120831']

    stats_key = join_path(corpus_dir, 'docs/mx6_ivcomponents.csv')
    channels = ['02', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13']  # Omitting 01, 03 and 14
    flac_files = dict()
    for ch in channels:
        per_channel = get_file_list_as_dict(join_path(flac_dir, 'CH' + ch), pattern='*.flac')
        # Entries collected from earlier channels take precedence on a clash.
        flac_files = {**per_channel, **flac_files}

    indices, locations, sides, speakers, commands = [], [], [], [], []
    with open(stats_key, 'r') as table:
        next(table, None)  # the first line of the table is not parsed
        for raw in table:
            fields = re.split('[,]+', raw.strip())
            session = fields[0]
            speaker = 'MX6_' + re.split('[_]+', session)[3]
            begin, end = fields[7], fields[8]
            if session in excluded_sessions:
                continue
            for ch in channels:
                name = '{}_CH{}'.format(session, ch)
                try:
                    path = flac_files[name]
                except KeyError:
                    continue
                indices.append('{}-MX6_MIC_{}'.format(speaker, name))
                locations.append(path)
                sides.append(1)
                speakers.append(speaker)
                # sox trim takes a start offset followed by a duration.
                duration = float(end) - float(begin)
                commands.append('sox -t flac {} -r 8k -t wav -V0 - trim {} {}'.format(path, begin, duration))

    print('Made {:d} files from mixer6 mic.'.format(len(indices)))
    return np.vstack([indices, locations, sides, speakers, commands])
예제 #4
0
def make_mixer6_calls(data_root, data_loc):
    """Assemble the utterance listing for the Mixer 6 telephone calls.

    Each row of ``docs/mx6_calls.csv`` names a call and two speakers (columns
    4 and 12). When a file for the call is found, two entries are produced:
    channel 1 for the first speaker and channel 2 for the second.

    Args:
        data_root: Base directory the location is joined onto.
        data_loc: Location of the corpus relative to ``data_root``.

    Returns:
        ``np.vstack`` of five parallel rows: utterance index, file location,
        channel, speaker id and ``sph2pipe`` read command.
    """
    print('Making mixer6 calls lists...')
    corpus_dir = join_path(data_root, data_loc)
    sphere_dir = join_path(corpus_dir, 'data/ulaw_sphere')

    stats_key = join_path(corpus_dir, 'docs/mx6_calls.csv')
    available = get_file_list_as_dict(sphere_dir)

    # The call id is the third underscore-separated piece of the file name.
    call_to_file = {re.split('[_]+', name)[2]: name for name in available.keys()}

    rows = []  # one (index, location, channel, speaker, read command) per utterance
    with open(stats_key, 'r') as table:
        next(table, None)  # the first line of the table is not parsed
        for raw in table:
            fields = re.split('[,]+', raw.strip())
            call = fields[0]
            speakers = ('MX6_' + fields[4], 'MX6_' + fields[12])
            try:
                name = call_to_file[call]
                wav_path = available[name]
            except KeyError:
                continue
            for side, speaker in enumerate(speakers, start=1):
                rows.append((
                    '{}-MX6_CALLS_{}_ch{}'.format(speaker, name, side),
                    wav_path,
                    side,
                    speaker,
                    'sph2pipe -f wav -p -c {} {}'.format(side, wav_path),
                ))
    print('Made {:d} files from mixer6 calls.'.format(len(rows)))
    return np.vstack([[row[col] for row in rows] for col in range(5)])
예제 #5
0
def make_swbd_phase(data_root, data_loc, phase=1):
    """Assemble the utterance listing for a Switchboard phase release.

    Each line of ``docs/callinfo.tbl`` names a file, a speaker and a channel
    side. An entry is produced for every line whose file is found on disk.

    Args:
        data_root: Base directory the location is joined onto.
        data_loc: Location of the corpus relative to ``data_root``.
        phase: Release number. For ``3`` the file key is ``'sw_'`` plus the
            first column; otherwise it is the first column up to its first dot.

    Returns:
        ``np.vstack`` of five parallel rows: utterance index, file location,
        channel, speaker id and ``sph2pipe`` read command.
    """
    print('Making swbd phase {} lists...'.format(phase))
    corpus_dir = join_path(data_root, data_loc)

    excluded = ['sw_22602']

    stats_key = join_path(corpus_dir, 'docs/callinfo.tbl')
    swbd_type = 'swbd_p{:d}_'.format(phase)

    available = get_file_list_as_dict(corpus_dir)

    # Drop the excluded recordings; ones that are absent are simply ignored.
    for name in excluded:
        available.pop(name, None)

    indices, locations, sides, speakers, commands = [], [], [], [], []
    with open(stats_key, 'r') as table:
        for raw in table:
            fields = re.split('[,]+', raw.strip())
            if phase == 3:
                name = 'sw_' + fields[0]
            else:
                name = fields[0].split('.')[0]
            speaker = 'sw_' + str(fields[2])
            side = 1 if fields[3] == 'A' else 2
            try:
                wav_path = available[name]
            except KeyError:
                continue
            indices.append('{}-{}{}_ch{:d}'.format(speaker, swbd_type, name, side))
            locations.append(wav_path)
            sides.append(side)
            speakers.append(speaker)
            commands.append('sph2pipe -f wav -p -c {} {}'.format(side, wav_path))

    print('Made {:d} files swbd phase {}.'.format(len(indices), phase))
    return np.vstack([indices, locations, sides, speakers, commands])
예제 #6
0
def make_old_sre_data(data_root, data_loc, sre_year, speaker_key):
    """Assemble the utterance listing for one older SRE release.

    The speaker key is a whitespace-separated table. Column 0 is the speaker
    id, column 2 the release tag (compared against ``'sre' + str(sre_year)``),
    column 3 the file name and column 4 the channel side (``'A'`` is channel
    1, anything else channel 2). An entry is produced for every row of the
    requested release whose file is found on disk.

    Args:
        data_root: Base directory the location is joined onto.
        data_loc: Location of the corpus relative to ``data_root``.
        sre_year: Release identifier; ``'sre'`` is prepended to its string form.
        speaker_key: Path of the speaker key file, opened as given.

    Returns:
        ``np.vstack`` of five parallel rows: utterance index, file location,
        channel, speaker id and ``sph2pipe`` read command.
    """
    print('Making sre{} lists...'.format(sre_year))
    sre_loc = join_path(data_root, data_loc)
    sre_year = 'sre' + str(sre_year)
    bad_audio = ['jagi', 'jaly', 'jbrg', 'jcli', 'jfmx']
    file_list = get_file_list_as_dict(sre_loc)

    # Drop the excluded recordings; ones that are absent are simply ignored.
    for ba in bad_audio:
        file_list.pop(ba, None)

    index_list = []
    location_list = []
    speaker_list = []
    channel_list = []
    read_list = []
    with open(speaker_key, 'r') as f:
        for line in f:
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            tokens = re.split(r'\s+', line.strip())
            speaker_id = tokens[0]
            file_name = tokens[3]
            channel = 1 if tokens[4] == 'A' else 2
            if sre_year != tokens[2]:
                continue
            try:
                file_loc = file_list[file_name]
            except KeyError:
                # The key references a file that is not on disk; skip it.
                continue
            speaker_id = sre_year + '_' + speaker_id
            index_list.append('{}-{}_{}_ch{}'.format(speaker_id, sre_year, file_name, channel))
            location_list.append(file_loc)
            speaker_list.append(speaker_id)
            channel_list.append(channel)
            read_list.append('sph2pipe -f wav -p -c {} {}'.format(channel, file_loc))

    print('Made {:d} files from {}.'.format(len(index_list), sre_year))
    return np.vstack([index_list, location_list, channel_list, speaker_list, read_list])
예제 #7
0
def make_sre18_eval_data(sre_config):
    """Assemble the SRE 2018 evaluation enrollment and test listings.

    The corpus location is taken from the JSON config (``ROOT`` joined with
    ``LOCATION.SRE18_EVAL``). Enrollment entries are built from every ``.sph``
    and ``.flac`` file under ``data/enrollment``; ``.flac`` files that have a
    diarization record are trimmed to that interval. Test entries are built
    from the ``target`` rows of the trial key whose file is found under
    ``data/test``; each test file is listed at most once.

    Args:
        sre_config: Path of a JSON file providing ``ROOT`` and
            ``LOCATION['SRE18_EVAL']``.

    Returns:
        A pair ``(sre_eval_enroll, sre_eval_test)``. Each is the transpose of
        ``np.vstack`` over five parallel rows (index, file location, channel,
        speaker, read command), i.e. one line per utterance.

    Raises:
        KeyError: If an enrollment file has no entry in the enrollment key.
    """
    print('Making sre2018 eval lists...')
    with open(sre_config, 'r') as f:
        sre_data = load_json(f.read())
    data_root = sre_data['ROOT']
    data_loc = sre_data['LOCATION']['SRE18_EVAL']
    sre_loc = join_path(data_root, data_loc)

    enroll_loc = join_path(sre_loc, 'data/enrollment')
    sph_file_list = get_file_list_as_dict(enroll_loc, pattern='*.sph', ext=True)
    flac_file_list = get_file_list_as_dict(enroll_loc, pattern='*.flac', ext=True)
    diarization_file = join_path(sre_loc, 'docs/sre18_eval_enrollment_diarization.tsv')
    key_file = join_path(sre_loc, 'docs/sre18_eval_enrollment.tsv')

    # Raw strings below: '\s' in a plain literal is an invalid escape sequence.
    diarization_dict = dict()
    with open(diarization_file) as f:
        for line in f.readlines()[1:]:
            tokens = re.split(r'\s+', line.strip())
            # Columns 2 and 3 are used as the start and end of the interval.
            diarization_dict[tokens[0]] = (float(tokens[2]), float(tokens[3]))

    utt_to_spk = dict()
    with open(key_file) as f:
        for line in f.readlines()[1:]:
            tokens = re.split(r'\s+', line.strip())
            utt_to_spk[tokens[1]] = tokens[0]

    index_list = []
    location_list = []
    speaker_list = []
    channel_list = []
    read_list = []
    for key, file_loc in sph_file_list.items():
        index_list.append('sre18_eval_enroll_{}'.format(key))
        location_list.append(file_loc)
        speaker_list.append(utt_to_spk[key])
        channel_list.append(1)
        read_list.append('sph2pipe -f wav -p -c 1 {}'.format(file_loc))

    for key, file_loc in flac_file_list.items():
        index_list.append('sre18_eval_enroll_{}'.format(key))
        location_list.append(file_loc)
        speaker_list.append(utt_to_spk[key])
        channel_list.append(1)
        try:
            start_time, end_time = diarization_dict[key]
        except KeyError:
            # No diarization record: read the whole file.
            read_list.append('sox -t flac {} -r 8k -t wav -V0 -'.format(file_loc))
        else:
            # sox trim takes a start offset followed by a duration.
            read_list.append('sox -t flac {} -r 8k -t wav -V0 - trim {} {}'
                             .format(file_loc, start_time, end_time - start_time))

    sre_eval_enroll = np.vstack([index_list, location_list, channel_list, speaker_list, read_list]).T

    test_loc = join_path(sre_loc, 'data/test')
    sph_file_list = get_file_list_as_dict(test_loc, pattern='*.sph', ext=True)
    flac_file_list = get_file_list_as_dict(test_loc, pattern='*.flac', ext=True)
    file_list = {**sph_file_list, **flac_file_list}
    trials_key = join_path(sre_loc, 'docs/sre18_eval_trial_key.tsv')

    index_list = []
    location_list = []
    speaker_list = []
    channel_list = []
    read_list = []
    with open(trials_key) as f:
        for line in f.readlines()[1:]:
            tokens = re.split(r'\s+', line.strip())
            segment = tokens[1]
            try:
                file_loc = file_list[segment]
            except KeyError:
                # Unknown file, or one already listed by an earlier target trial.
                continue
            if tokens[3] != 'target':
                continue
            index_list.append(segment)
            location_list.append(file_loc)
            # NOTE(review): the speaker field is the segment name itself, not the
            # model id in column 0 - kept as in the original; confirm it is intended.
            speaker_list.append(segment)
            channel_list.append(1)
            if segment[-3:] == 'sph':
                read_list.append('sph2pipe -f wav -p -c 1 {}'.format(file_loc))
            else:
                read_list.append('sox -t flac {} -r 8k -t wav -V0 -'.format(file_loc))
            del file_list[segment]

    sre_eval_test = np.vstack([index_list, location_list, channel_list, speaker_list, read_list]).T
    return sre_eval_enroll, sre_eval_test
예제 #8
0
def make_sre16_data(data_root, data_loc):
    """Assemble the SRE 2016 evaluation listing (enrollment followed by test).

    Enrollment entries come from ``docs/sre16_eval_enrollment.tsv``: one entry
    per row whose file is found under ``data/enrollment``. Test entries come
    from ``docs/sre16_eval_trial_key.tsv``: one entry per file found under
    ``data/test``, taken from the first trial row that references it and for
    which language metadata can be resolved.

    Args:
        data_root: Base directory the location is joined onto.
        data_loc: Location of the corpus relative to ``data_root``.

    Returns:
        ``np.hstack`` of the enrollment and test arrays, each an ``np.vstack``
        of five parallel rows: utterance index, file location, channel
        (always 1), speaker id and ``sph2pipe`` read command.

    Raises:
        KeyError: If a trial row references a file that is absent from
            ``docs/sre16_eval_segment_key.tsv``.
    """
    print('Making sre2016 lists...')
    sre_loc = join_path(data_root, data_loc)
    file_list = get_file_list_as_dict(join_path(sre_loc, 'data/enrollment'))
    meta_key = join_path(sre_loc, 'docs/sre16_eval_enrollment.tsv')

    # Raw strings below: '\s' in a plain literal is an invalid escape sequence.
    index_list = []
    location_list = []
    speaker_list = []
    channel_list = []
    read_list = []
    with open(meta_key, 'r') as f:
        for line in f.readlines()[1:]:
            tokens = re.split(r'\s+', line.strip())
            speaker_id = 'sre16_eval_enroll_' + tokens[0]
            file_name = tokens[1]
            try:
                # pop() both fetches the file and prevents it being listed twice.
                file_loc = file_list.pop(file_name)
            except KeyError:
                continue
            index_list.append('{}-sre16_eval_enroll_{}'.format(speaker_id, file_name))
            location_list.append(file_loc)
            speaker_list.append(speaker_id)
            channel_list.append(1)
            read_list.append('sph2pipe -f wav -p -c 1 {}'.format(file_loc))

    print('Made {:d} enrollment files.'.format(len(index_list)))
    enrollment_data = np.vstack([index_list, location_list, channel_list, speaker_list, read_list])

    file_list = get_file_list_as_dict(join_path(sre_loc, 'data/test'))

    segment_key = join_path(sre_loc, 'docs/sre16_eval_segment_key.tsv')
    language_key = join_path(sre_loc, 'metadata/calls.tsv')
    trial_key = join_path(sre_loc, 'docs/sre16_eval_trial_key.tsv')

    utt_to_call = dict()
    with open(segment_key, 'r') as f:
        for line in f.readlines()[1:]:
            tokens = re.split(r'\s+', line.strip())
            utt_to_call[tokens[0]] = tokens[1]

    call_to_language = dict()
    with open(language_key, 'r') as f:
        for line in f.readlines()[1:]:
            tokens = re.split(r'\s+', line.strip())
            call_to_language[tokens[0]] = tokens[1]

    index_list = []
    location_list = []
    speaker_list = []
    channel_list = []
    read_list = []
    # Per-trial metadata kept parallel to the lists above; it is not part of
    # the returned array.
    language_list = []
    target_list = []
    with open(trial_key, 'r') as f:
        for line in f.readlines()[1:]:
            tokens = re.split(r'\s+', line.strip())
            # NOTE(review): trials are not filtered on target_type, so the first
            # trial row that references a file decides its speaker id, and the
            # test speaker id carries the 'sre16_eval_enroll_' prefix - confirm.
            speaker_id = 'sre16_eval_enroll_' + tokens[0]
            file_name = tokens[1]
            target_type = tokens[3]
            call_id = utt_to_call[file_name]
            # Resolve every lookup that may fail BEFORE touching any list, so a
            # trial is either recorded completely or skipped completely. The
            # previous code appended first; a missing language entry then left
            # the file undeleted and it was listed again by later trial rows.
            try:
                file_loc = file_list[file_name]
                language = call_to_language[call_id]
            except KeyError:
                continue
            index_list.append('{}-sre16_eval_test_{}'.format(speaker_id, file_name))
            location_list.append(file_loc)
            speaker_list.append(speaker_id)
            channel_list.append(1)
            read_list.append('sph2pipe -f wav -p -c 1 {}'.format(file_loc))
            language_list.append(language)
            target_list.append(target_type)
            del file_list[file_name]

    print('Made {:d} test files.'.format(len(index_list)))
    test_data = np.vstack([index_list, location_list, channel_list, speaker_list, read_list])
    return np.hstack([enrollment_data, test_data])
예제 #9
0
def make_sre10_data(data_root, data_loc):
    """Assemble the SRE 2010 utterance listing from its model, train and trial keys.

    The model key maps model ids to speaker ids; models whose speaker is
    ``NOT_SCORED`` are ignored. Every row of the train list whose file is
    found on disk and whose model is known is listed. After that, every
    ``target`` trial meeting the same conditions is listed; such a file is
    removed from the lookup table once used, so it is listed by at most one
    trial.

    Args:
        data_root: Base directory the location is joined onto.
        data_loc: Location of the corpus relative to ``data_root``.

    Returns:
        ``np.vstack`` of five parallel rows: utterance index, file location,
        channel, speaker id and ``sph2pipe`` read command.
    """
    print('Making sre2010 lists...')
    sre_loc = join_path(data_root, data_loc)

    model_key = join_path(sre_loc, 'keys/coreext.modelkey.csv')
    train_key = join_path(sre_loc, 'train/coreext.trn')
    trials_key = join_path(sre_loc, 'keys/coreext-coreext.trialkey.csv')

    file_list = get_file_list_as_dict(join_path(sre_loc, 'data'))

    index_list = []
    location_list = []
    speaker_list = []
    channel_list = []
    read_list = []
    model_to_speaker = dict()
    with open(model_key, 'r') as f:
        for line in f.readlines()[1:]:
            tokens = re.split('[,]+', line.strip())
            model_id = tokens[0]
            speaker_id = tokens[1]
            if speaker_id != 'NOT_SCORED':
                model_to_speaker[model_id] = speaker_id

    with open(train_key, 'r') as f:
        for line in f:
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            tokens = re.split(r'[\s:]+', line.strip())
            model_id = tokens[0]
            # The file name is the third path component, without '.sph'.
            file_name = tokens[2].split('/')[2].split('.sph')[0]
            channel = 1 if tokens[3] == 'A' else 2
            try:
                file_loc = file_list[file_name]
                speaker_id = 'sre2010_' + model_to_speaker[model_id]
            except KeyError:
                # Missing audio file or unknown / unscored model: skip the row.
                continue
            index_list.append('{}-sre2010_{}_ch{}'.format(speaker_id, file_name, channel))
            location_list.append(file_loc)
            speaker_list.append(speaker_id)
            channel_list.append(channel)
            read_list.append('sph2pipe -f wav -p -c {} {}'.format(channel, file_loc))

    with open(trials_key, 'r') as f:
        for line in f:
            tokens = re.split('[,]+', line.strip())
            model_id = tokens[0]
            file_name = tokens[1]
            channel = 1 if tokens[2] == 'A' else 2
            target_type = tokens[3]
            try:
                speaker_id = 'sre2010_' + model_to_speaker[model_id]
                file_loc = file_list[file_name]
            except KeyError:
                # Unknown model, missing file, or file already used by a target trial.
                continue
            if target_type != 'target':
                continue
            index_list.append('{}-sre2010_{}_ch{}'.format(speaker_id, file_name, channel))
            location_list.append(file_loc)
            speaker_list.append(speaker_id)
            channel_list.append(channel)
            read_list.append('sph2pipe -f wav -p -c {} {}'.format(channel, file_loc))
            del file_list[file_name]

    print('Made {:d} files from sre2010.'.format(len(index_list)))
    return np.vstack([index_list, location_list, channel_list, speaker_list, read_list])