Пример #1
0
def gen_dummy_meta(num_spk, num_utt_per_spk):
    ''' Build an in-memory dummy meta object filled with placeholder paths.

    Utterance keys have the form '<spk_idx>_<utt_idx>' and the speaker key is
    the speaker index rendered as a string. The feat/vad attributes are set to
    placeholder path strings; nothing is written to disk here.

    Args:
      num_spk: number of speakers to create.
      num_utt_per_spk: number of utterances created for every speaker.

    Returns:
      A kaldi_dir.KaldiMetaData holding num_spk * num_utt_per_spk utterances,
      after collect_spks_from_utts() has been called on it.
    '''
    dummy = kaldi_dir.KaldiMetaData()
    for speaker_no in range(num_spk):
        # The speaker key does not depend on the utterance index.
        speaker_id = str(speaker_no)
        for utt_no in range(num_utt_per_spk):
            utt_id = '%s_%d' % (speaker_id, utt_no)
            entry = kaldi_dir.Utt()
            entry.feat = 'foo/bar/feat/%s' % (utt_id)
            entry.vad = 'foo/bar/vad/%s' % (utt_id)
            entry.spk = speaker_id
            dummy.utts[utt_id] = entry
    dummy.collect_spks_from_utts()
    return dummy
Пример #2
0
def gen_dummy_data_dir(data_dir,
                       num_spk,
                       num_utt_per_spk,
                       feat_len=100,
                       feat_dim=40):
    ''' Write a dummy data directory and return the meta loaded back from it.

    For every utterance an all-ones float32 feature matrix of shape
    (feat_len, feat_dim) and an all-ones float32 vad vector of shape
    (feat_len,) are generated. The meta is dumped first, then feats.ark /
    feats.scp and vad.ark / vad.scp are written with kaldiio, and finally the
    directory is loaded into a fresh meta object.

    Args:
      data_dir: directory to populate; created if it does not exist.
      num_spk: number of speakers to create.
      num_utt_per_spk: number of utterances created for every speaker.
      feat_len: number of rows of every feature matrix / length of every vad.
      feat_dim: number of columns of every feature matrix.

    Returns:
      A kaldi_dir.KaldiMetaData loaded from data_dir.
    '''
    os.makedirs(data_dir, exist_ok=True)

    dummy = kaldi_dir.KaldiMetaData()
    feat_by_utt = {}
    vad_by_utt = {}
    for speaker_no in range(num_spk):
        speaker_id = str(speaker_no)
        for utt_no in range(num_utt_per_spk):
            utt_id = '%s_%d' % (speaker_id, utt_no)
            entry = kaldi_dir.Utt()
            # A fresh array per utterance, so no two keys share a buffer.
            feat_by_utt[utt_id] = np.ones((feat_len, feat_dim),
                                          dtype='float32')
            entry.featlen = feat_len
            vad_by_utt[utt_id] = np.ones((feat_len, ), dtype='float32')
            entry.spk = speaker_id
            dummy.utts[utt_id] = entry
    dummy.collect_spks_from_utts()
    dummy.dump(data_dir, True)

    # The archives are written after the dump, features first and vad second.
    archives = (
        ('feats.ark', 'feats.scp', feat_by_utt),
        ('vad.ark', 'vad.scp', vad_by_utt),
    )
    for ark_name, scp_name, arrays in archives:
        ark_path = os.path.join(data_dir, ark_name)
        scp_path = os.path.join(data_dir, scp_name)
        kaldiio.save_ark(ark_path, arrays, scp=scp_path, text=True)

    reloaded = kaldi_dir.KaldiMetaData()
    reloaded.load(data_dir)
    return reloaded
Пример #3
0
 def test_dump_and_load(self):
     ''' Round-trip a dummy meta through dump() and load() and compare keys.

     Checks that the loaded meta has the same number of utterances and
     speakers as the dumped one and that every original key is present.
     '''
     num_spk, num_utt_per_spk = 5, 3
     out_dir = self.get_temp_dir()

     original = gen_dummy_meta(num_spk, num_utt_per_spk)
     original.dump(out_dir, True)

     # Log the dumped feats.scp to make failures easier to inspect.
     scp_path = os.path.join(out_dir, 'feats.scp')
     with open(scp_path, 'r') as scp_file:
         scp_text = scp_file.read()
     logging.info('feats.scp:\n%s' % (scp_text))

     restored = kaldi_dir.KaldiMetaData()
     restored.load(out_dir)

     # Utterances are checked before speakers, sizes before membership.
     for table in ('utts', 'spks'):
         expected = getattr(original, table)
         actual = getattr(restored, table)
         self.assertEqual(len(expected), len(actual))
         for key in expected.keys():
             self.assertIn(key, actual)
Пример #4
0
def _str_to_bool(text):
    ''' Convert a command line string such as 'true' or '0' to a bool.

    argparse passes the raw option string to the `type` callable, and
    bool('False') is True, so a dedicated parser is required for flags that
    take an explicit value.

    Raises:
      argparse.ArgumentTypeError: if the text is not a recognised spelling.
    '''
    lowered = text.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', 'off'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got %r' % (text, ))


def main():
    ''' Split a data directory into a training part and a CV part.

    Command line:
      data_dir               directory loaded as the source meta.
      data_dir_tr            directory the training meta is dumped to.
      data_dir_cv            directory the CV meta is dumped to.
      --num-spk-cv FLOAT     forwarded as num_spk_cv (default 0).
      --num-utt-cv FLOAT     forwarded as num_utt_cv (default 0).
      --cv-spk-percent FLOAT if > 0, overrides num_spk_cv with percent / 100.
      --cv-utt-percent FLOAT if > 0, overrides num_utt_cv with percent / 100.
      --fair-choice BOOL     forwarded as fair_choice (default True); accepts
                             true/false, yes/no, on/off, 1/0 and t/f/y/n.

    When both num_spk_cv and num_utt_cv end up as 0, num_spk_cv falls back to
    0.1. Both output directories are dumped with overwrite=True.

    Raises:
      ValueError: if --cv-spk-percent or --cv-utt-percent is >= 100.
    '''
    logging.set_verbosity(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--num-spk-cv', type=float, default=0)
    parser.add_argument('--num-utt-cv', type=float, default=0)
    parser.add_argument('--cv-spk-percent', type=float, default=0.0)
    parser.add_argument('--cv-utt-percent', type=float, default=0.0)
    # type=bool would turn every non-empty string, including 'False', into
    # True, so the value is parsed explicitly.
    parser.add_argument('--fair-choice', type=_str_to_bool, default=True)
    parser.add_argument('data_dir')
    parser.add_argument('data_dir_tr')
    parser.add_argument('data_dir_cv')

    args = parser.parse_args()

    num_spk_cv = args.num_spk_cv
    num_utt_cv = args.num_utt_cv
    # A positive percentage takes precedence over the corresponding count.
    if args.cv_spk_percent > 0:
        if args.cv_spk_percent >= 100:
            raise ValueError('cv_spk_percent cannot >= 100')
        num_spk_cv = args.cv_spk_percent / 100
    if args.cv_utt_percent > 0:
        if args.cv_utt_percent >= 100:
            raise ValueError('cv_utt_percent cannot >= 100')
        num_utt_cv = args.cv_utt_percent / 100
    # Nothing requested at all: fall back to num_spk_cv = 0.1.
    if num_spk_cv == 0 and num_utt_cv == 0:
        num_spk_cv = 0.1

    meta = kaldi_dir.KaldiMetaData()
    meta.load(args.data_dir)
    meta_tr, meta_cv = kaldi_dir_utils.subset_data_dir_tr_cv(
        meta,
        num_spk_cv=num_spk_cv,
        num_utt_cv=num_utt_cv,
        fair_choice=args.fair_choice)
    logging.info('#spks tr: %d, cv: %d; #utts tr: %d, cv: %d',
                 len(meta_tr.spks), len(meta_cv.spks),
                 len(meta_tr.utts), len(meta_cv.utts))

    meta_tr.dump(args.data_dir_tr, overwrite=True)
    meta_cv.dump(args.data_dir_cv, overwrite=True)