def __init__(self,
             processor,
             linear_x="",
             spatial="",
             linear_y=None,
             truncated=False):
    """
    Build readers for the input (linear_x), optional spatial features and
    the reference list (linear_y).

    Args:
        processor: feature processor applied downstream (stored as-is)
        linear_x (str): scp of the mixture/linear input features
        spatial (str): scp of spatial features; empty string disables them
        linear_y (list[str] | None): scp paths of the reference streams;
            None means "no references"
        truncated (bool): truncation flag (stored as-is)
    """
    self.processor = processor
    self.truncated = truncated
    self.linear_x = ScriptReader(linear_x)
    # BUG FIX: with the default linear_y=None the original iterated over
    # None and raised "TypeError: 'NoneType' object is not iterable".
    # Treat None as an empty reference list instead.
    self.linear_y = [ScriptReader(ly) for ly in (linear_y or [])]
    # spatial features are optional; empty string means "not provided"
    self.spatial = ScriptReader(spatial) if spatial else None
def __init__(self, data_dir):
    """
    Open the Kaldi-style data directory, requiring both feats.scp and
    spk2utt to exist.

    Raises:
        RuntimeError: if either required file is missing
    """
    feats_path = op.join(data_dir, "feats.scp")
    spk2utt_path = op.join(data_dir, "spk2utt")
    for required in (feats_path, spk2utt_path):
        if not op.exists(required):
            raise RuntimeError("Missing {}!".format(required))
    self.reader = ScriptReader(feats_path)
    self.spk2utt = Reader(spk2utt_path, num_tokens=-1)
def re_decide(enroll_xvector_scp, test_xvector_scp, threshold_value):
    """
    Score every test x-vector against the enrollment x-vector with cosine
    similarity and binarize the decision.

    Args:
        enroll_xvector_scp (str): scp of enrollment x-vectors; if it holds
            several utterances, the last one read is used (original behavior)
        test_xvector_scp (str): scp of test x-vectors
        threshold_value (float): accept when cosine similarity >= threshold

    Returns:
        dict: utterance id -> 1 (accept) or 0 (reject)

    Raises:
        RuntimeError: if the enrollment scp yields no x-vector (the original
            code fell through to a NameError in that case)
    """
    enroll_xvector = None
    for _, value in ScriptReader(enroll_xvector_scp):
        enroll_xvector = value
    if enroll_xvector is None:
        raise RuntimeError(
            "No x-vector found in {}".format(enroll_xvector_scp))
    # loop-invariant: hoist the enrollment norm out of the scoring loop
    enroll_norm = np.linalg.norm(enroll_xvector)
    re_decide_dict = {}
    for utt, test_xvector in ScriptReader(test_xvector_scp):
        # cosine similarity between enrollment and test embeddings
        dist = np.dot(enroll_xvector, test_xvector) / (
            enroll_norm * np.linalg.norm(test_xvector))
        re_decide_dict[utt] = 1 if dist >= threshold_value else 0
    return re_decide_dict
def test_archive_writer(ark, scp):
    """
    Round-trip random matrices and vectors through ArchiveWriter and read
    them back via ScriptReader, printing key/shape pairs.

    Args:
        ark (str): output archive path
        scp (str): output script-file path
    """
    # for matrix
    with ArchiveWriter(ark, scp) as writer:
        for i in range(10):
            mat = np.random.rand(100, 20)
            writer.write("mat-{:d}".format(i), mat)
    scp_reader = ScriptReader(scp)
    for key, mat in scp_reader:
        print("{0}: {1}".format(key, mat.shape))
    # for vector
    with ArchiveWriter(ark, scp) as writer:
        for i in range(10):
            vec = np.random.rand(100)
            writer.write("vec-{:d}".format(i), vec)
    scp_reader = ScriptReader(scp)
    for key, vec in scp_reader:
        print("{0}: {1}".format(key, vec.size))
    # BUG FIX: message previously misspelled the function name
    # ("test_archieve_writer")
    print("TEST *test_archive_writer* DONE!")
def run(args):
    """
    Compute a d-vector for every utterance in args.feats and save each one
    as a .npy file under args.dump_dir.
    """
    computer = NnetComputer(args.checkpoint, args.gpu)
    feats_reader = ScriptReader(args.feats)
    # create the dump directory on first use
    if not os.path.exists(args.dump_dir):
        os.makedirs(args.dump_dir)
    for utt_key, utt_feats in feats_reader:
        logger.info("Compute dvector on utterance {}...".format(utt_key))
        dvector = computer.compute(utt_feats)
        np.save(os.path.join(args.dump_dir, utt_key), dvector)
    logger.info("Compute over {:d} utterances".format(len(feats_reader)))
def ark2hdf_caching(scp_file, hdf_file):
    """
    Copy every feature matrix referenced by a Kaldi scp file into an HDF
    file, logging progress per utterance.

    Args:
        scp_file (str): input Kaldi script file
        hdf_file (str): output HDF file name
    """
    ark_reader = ScriptReader(scp_file)
    writer = vio.HDFWriter(file_name=hdf_file)
    # enumerate replaces the manual 1-based counter of the original
    for cnt, fn in enumerate(ark_reader.index_keys, start=1):
        # dump features
        writer.append(file_id=fn, feat=ark_reader[fn])
        print("%d. processed: %s" % (cnt, fn))
    writer.close()
def run(args):
    """
    Generate CTC alignments for each utterance in args.text, writing them
    to args.alignment and (optionally, raw-audio mode only) word boundaries
    to args.word_boundary.
    """
    print(f"Arguments in args:\n{pprint.pformat(vars(args))}", flush=True)
    aligner = CtcAligner(args.am, cpt_tag=args.am_tag, device_id=args.device_id)
    # raw-waveform models read audio directly; otherwise fall back to
    # precomputed (Kaldi) features
    if aligner.accept_raw:
        src_reader = AudioReader(args.feats_or_wav_scp,
                                 sr=args.sr,
                                 channel=args.channel)
    else:
        src_reader = ScriptReader(args.feats_or_wav_scp)
        # word boundaries need sample-rate timing, which feature input lacks
        if args.word_boundary:
            raise RuntimeError(
                "Now can't generate word boundary when using Kaldi's feature")
    txt_reader = Reader(args.text, num_tokens=-1, restrict=False)
    processor = TextPreProcessor(args.dict, space=args.space, spm=args.spm)
    # io_wrapper returns (is_stdout, file_handle)
    ali_stdout, ali_fd = io_wrapper(args.alignment, "w")
    wdb_stdout, wdb_fd = False, None
    if args.word_boundary:
        wdb_stdout, wdb_fd = io_wrapper(args.word_boundary, "w")
    done = 0
    tot_utts = len(src_reader)
    timer = SimpleTimer()
    for key, str_seq in txt_reader:
        done += 1
        logger.info(
            f"Generate alignment for utterance {key} ({done}/{tot_utts}) ...")
        # map transcript text to token ids for the aligner
        int_seq = processor.run(str_seq)
        wav_or_feats = src_reader[key]
        ali = aligner.run(wav_or_feats, int_seq)
        header = f"{ali['score']:.3f}, {len(ali['align_seq'])}"
        ali_fd.write(f"{key} {ali['align_str']}\n")
        logger.info(f"{key} ({header}) {ali['align_str']}")
        if wdb_fd:
            # assumes wav_or_feats' last axis is samples at args.sr — only
            # reachable in raw-audio mode (see guard above)
            dur = wav_or_feats.shape[-1] * 1.0 / args.sr
            wdb = gen_word_boundary(key, dur, ali["align_str"])
            wdb_fd.write("\n".join(wdb) + "\n")
    # close only real files, never stdout
    if not ali_stdout:
        ali_fd.close()
    if wdb_fd and not wdb_stdout:
        wdb_fd.close()
    cost = timer.elapsed()
    logger.info(f"Generate alignments for {tot_utts} utterance done, " +
                f"time cost = {cost:.2f}m")
def __init__(self,
             shuffle=True,
             mix_scp="",
             ref_scp="",
             emb_scp="",
             embed_format="kaldi",
             sr=16000):
    """
    Open readers for mixture audio, reference audio and speaker embeddings.

    Args:
        shuffle (bool): stored flag for downstream iteration order
        mix_scp (str): scp of mixture waveforms
        ref_scp (str): scp of reference waveforms
        emb_scp (str): scp/path list of speaker embeddings
        embed_format (str): "kaldi" or "numpy" embedding storage
        sr (int): waveform sample rate

    Raises:
        RuntimeError: on an unsupported embed_format
    """
    if embed_format not in ["kaldi", "numpy"]:
        raise RuntimeError(
            "Unknown embedding format {}".format(embed_format))
    self.mix = WaveReader(mix_scp, sr=sr)
    self.ref = WaveReader(ref_scp, sr=sr)
    # embeddings are vectors, hence matrix=False for the Kaldi reader
    if embed_format == "numpy":
        self.emb = NumpyReader(emb_scp)
    else:
        self.emb = ScriptReader(emb_scp, matrix=False)
    self.shuffle = shuffle
def run(args):
    """
    Run mask estimation per utterance and save one .npy mask per speaker
    under args.dump_dir/spk{i}/.
    """
    computer = NnetComputer(args.checkpoint, args.gpu)
    feats_conf = load_json(args.checkpoint, "feats.json")
    spectra = Processor(args.spectra, **feats_conf)
    # spatial features are optional extra inputs stacked onto the spectra
    spatial = ScriptReader(args.spatial) if args.spatial else None
    dump_dir = Path(args.dump_dir)
    dump_dir.mkdir(exist_ok=True, parents=True)
    num_done = 0
    for key, feats in spectra:
        logger.info("Compute on utterance {}...".format(key))
        if spatial:
            feats = np.hstack([feats, spatial[key]])
        spk_masks = computer.compute(feats)
        for i, m in enumerate(spk_masks):
            spk_dir = dump_dir / f"spk{i + 1:d}"
            spk_dir.mkdir(exist_ok=True)
            np.save(spk_dir / key, m)
        num_done += 1
    logger.info("Compute over {:d} utterances".format(num_done))
def run(args):
    """
    Run mask estimation per utterance and save one .npy mask per speaker
    under args.dump_dir/spk{i}/ (os.path variant).
    """
    computer = NnetComputer(args.checkpoint, args.gpu)
    feats_conf = load_json(args.checkpoint, "feats.json")
    spectra = Processor(args.spectra, **feats_conf)
    # spatial features are optional extra inputs stacked onto the spectra
    spatial = ScriptReader(args.spatial) if args.spatial else None
    num_done = 0
    for key, feats in spectra:
        logger.info("Compute on utterance {}...".format(key))
        if spatial:
            feats = np.hstack([feats, spatial[key]])
        spk_masks = computer.compute(feats)
        for i, m in enumerate(spk_masks):
            spk_dir = os.path.join(args.dump_dir, "spk{:d}".format(i + 1))
            make_dir(spk_dir)
            np.save(os.path.join(spk_dir, key), m)
        num_done += 1
    logger.info("Compute over {:d} utterances".format(num_done))
def test_multiprocess_script_reader(scp):
    """
    Check whether ScriptReader and SynchronizedScriptReader can be used
    from a multiprocessing pool; a TypeError (e.g. from pickling) is
    caught and reported rather than raised.
    """

    def _pool_read_all(scp_reader, reader_name):
        # one pool pass over every utterance in the reader
        pool = Pool(processes=2)
        try:
            result_list = list()
            for (utt_id, utt_path) in scp_reader.index_keys:
                # BUG FIX: args must be a tuple — the original "args = (utt_id)"
                # is just a parenthesized string, which apply_async splats
                # character by character
                result = pool.apply_async(scp_reader.__getitem__,
                                          args=(utt_id,))
                result_list.append(result)
            pool.close()
            pool.join()
            for result in result_list:
                print(result.get())
        except TypeError as e:
            print("Using {} leads to the error:\n".format(reader_name), e)
        finally:
            del scp_reader
            del pool

    # test ScriptReader
    _pool_read_all(ScriptReader(scp), "ScriptReader")
    # test SynchronizedScriptReader
    _pool_read_all(SynchronizedScriptReader(scp), "SynchronizedScriptReader")
    print("TEST *multiprocess_script_reader* DONE!")
def __init__(self,
             feats_scp: str,
             text: str,
             utt2num_frames: str,
             vocab_dict: Optional[Dict],
             skip_utts: str = "",
             min_token_num: int = 1,
             max_token_num: int = 400,
             max_frame_num: float = 3000,
             min_frame_num: float = 40) -> None:
    """
    Dataset over Kaldi features (feats.scp), filtering utterances by token
    count and frame count (duration measured along axis 0).
    """
    reader = ScriptReader(feats_scp)
    super(Dataset, self).__init__(reader,
                                  text,
                                  utt2num_frames,
                                  vocab_dict,
                                  max_dur=max_frame_num,
                                  min_dur=min_frame_num,
                                  dur_axis=0,
                                  skip_utts=skip_utts,
                                  min_token_num=min_token_num,
                                  max_token_num=max_token_num)
def test_script_reader(scp):
    """Iterate a script file and print each key with its matrix shape."""
    reader = ScriptReader(scp)
    for utt_key, matrix in reader:
        print("{0}: {1}".format(utt_key, matrix.shape))
    print("TEST *test_script_reader* DONE!")
def read_vad(vad_scp):
    """
    Load all VAD entries from a script file.

    Args:
        vad_scp (str): Kaldi-style scp of VAD data

    Returns:
        dict: utterance id -> VAD value
    """
    return {utt: vad for utt, vad in ScriptReader(vad_scp)}
def run(args):
    """
    Beam/greedy decode each utterance in args.feats_or_wav_scp, writing the
    1-best transcript to args.best and optionally an N-best list
    (args.dump_nbest) and per-hypothesis alignments (args.dump_align).
    """
    print(f"Arguments in args:\n{pprint.pformat(vars(args))}", flush=True)
    decoder = FasterDecoder(args.am,
                            cpt_tag=args.am_tag,
                            function=args.function,
                            device_id=args.device_id)
    # raw-waveform models read audio directly; otherwise use Kaldi features
    if decoder.accept_raw:
        src_reader = AudioReader(args.feats_or_wav_scp,
                                 sr=args.sr,
                                 channel=args.channel)
    else:
        src_reader = ScriptReader(args.feats_or_wav_scp)
    if args.lm:
        # a file path selects an ngram LM; otherwise treat it as a
        # checkpoint directory holding a neural LM
        if Path(args.lm).is_file():
            from aps.asr.lm.ngram import NgramLM
            lm = NgramLM(args.lm, args.dict)
            logger.info(
                f"Load ngram LM from {args.lm}, weight = {args.lm_weight}")
        else:
            lm = NnetEvaluator(args.lm,
                               device_id=args.device_id,
                               cpt_tag=args.lm_tag)
            logger.info(f"Load RNN LM from {args.lm}: epoch {lm.epoch}, " +
                        f"weight = {args.lm_weight}")
            lm = lm.nnet
    else:
        lm = None
    processor = TextPostProcessor(args.dict,
                                  space=args.space,
                                  show_unk=args.show_unk,
                                  spm=args.spm)
    # io_wrapper returns (is_stdout, file_handle)
    stdout_top1, top1 = io_wrapper(args.best, "w")
    topn = None
    if args.dump_nbest:
        stdout_topn, topn = io_wrapper(args.dump_nbest, "w")
        # NOTE(review): nbest > 1 is selected for "greedy_search" and 1 for
        # everything else — confirm this matches the decoder's functions
        if args.function == "greedy_search":
            nbest = min(args.beam_size, args.nbest)
        else:
            nbest = 1
        topn.write(f"{nbest}\n")
    ali_dir = args.dump_align
    if ali_dir:
        Path(ali_dir).mkdir(exist_ok=True, parents=True)
        logger.info(f"Dump alignments to dir: {ali_dir}")
    N = 0
    timer = SimpleTimer()
    # keep only the recognized beam-search options from the CLI namespace
    dec_args = dict(
        filter(lambda x: x[0] in beam_search_params, vars(args).items()))
    dec_args["lm"] = lm
    for key, src in src_reader:
        logger.info(f"Decoding utterance {key}...")
        nbest_hypos = decoder.run(src, **dec_args)
        # reuse of the name `nbest`: from here on it is the list of output
        # lines for this utterance, starting with the key
        nbest = [f"{key}\n"]
        for idx, hyp in enumerate(nbest_hypos):
            # remove SOS/EOS
            token = hyp["trans"][1:-1]
            trans = processor.run(token)
            score = hyp["score"]
            nbest.append(f"{score:.3f}\t{len(token):d}\t{trans}\n")
            if idx == 0:
                top1.write(f"{key}\t{trans}\n")
            if ali_dir:
                if hyp["align"] is None:
                    raise RuntimeError(
                        "Can not dump alignment out as it's None")
                np.save(f"{ali_dir}/{key}-nbest{idx+1}", hyp["align"].numpy())
        if topn:
            topn.write("".join(nbest))
        # flush outputs every 10 utterances
        if not (N + 1) % 10:
            top1.flush()
            if topn:
                topn.flush()
        N += 1
    # close only real files, never stdout
    if not stdout_top1:
        top1.close()
    if topn and not stdout_topn:
        topn.close()
    cost = timer.elapsed()
    logger.info(
        f"Decode {len(src_reader)} utterance done, time cost = {cost:.2f}m")
def run(args):
    """
    Batched beam decoding over args.feats_or_wav_scp: utterances are
    grouped into batches of args.batch_size (sorted by length), the 1-best
    transcript goes to args.best, and optionally an N-best list
    (args.dump_nbest) and alignments (args.dump_align) are dumped.
    """
    print(f"Arguments in args:\n{pprint.pformat(vars(args))}", flush=True)
    if args.batch_size == 1:
        warnings.warn("can use decode.py instead as batch_size == 1")
    decoder = BatchDecoder(args.am,
                           device_id=args.device_id,
                           cpt_tag=args.am_tag)
    # raw-waveform models read audio directly; otherwise use Kaldi features
    if decoder.accept_raw:
        src_reader = AudioReader(args.feats_or_wav_scp,
                                 sr=args.sr,
                                 channel=args.channel)
    else:
        src_reader = ScriptReader(args.feats_or_wav_scp)
    if args.lm:
        # a file path selects an ngram LM; otherwise treat it as a
        # checkpoint directory holding a neural LM
        if Path(args.lm).is_file():
            from aps.asr.lm.ngram import NgramLM
            lm = NgramLM(args.lm, args.dict)
            logger.info(
                f"Load ngram LM from {args.lm}, weight = {args.lm_weight}")
        else:
            lm = NnetEvaluator(args.lm,
                               device_id=args.device_id,
                               cpt_tag=args.lm_tag)
            logger.info(f"Load RNN LM from {args.lm}: epoch {lm.epoch}, " +
                        f"weight = {args.lm_weight}")
            lm = lm.nnet
    else:
        lm = None
    processor = TextPostProcessor(args.dict,
                                  space=args.space,
                                  show_unk=args.show_unk,
                                  spm=args.spm)
    # io_wrapper returns (is_stdout, file_handle)
    stdout_top1, top1 = io_wrapper(args.best, "w")
    topn = None
    if args.dump_nbest:
        stdout_topn, topn = io_wrapper(args.dump_nbest, "w")
        nbest = min(args.beam_size, args.nbest)
        topn.write(f"{nbest}\n")
    ali_dir = args.dump_align
    if ali_dir:
        Path(ali_dir).mkdir(exist_ok=True, parents=True)
        logger.info(f"Dump alignments to dir: {ali_dir}")
    done = 0
    timer = SimpleTimer()
    batches = []
    # keep only the recognized beam-search options from the CLI namespace
    dec_args = dict(
        filter(lambda x: x[0] in beam_search_params, vars(args).items()))
    dec_args["lm"] = lm
    tot_utts = len(src_reader)
    for key, src in src_reader:
        done += 1
        batches.append({
            "key": key,
            "inp": src,
            # samples for raw audio, frames (axis 0) for feature input
            "len": src.shape[-1] if decoder.accept_raw else src.shape[0]
        })
        # flush the final, possibly short batch at end of input
        end = (done == len(src_reader) and len(batches))
        if len(batches) != args.batch_size and not end:
            continue
        # decode — sort by length (descending) before batching
        batches = sorted(batches, key=lambda b: b["len"], reverse=True)
        batch_nbest = decoder.run([bz["inp"] for bz in batches], **dec_args)
        keys = [bz["key"] for bz in batches]
        for key, nbest in zip(keys, batch_nbest):
            logger.info(f"Decoding utterance {key} ({done}/{tot_utts}) ...")
            nbest_hypos = [f"{key}\n"]
            for idx, hyp in enumerate(nbest):
                # remove SOS/EOS
                token = hyp["trans"][1:-1]
                trans = processor.run(token)
                score = hyp["score"]
                nbest_hypos.append(f"{score:.3f}\t{len(token):d}\t{trans}\n")
                if idx == 0:
                    logger.info(f"{key} ({score:.3f}, {len(token):d}) {trans}")
                    top1.write(f"{key}\t{trans}\n")
                if ali_dir:
                    if hyp["align"] is None:
                        raise RuntimeError(
                            "Can not dump alignment out as it's None")
                    np.save(f"{ali_dir}/{key}-nbest{idx+1}",
                            hyp["align"].numpy())
            if topn:
                topn.write("".join(nbest_hypos))
        # flush after every decoded batch
        top1.flush()
        if topn:
            topn.flush()
        batches.clear()
    # close only real files, never stdout
    if not stdout_top1:
        top1.close()
    if topn and not stdout_topn:
        topn.close()
    cost = timer.elapsed()
    logger.info(f"Decode {tot_utts} utterance done, time cost = {cost:.2f}m")
def script_reader(scp):
    """Build a vector-mode (matrix=False) ScriptReader over the given scp."""
    return ScriptReader(scp, matrix=False)