def test_cut_with_temporal_array_move_to_memory_large_offset():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    cut.start = 10.0
    cut.duration = 1.5

    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as w:
        arr = np.array(
            np.arange(
                compute_num_frames(cut.duration, frame_shift=0.01, sampling_rate=16000)
            )
        )
        cut.custom_array = w.store_array(
            key="dummy-key",
            value=arr,
            frame_shift=0.01,
            temporal_dim=0,
            start=cut.start,
        )

        cut_mem = cut.move_to_memory()
        arr_mem = cut_mem.load_custom_array()

        assert arr.dtype == arr_mem.dtype
        np.testing.assert_equal(arr, arr_mem)

        arr_trunc = cut.truncate(duration=0.5).load_custom_array()
        arr_mem_trunc = cut_mem.truncate(duration=0.5).load_custom_array()

        assert arr_trunc.dtype == arr_mem_trunc.dtype
        np.testing.assert_equal(arr_trunc, arr_mem_trunc)
def test_cutset_from_webdataset_sharded_pipe():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json")
    cut = cuts[0]
    cuts = []
    for i in range(10):
        cuts.append(fastcopy(cut, id=cut.id + "-" + str(i)))
    cuts = CutSet.from_cuts(cuts)

    with TemporaryDirectory() as dir_path:
        tar_pattern = f"pipe:gzip -c > {dir_path}/shard-%06d.tar.gz"
        export_to_webdataset(cuts, output_path=tar_pattern, shard_size=2)

        # disabling shard shuffling for testing purposes here
        cuts_ds = CutSet.from_webdataset(
            "pipe:gunzip -c " + dir_path + "/shard-{000000..000004}.tar.gz",
            shuffle_shards=False,
        )

        assert list(cuts.ids) == list(cuts_ds.ids)
        for c, cds in zip(cuts, cuts_ds):
            np.testing.assert_equal(c.load_audio(), cds.load_audio())
            np.testing.assert_almost_equal(
                c.load_features(), cds.load_features(), decimal=2
            )
def test_webdataset_sampler_epoch_increment():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json").repeat(10)

    with TemporaryDirectory() as dir_path:
        tar_pattern = f"{dir_path}/shard-%06d.tar"
        export_to_webdataset(cuts, output_path=tar_pattern, shard_size=1)

        cuts_ds = CutSet.from_webdataset(
            [str(p) for p in Path(dir_path).glob("*.tar")], shuffle_shards=True
        )
        sampler = DynamicCutSampler(cuts_ds, max_cuts=1)
        dloader = DataLoader(
            IterableDatasetWrapper(DummyDataset(), sampler, auto_increment_epoch=True),
            batch_size=None,
            num_workers=1,
            persistent_workers=True,
        )

        epoch_batches = {}
        for epoch in [0, 1]:
            batches = []
            for batch in dloader:
                for cut in batch:
                    batches.append(cut)
            epoch_batches[epoch] = CutSet.from_cuts(batches)

        # Both epochs have the same cut IDs.
        assert sorted(epoch_batches[0].ids) == sorted(epoch_batches[1].ids)
        # Both epochs have different cut order (shards were re-shuffled).
        assert list(epoch_batches[0].ids) != list(epoch_batches[1].ids)
def extract_cuts(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.
    """
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)
    feature_extractor = (
        FeatureExtractor.from_yaml(feature_manifest)
        if feature_manifest is not None
        else Fbank()
    )
    cuts = cuts.compute_and_store_features(
        extractor=feature_extractor,
        storage_path=storage_path,
        num_jobs=num_jobs,
        storage_type=get_writer(storage_type),
    )
    Path(output_cutset).parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
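# Illustrative sketch (not part of the CLI command above): the same feature
# extraction done directly through the Python API, with hypothetical manifest
# paths and job count chosen only for the example.
def example_extract_cuts_programmatically():
    from lhotse import CutSet, Fbank

    cuts = CutSet.from_file("cuts.jsonl.gz")
    cuts = cuts.compute_and_store_features(
        extractor=Fbank(),
        storage_path="feats",
        num_jobs=4,
    )
    cuts.to_file("cuts_with_feats.jsonl.gz")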
def test_cut_move_to_memory_load_custom_false():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    cut.custom_array = Array("irrelevant", "irrelevant", "irrelevant", [10])

    cut_mem = cut.move_to_memory(load_custom=False)

    assert cut.custom_array == cut_mem.custom_array  # nothing was copied
def test_cut_move_to_memory_load_features_false():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    assert cut.has_features

    cut_mem = cut.move_to_memory(load_features=False)

    assert cut.features == cut_mem.features  # nothing was copied
def test_cut_move_to_memory_load_audio_false():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    assert cut.has_recording

    cut_mem = cut.move_to_memory(load_audio=False)

    assert cut.recording == cut_mem.recording  # nothing was copied
def test_cut_with_features_move_to_memory():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    arr = cut.load_features()
    assert arr is not None

    cut_mem = cut.move_to_memory()
    arr_mem = cut_mem.load_features()

    np.testing.assert_almost_equal(arr, arr_mem, decimal=2)
def test_cut_with_array_move_to_memory():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]

    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as w:
        arr = np.array([0, 1, 2, 3])
        cut.custom_array = w.store_array(key="dummy-key", value=arr)

        cut_mem = cut.move_to_memory()
        arr_mem = cut_mem.load_custom_array()

        assert arr.dtype == arr_mem.dtype
        np.testing.assert_equal(arr, arr_mem)
def test_features_move_to_memory():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    feats = cut.features
    assert feats is not None
    arr = feats.load()

    feats_mem = feats.move_to_memory()
    arr_mem = feats_mem.load()

    np.testing.assert_equal(arr, arr_mem)
def prepare_data(total_cuts: int, root: Pathlike) -> Tuple[int, List[str]]:
    """
    Loads a cutset with 1 cut, repeats it a few times, and stores shards
    in a tmp dir with 1 cut per shard for easy testing arithmetic.
    """
    cuts = CutSet.from_file("test/fixtures/libri/cuts_no_feats.json").repeat(total_cuts)
    Path(root).mkdir(exist_ok=True)
    n_shards = export_to_webdataset(
        cuts,
        f"{root}/shard-%06d.tar",
        shard_size=1,
        audio_format="wav",
        verbose=False,
    )
    return n_shards, sorted(cuts.ids)
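def test_prepare_data_shard_count_example():
    """
    Hypothetical usage example of the helper above (not from the original suite):
    with shard_size=1, the number of shards written should equal the number of
    cuts requested, and the returned IDs cover every cut.
    """
    with TemporaryDirectory() as root:
        n_shards, cut_ids = prepare_data(total_cuts=10, root=root)
        assert n_shards == 10
        assert len(cut_ids) == 10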
def train_cuts(self) -> CutSet:
    logging.info("About to get train cuts")
    path = (
        self.args.feature_dir
        / f"gigaspeech_cuts_{self.args.subset}{get_context_suffix(self.args)}.jsonl.gz"
    )
    if self.args.subset in ["L", "XL"]:
        # The "L" and "XL" partitions are large enough that we have to read their manifests lazily:
        # the "CutSet" holds a file handle and reads the items sequentially on-the-fly to avoid
        # wasting memory and time pre-reading everything. Some operations on "CutSet" won't work,
        # e.g. shuffling (or they would have read everything into memory in the process).
        # We expect that the manifests read lazily are pre-shuffled, otherwise you might experience
        # issues with convergence.
        cuts_train = CutSet.from_jsonl_lazy(path)
    else:
        # For other subsets, just read everything into memory.
        cuts_train = CutSet.from_file(path)
    return cuts_train
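# Sketch of the pre-shuffling that the comment above relies on: a one-off, eager
# shuffle done at manifest-preparation time and written back to disk, so that
# lazy sequential reads during training already see a randomized order
# (the paths below are hypothetical):
#
#     cuts = CutSet.from_file("gigaspeech_cuts_XL_raw.jsonl.gz")
#     cuts.shuffle().to_file("gigaspeech_cuts_XL.jsonl.gz")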
def test_export_to_webdataset():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json")
    cut = cuts[0]
    cuts = []
    for i in range(10):
        cuts.append(fastcopy(cut, id=cut.id + "-" + str(i)))
    cuts = CutSet.from_cuts(cuts)

    with NamedTemporaryFile(suffix=".tar") as f:
        export_to_webdataset(cuts, output_path=f.name)
        f.flush()

        ds = webdataset.WebDataset(f.name)
        dicts = (pickle.loads(data["data"]) for data in ds)
        cuts_ds = CutSet.from_dicts(dicts)

        assert list(cuts.ids) == list(cuts_ds.ids)
def extract_cuts_batch(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
    batch_duration: Seconds,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.

    This version enables CUDA acceleration for feature extractors
    that support it (e.g., kaldifeat extractors).

    \b
    Example usage of kaldifeat fbank with CUDA:

        $ pip install kaldifeat  # note: ensure it's compiled with CUDA
        $ lhotse feat write-default-config -f kaldifeat-fbank feat.yml
        $ sed 's/device: cpu/device: cuda/' feat.yml > feat-cuda.yml
        $ lhotse feat extract-cuts-batch -f feat-cuda.yml cuts.jsonl cuts_with_feats.jsonl feats.h5
    """
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)
    feature_extractor = (
        FeatureExtractor.from_yaml(feature_manifest)
        if feature_manifest is not None
        else Fbank()
    )
    cuts = cuts.compute_and_store_features_batch(
        extractor=feature_extractor,
        storage_path=storage_path,
        batch_duration=batch_duration,
        num_workers=num_jobs,
        storage_type=get_writer(storage_type),
    )
    Path(output_cutset).parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
def test_cutset_from_webdataset():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json")
    cut = cuts[0]
    cuts = []
    for i in range(10):
        cuts.append(fastcopy(cut, id=cut.id + "-" + str(i)))
    cuts = CutSet.from_cuts(cuts)

    with NamedTemporaryFile(suffix=".tar") as f:
        export_to_webdataset(cuts, output_path=f.name)
        f.flush()

        cuts_ds = CutSet.from_webdataset(f.name)

        assert list(cuts.ids) == list(cuts_ds.ids)
        for c, cds in zip(cuts, cuts_ds):
            np.testing.assert_equal(c.load_audio(), cds.load_audio())
            np.testing.assert_almost_equal(
                c.load_features(), cds.load_features(), decimal=2
            )
def cuts():
    return CutSet.from_file("test/fixtures/libri/cuts.json")
def main():
    fix_random_seed(42)

    start_epoch = 0
    num_epochs = 8

    exp_dir = "exp-lstm-adam-ctc-musan"
    setup_logger("{}/log/log-train".format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f"{exp_dir}/tensorboard")

    # load L, G, symbol_table
    lang_dir = Path("data/lang_nosp")
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / "phones.txt")
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / "words.txt")

    logging.info("Loading L.fst")
    if (lang_dir / "Linv.pt").exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / "Linv.pt"))
    else:
        with open(lang_dir / "L.fst.txt") as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / "Linv.pt")

    graph_compiler = CtcTrainingGraphCompiler(
        L_inv=L_inv,
        phones=phone_symbol_table,
        words=word_symbol_table,
    )
    phone_ids = get_phone_symbols(phone_symbol_table)

    # load dataset
    feature_dir = Path("exp/data")
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_file(feature_dir / "gigaspeech_cuts_S.jsonl.gz")
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_file(
        feature_dir / "gigaspeech_cuts_DEV.jsonl.gz").subset(first=1000)
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / "cuts_musan.json.gz")

    logging.info("About to create train dataset")
    train = K2SpeechRecognitionDataset(
        cuts_train,
        cut_transforms=[
            CutConcatenate(),
            CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20)),
        ],
    )
    train_sampler = SingleCutSampler(
        cuts_train,
        max_frames=90000,
        shuffle=True,
    )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train,
                                           sampler=train_sampler,
                                           batch_size=None,
                                           num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate,
                                           sampler=valid_sampler,
                                           batch_size=None,
                                           num_workers=1)

    if not torch.cuda.is_available():
        logging.error("No GPU detected!")
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device("cuda", device_id)
    model = TdnnLstm1b(
        num_features=80,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=4,
    )
    model.to(device)
    describe(model)

    learning_rate = 1e-3
    optimizer = optim.AdamW(model.parameters(),
                            lr=learning_rate,
                            weight_decay=5e-4)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, "best_model.pt")
    best_epoch_info_filename = os.path.join(exp_dir, "best-epoch-info")
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir, "epoch-{}.pt".format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path, model=model, optimizer=optimizer)
        best_objf = ckpt["objf"]
        best_valid_objf = ckpt["valid_objf"]
        global_batch_idx_train = ckpt["global_batch_idx_train"]
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        curr_learning_rate = 1e-3
        # curr_learning_rate = learning_rate * pow(0.4, epoch)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = curr_learning_rate

        tb_writer.add_scalar("learning_rate", curr_learning_rate, epoch)

        logging.info("epoch {}, learning rate {}".format(epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(
                filename=best_model_path,
                model=model,
                epoch=epoch,
                optimizer=None,
                scheduler=None,
                learning_rate=curr_learning_rate,
                objf=objf,
                valid_objf=valid_objf,
                global_batch_idx_train=global_batch_idx_train,
            )
            save_training_info(
                filename=best_epoch_info_filename,
                model_path=best_model_path,
                current_epoch=epoch,
                learning_rate=curr_learning_rate,
                objf=best_objf,
                best_objf=best_objf,
                valid_objf=valid_objf,
                best_valid_objf=best_valid_objf,
                best_epoch=best_epoch,
            )

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, "epoch-{}.pt".format(epoch))
        save_checkpoint(
            filename=model_path,
            model=model,
            optimizer=optimizer,
            scheduler=None,
            epoch=epoch,
            learning_rate=curr_learning_rate,
            objf=objf,
            valid_objf=valid_objf,
            global_batch_idx_train=global_batch_idx_train,
        )
        epoch_info_filename = os.path.join(exp_dir, "epoch-{}-info".format(epoch))
        save_training_info(
            filename=epoch_info_filename,
            model_path=model_path,
            current_epoch=epoch,
            learning_rate=curr_learning_rate,
            objf=objf,
            best_objf=best_objf,
            valid_objf=valid_objf,
            best_valid_objf=best_valid_objf,
            best_epoch=best_epoch,
        )

    logging.warning("Done")
def main():
    exp_dir = Path("exp-lstm-adam-ctc-musan")
    setup_logger("{}/log/log-decode".format(exp_dir), log_level="debug")

    # load L, G, symbol_table
    lang_dir = Path("data/lang_nosp")
    symbol_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / "phones.txt")
    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    if not os.path.exists(lang_dir / "HLG.pt"):
        print("Loading L_disambig.fst.txt")
        with open(lang_dir / "L_disambig.fst.txt") as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        print("Loading G.fst.txt")
        with open(lang_dir / "G.fst.txt") as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(
            L=L,
            G=G,
            H=ctc_topo,
            labels_disambig_id_start=first_phone_disambig_id,
            aux_labels_disambig_id_start=first_word_disambig_id,
        )
        torch.save(HLG.as_dict(), lang_dir / "HLG.pt")
    else:
        print("Loading pre-compiled HLG")
        d = torch.load(lang_dir / "HLG.pt")
        HLG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = Path("exp/data")
    print("About to get test cuts")
    cuts_test = CutSet.from_file(feature_dir / "gigaspeech_cuts_TEST.jsonl.gz")

    print("About to create test dataset")
    test = K2SpeechRecognitionDataset(cuts_test)
    sampler = SingleCutSampler(cuts_test, max_frames=100000)
    print("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test,
                                          batch_size=None,
                                          sampler=sampler,
                                          num_workers=1)

    # if not torch.cuda.is_available():
    #     logging.error('No GPU detected!')
    #     sys.exit(-1)

    print("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device("cuda")
    model = TdnnLstm1b(
        num_features=80,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=4,
    )

    checkpoint = os.path.join(exp_dir, "epoch-7.pt")
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()

    print("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)

    print("About to decode")
    results = decode(dataloader=test_dl,
                     model=model,
                     device=device,
                     HLG=HLG,
                     symbols=symbol_table)

    s = ""
    for ref, hyp in results:
        s += f"ref={ref}\n"
        s += f"hyp={hyp}\n"
    logging.info(s)

    # compute WER
    dists = [edit_distance(r, h) for r, h in results]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ["sub", "ins", "del", "total"]
    }
    total_words = sum(len(ref) for ref, _ in results)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    logging.info(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
def main():
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )

    output_dir = Path("exp/data")
    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = (output_dir /
                         f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz")

            if raw_cuts_path.is_file():
                print(f"{partition} already exists - skipping feature extraction.")
            else:
                # Note this step makes the recipe different than LibriSpeech:
                # We must filter out some utterances and remove punctuation to be consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(has_no_oov)
                print("Normalizing text in", partition)
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )
                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set +
                               cut_set.perturb_speed(0.9) +
                               cut_set.perturb_speed(1.1))
                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(f"{partition} already exists - skipping cutting into sub-segments.")
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk, we'll load it.
                    # This helps us avoid re-computing the features for different variants of
                    # context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)
                # Note this step makes the recipe different than LibriSpeech:
                # Since recordings are long, the initial CutSet has very long cuts with plenty of supervisions.
                # We cut these into smaller chunks centered around each supervision, possibly adding acoustic
                # context.
                print(f"About to split {partition} raw cuts into smaller chunks.")
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    min_duration=None if args.context_window <= 0.0 else args.context_window,
                    context_direction=args.context_direction,
                )
                if partition in ["L", "XL"]:
                    # Before storing the manifests, we want to pre-shuffle them,
                    # as the sampler won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    # `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    # data augmentation and feature computation for long recordings yet.
                    # Therefore, we sacrifice some storage for the ability to precompute
                    # features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )
                cut_set.to_file(cuts_path)

            # Remove cut_set so the next iteration can correctly infer whether it needs to
            # load the raw cuts from disk or not.
            del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(
                    recordings=combine(
                        part["recordings"] for part in musan_manifests.values()
                    )
                )
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f"{output_dir}/feats_musan",
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer,
                )
            )
            musan_cuts.to_file(musan_cuts_path)
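# Hedged sketch of the "chunked" storage mentioned in the comment above: passing
# `storage_type=ChunkedLilcomHdf5Writer` to the same call enables partial feature
# reads, with the trade-offs described there (arguments are illustrative only):
#
#     cut_set = cut_set.compute_and_store_features(
#         extractor=extractor,
#         storage_path=f"{output_dir}/feats_gigaspeech_{partition}",
#         storage_type=ChunkedLilcomHdf5Writer,
#         num_jobs=args.num_jobs,
#     )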