def __make_lm_module(self, lm_path: str): if len(lm_path) <= 0: return None lm_config = get_model_conf(model_path=lm_path) lm_model_module = getattr(lm_config, "model_module", "default") lm_class = dynamic_import_lm(lm_model_module, lm_config.backend) lm = lm_class(len(self.train_args.char_list), lm_config) torch_load(lm_path, lm) lm.eval() return lm
def test_lm_trainable_and_decodable(lm_name, lm_args, device, dtype): if device == "cuda" and not torch.cuda.is_available(): pytest.skip("no cuda device is available") if device == "cpu" and dtype == "float16": pytest.skip( "cpu float16 implementation is not available in pytorch yet") dtype = getattr(torch, dtype) model, x, ilens, y, data, train_args = prepare("rnn", rnn_args) char_list = train_args.char_list n_vocab = len(char_list) lm = dynamic_import_lm(lm_name, backend="pytorch").build(n_vocab, **lm_args) lm.to(device=device, dtype=dtype) # test trainable a = torch.randint(1, n_vocab, (3, 2), device=device) b = torch.randint(1, n_vocab, (3, 2), device=device) loss, logp, count = lm(a, b) loss.backward() for p in lm.parameters(): assert p.grad is not None # test decodable model.to(device=device, dtype=dtype).eval() lm.eval() scorers = model.scorers() scorers["lm"] = lm scorers["length_bonus"] = LengthBonus(len(char_list)) weights = dict(decoder=1.0, lm=1.0, length_bonus=1.0) with torch.no_grad(): feat = x[0, :ilens[0]].to(device=device, dtype=dtype) enc = model.encode(feat) beam_size = 3 result = beam_search( x=enc, sos=model.sos, eos=model.eos, beam_size=beam_size, vocab_size=len(train_args.char_list), weights=weights, scorers=scorers, token_list=train_args.char_list, ) assert len(result) >= beam_size
def main(args): """Run the main function""" model_loc = "exp/" + args[0] + "/rnnlm.model.best" print(model_loc) train_args = get_model_conf(model_loc) #model_module = "espnet.nets.pytorch_backend.lm.default:DefaultRNNLM" model_class = dynamic_import_lm(train_args.model_module, train_args.backend) if train_args.train_dtype in ("float16", "float32", "float64"): dtype = getattr(torch, train_args.train_dtype) else: dtype = torch.float32 model = model_class(train_args.n_vocab, train_args).to(dtype=dtype) torch_load(model_loc, model) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print("Trainable parameters: ", params) print("Length vocabulary: ", train_args.n_vocab)
def test_beam_search_equal( model_class, args, ctc_weight, lm_weight, bonus, device, dtype ): if device == "cuda" and not torch.cuda.is_available(): pytest.skip("no cuda device is available") if device == "cpu" and dtype == "float16": pytest.skip("cpu float16 implementation is not available in pytorch yet") # seed setting torch.manual_seed(123) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = ( False # https://github.com/pytorch/pytorch/issues/6351 ) dtype = getattr(torch, dtype) model, x, ilens, y, data, train_args = prepare( model_class, args, mtlalpha=ctc_weight ) model.eval() char_list = train_args.char_list lm_args = Namespace(type="lstm", layer=1, unit=2, embed_unit=2, dropout_rate=0.0) lm = dynamic_import_lm("default", backend="pytorch")(len(char_list), lm_args) lm.eval() # test previous beam search args = Namespace( beam_size=3, penalty=bonus, ctc_weight=ctc_weight, maxlenratio=0, lm_weight=lm_weight, minlenratio=0, nbest=5, ) feat = x[0, : ilens[0]].numpy() # legacy beam search with torch.no_grad(): nbest = model.recognize(feat, args, char_list, lm.model) # new beam search scorers = model.scorers() if lm_weight != 0: scorers["lm"] = lm scorers["length_bonus"] = LengthBonus(len(char_list)) weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=args.lm_weight, length_bonus=args.penalty, ) model.to(device, dtype=dtype) model.eval() beam = BeamSearch( beam_size=args.beam_size, vocab_size=len(char_list), weights=weights, scorers=scorers, token_list=train_args.char_list, sos=model.sos, eos=model.eos, pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", ) beam.to(device, dtype=dtype) beam.eval() with torch.no_grad(): enc = model.encode(torch.as_tensor(feat).to(device, dtype=dtype)) nbest_bs = beam( x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio ) if dtype == torch.float16: # skip because results are different. just checking it is decodable return for i, (expected, actual) in enumerate(zip(nbest, nbest_bs)): actual = actual.asdict() assert expected["yseq"] == actual["yseq"] numpy.testing.assert_allclose(expected["score"], actual["score"], rtol=1e-6)
def main(cmd_args): """Train LM.""" parser = get_parser() args, _ = parser.parse_known_args(cmd_args) if args.backend == "chainer" and args.train_dtype != "float32": raise NotImplementedError( f"chainer backend does not support --train-dtype {args.train_dtype}." "Use --dtype float32.") if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"): raise ValueError( f"--train-dtype {args.train_dtype} does not support the CPU backend." ) # parse arguments dynamically model_class = dynamic_import_lm(args.model_module, args.backend) model_class.add_arguments(parser) if args.schedulers is not None: for k, v in args.schedulers: scheduler_class = dynamic_import_scheduler(v) scheduler_class.add_arguments(k, parser) opt_class = dynamic_import_optimizer(args.opt, args.backend) opt_class.add_arguments(parser) args = parser.parse_args(cmd_args) # logging info if args.verbose > 0: logging.basicConfig( level=logging.INFO, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s') else: logging.basicConfig( level=logging.WARN, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s') logging.warning('Skip DEBUG/INFO messages') # If --ngpu is not given, # 1. if CUDA_VISIBLE_DEVICES is set, all visible devices # 2. if nvidia-smi exists, use all devices # 3. else ngpu=0 if args.ngpu is None: cvd = os.environ.get("CUDA_VISIBLE_DEVICES") if cvd is not None: ngpu = len(cvd.split(',')) else: logging.warning("CUDA_VISIBLE_DEVICES is not set.") try: p = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) except (subprocess.CalledProcessError, FileNotFoundError): ngpu = 0 else: ngpu = len(p.stderr.decode().split('\n')) - 1 args.ngpu = ngpu else: ngpu = args.ngpu logging.info(f"ngpu: {ngpu}") # display PYTHONPATH logging.info('python path = ' + os.environ.get('PYTHONPATH', '(None)')) # seed setting nseed = args.seed random.seed(nseed) np.random.seed(nseed) # load dictionary with open(args.dict, 'rb') as f: dictionary = f.readlines() char_list = [entry.decode('utf-8').split(' ')[0] for entry in dictionary] char_list.insert(0, '<blank>') char_list.append('<eos>') args.char_list_dict = {x: i for i, x in enumerate(char_list)} args.n_vocab = len(char_list) # train logging.info('backend = ' + args.backend) if args.backend == "chainer": from espnet.lm.chainer_backend.lm import train train(args) elif args.backend == "pytorch": from espnet.lm.pytorch_backend.lm import train train(args) else: raise ValueError("Only chainer and pytorch are supported.")
def train(args): """Train with the given args. :param Namespace args: The program arguments :param type model_class: LMInterface class for training """ model_class = dynamic_import_lm(args.model_module, args.backend) assert issubclass(model_class, LMInterface), "model should implement LMInterface" # display torch version logging.info('torch version = ' + torch.__version__) set_deterministic_pytorch(args) # check cuda and cudnn availability if not torch.cuda.is_available(): logging.warning('cuda is not available') # get special label ids unk = args.char_list_dict['<unk>'] eos = args.char_list_dict['<eos>'] # read tokens as a sequence of sentences val, n_val_tokens, n_val_oovs = load_dataset(args.valid_label, args.char_list_dict, args.dump_hdf5_path) train, n_train_tokens, n_train_oovs = load_dataset(args.train_label, args.char_list_dict, args.dump_hdf5_path) logging.info('#vocab = ' + str(args.n_vocab)) logging.info('#sentences in the training data = ' + str(len(train))) logging.info('#tokens in the training data = ' + str(n_train_tokens)) logging.info('oov rate in the training data = %.2f %%' % (n_train_oovs / n_train_tokens * 100)) logging.info('#sentences in the validation data = ' + str(len(val))) logging.info('#tokens in the validation data = ' + str(n_val_tokens)) logging.info('oov rate in the validation data = %.2f %%' % (n_val_oovs / n_val_tokens * 100)) use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 # Create the dataset iterators batch_size = args.batchsize * max(args.ngpu, 1) if batch_size * args.accum_grad > args.batchsize: logging.info( f'batch size is automatically increased ({args.batchsize} -> {batch_size * args.accum_grad})' ) train_iter = ParallelSentenceIterator(train, batch_size, max_length=args.maxlen, sos=eos, eos=eos, shuffle=not use_sortagrad) val_iter = ParallelSentenceIterator(val, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False) epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad) logging.info('#iterations per epoch = %d' % epoch_iters) logging.info('#total iterations = ' + str(args.epoch * epoch_iters)) # Prepare an RNNLM model if args.train_dtype in ("float16", "float32", "float64"): dtype = getattr(torch, args.train_dtype) else: dtype = torch.float32 model = model_class(args.n_vocab, args).to(dtype=dtype) if args.ngpu > 0: model.to("cuda") gpu_id = list(range(args.ngpu)) else: gpu_id = [-1] # Save model conf to json model_conf = args.outdir + '/model.json' with open(model_conf, 'wb') as f: logging.info('writing a model config file to ' + model_conf) f.write( json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8')) # Set up an optimizer opt_class = dynamic_import_optimizer(args.opt, args.backend) optimizer = opt_class.from_args(model.parameters(), args) if args.schedulers is None: schedulers = [] else: schedulers = [ dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers ] # setup apex.amp if args.train_dtype in ("O0", "O1", "O2", "O3"): try: from apex import amp except ImportError as e: logging.error( f"You need to install apex for --train-dtype {args.train_dtype}. " "See https://github.com/NVIDIA/apex#linux") raise e model, optimizer = amp.initialize(model, optimizer, opt_level=args.train_dtype) use_apex = True else: use_apex = False # FIXME: TOO DIRTY HACK reporter = Reporter() setattr(model, "reporter", reporter) setattr(optimizer, "target", reporter) setattr(optimizer, "serialize", lambda s: reporter.serialize(s)) print('----------------------', gpu_id[0]) updater = BPTTUpdater(train_iter, model, optimizer, schedulers, gpu_id, gradclip=args.gradclip, use_apex=use_apex, accum_grad=args.accum_grad) trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.outdir) trainer.extend(LMEvaluator(val_iter, model, reporter, device=gpu_id)) trainer.extend( extensions.LogReport(postprocess=compute_perplexity, trigger=(args.report_interval_iters, 'iteration'))) trainer.extend(extensions.PrintReport([ 'epoch', 'iteration', 'main/loss', 'perplexity', 'val_perplexity', 'elapsed_time' ]), trigger=(args.report_interval_iters, 'iteration')) trainer.extend( extensions.ProgressBar(update_interval=args.report_interval_iters)) # Save best models trainer.extend(torch_snapshot(filename='snapshot.ep.{.updater.epoch}')) trainer.extend(snapshot_object(model, 'rnnlm.model.{.updater.epoch}')) # T.Hori: MinValueTrigger should be used, but it fails when resuming trainer.extend( MakeSymlinkToBestModel('validation/main/loss', 'rnnlm.model')) if use_sortagrad: trainer.extend( ShufflingEnabler([train_iter]), trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, 'epoch')) if args.resume: logging.info('resumed from %s' % args.resume) torch_resume(args.resume, trainer) set_early_stop(trainer, args, is_lm=True) if args.tensorboard_dir is not None and args.tensorboard_dir != "": writer = SummaryWriter(args.tensorboard_dir) trainer.extend(TensorboardLogger(writer), trigger=(args.report_interval_iters, 'iteration')) trainer.run() check_early_stop(trainer, args.epoch) # compute perplexity for test set if args.test_label: logging.info('test the best model') torch_load(args.outdir + '/rnnlm.model.best', model) test = read_tokens(args.test_label, args.char_list_dict) n_test_tokens, n_test_oovs = count_tokens(test, unk) logging.info('#sentences in the test data = ' + str(len(test))) logging.info('#tokens in the test data = ' + str(n_test_tokens)) logging.info('oov rate in the test data = %.2f %%' % (n_test_oovs / n_test_tokens * 100)) test_iter = ParallelSentenceIterator(test, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False) evaluator = LMEvaluator(test_iter, model, reporter, device=gpu_id) result = evaluator() compute_perplexity(result) logging.info(f"test perplexity: {result['perplexity']}")
def recog_v2(args): """Decode with custom models that implements ScorerInterface. Notes: The previous backend espnet.asr.pytorch_backend.asr.recog only supports E2E and RNNLM Args: args (namespace): The program arguments. See py:func:`espnet.bin.asr_recog.get_parser` for details """ logging.warning("experimental API for custom LMs is selected by --api v2") if args.batchsize > 1: raise NotImplementedError("multi-utt batch decoding is not implemented") if args.streaming_mode is not None: raise NotImplementedError("streaming mode is not implemented") if args.word_rnnlm: raise NotImplementedError("word LM is not implemented") set_deterministic_pytorch(args) model, train_args = load_trained_model(args.model) assert isinstance(model, ASRInterface) if args.quantize_config is not None: q_config = set([getattr(torch.nn, q) for q in args.quantize_config]) else: q_config = {torch.nn.Linear} if args.quantize_asr_model: logging.info("Use quantized asr model for decoding") # See https://github.com/espnet/espnet/pull/3616 for more information. if ( torch.__version__ < LooseVersion("1.4.0") and "lstm" in train_args.etype and torch.nn.LSTM in q_config ): raise ValueError( "Quantized LSTM in ESPnet is only supported with torch 1.4+." ) if args.quantize_dtype == "float16" and torch.__version__ < LooseVersion( "1.5.0" ): raise ValueError( "float16 dtype for dynamic quantization is not supported with torch " "version < 1.5.0. Switching to qint8 dtype instead." ) dtype = getattr(torch, args.quantize_dtype) model = torch.quantization.quantize_dynamic(model, q_config, dtype=dtype) model.eval() load_inputs_and_targets = LoadInputsAndTargets( mode="asr", load_output=False, sort_in_input_length=False, preprocess_conf=train_args.preprocess_conf if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) if args.rnnlm: lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) # NOTE: for a compatibility with less than 0.5.0 version models lm_model_module = getattr(lm_args, "model_module", "default") lm_class = dynamic_import_lm(lm_model_module, lm_args.backend) lm = lm_class(len(train_args.char_list), lm_args) torch_load(args.rnnlm, lm) if args.quantize_lm_model: logging.info("Use quantized lm model") dtype = getattr(torch, args.quantize_dtype) lm = torch.quantization.quantize_dynamic(lm, q_config, dtype=dtype) lm.eval() else: lm = None if args.ngram_model: from espnet.nets.scorers.ngram import NgramFullScorer from espnet.nets.scorers.ngram import NgramPartScorer if args.ngram_scorer == "full": ngram = NgramFullScorer(args.ngram_model, train_args.char_list) else: ngram = NgramPartScorer(args.ngram_model, train_args.char_list) else: ngram = None scorers = model.scorers() scorers["lm"] = lm scorers["ngram"] = ngram scorers["length_bonus"] = LengthBonus(len(train_args.char_list)) weights = dict( decoder=1.0 - args.ctc_weight, ctc=args.ctc_weight, lm=args.lm_weight, ngram=args.ngram_weight, length_bonus=args.penalty, ) beam_search = BeamSearch( beam_size=args.beam_size, vocab_size=len(train_args.char_list), weights=weights, scorers=scorers, sos=model.sos, eos=model.eos, token_list=train_args.char_list, pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", ) # TODO(karita): make all scorers batchfied if args.batchsize == 1: non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: beam_search.__class__ = BatchBeamSearch logging.info("BatchBeamSearch implementation is selected.") else: logging.warning( f"As non-batch scorers {non_batch} are found, " f"fall back to non-batch implementation." ) if args.ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") if args.ngpu == 1: device = "cuda" else: device = "cpu" dtype = getattr(torch, args.dtype) logging.info(f"Decoding device={device}, dtype={dtype}") model.to(device=device, dtype=dtype).eval() beam_search.to(device=device, dtype=dtype).eval() # read json data with open(args.recog_json, "rb") as f: js = json.load(f)["utts"] new_js = {} with torch.no_grad(): for idx, name in enumerate(js.keys(), 1): logging.info("(%d/%d) decoding " + name, idx, len(js.keys())) batch = [(name, js[name])] feat = load_inputs_and_targets(batch)[0][0] enc = model.encode(torch.as_tensor(feat).to(device=device, dtype=dtype)) nbest_hyps = beam_search( x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio ) nbest_hyps = [ h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)] ] new_js[name] = add_results_to_json( js[name], nbest_hyps, train_args.char_list ) with open(args.result_label, "wb") as f: f.write( json.dumps( {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True ).encode("utf_8") )
def test_batch_beam_search_equal( model_class, args, ctc_weight, lm_nn, lm_args, lm_weight, ngram_weight, bonus, device, dtype, ): if device == "cuda" and not torch.cuda.is_available(): pytest.skip("no cuda device is available") if device == "cpu" and dtype == "float16": pytest.skip( "cpu float16 implementation is not available in pytorch yet") # seed setting torch.manual_seed(123) torch.backends.cudnn.deterministic = True # https://github.com/pytorch/pytorch/issues/6351 torch.backends.cudnn.benchmark = False dtype = getattr(torch, dtype) model, x, ilens, y, data, train_args = prepare(model_class, args, mtlalpha=ctc_weight) model.eval() char_list = train_args.char_list lm = dynamic_import_lm(lm_nn, backend="pytorch")(len(char_list), lm_args) lm.eval() root = os.path.dirname(os.path.abspath(__file__)) ngram = NgramFullScorer(os.path.join(root, "beam_search_test.arpa"), args.char_list) # test previous beam search args = Namespace( beam_size=3, penalty=bonus, ctc_weight=ctc_weight, maxlenratio=0, lm_weight=lm_weight, ngram_weight=ngram_weight, minlenratio=0, nbest=5, ) # new beam search scorers = model.scorers() if lm_weight != 0: scorers["lm"] = lm if ngram_weight != 0: scorers["ngram"] = ngram scorers["length_bonus"] = LengthBonus(len(char_list)) weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=args.lm_weight, ngram=args.ngram_weight, length_bonus=args.penalty, ) model.to(device, dtype=dtype) model.eval() with torch.no_grad(): enc = model.encode(x[0, :ilens[0]].to(device, dtype=dtype)) legacy_beam = BeamSearch( beam_size=args.beam_size, vocab_size=len(char_list), weights=weights, scorers=scorers, token_list=train_args.char_list, sos=model.sos, eos=model.eos, pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", ) legacy_beam.to(device, dtype=dtype) legacy_beam.eval() beam = BatchBeamSearch( beam_size=args.beam_size, vocab_size=len(char_list), weights=weights, scorers=scorers, token_list=train_args.char_list, sos=model.sos, eos=model.eos, ) beam.to(device, dtype=dtype) beam.eval() with torch.no_grad(): legacy_nbest_bs = legacy_beam(x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio) nbest_bs = beam(x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio) for i, (expected, actual) in enumerate(zip(legacy_nbest_bs, nbest_bs)): assert expected.yseq.tolist() == actual.yseq.tolist() numpy.testing.assert_allclose(expected.score.cpu(), actual.score.cpu(), rtol=1e-6)
def recog_v2(args): """Decode with custom models that implements ScorerInterface. Notes: The previous backend espnet.asr.pytorch_backend.asr.recog only supports E2E and RNNLM Args: args (namespace): The program arguments. See py:func:`espnet.bin.asr_recog.get_parser` for details """ logging.warning("experimental API for custom LMs is selected by --api v2") if args.batchsize > 1: raise NotImplementedError( "multi-utt batch decoding is not implemented") if args.streaming_mode is not None: raise NotImplementedError("streaming mode is not implemented") if args.word_rnnlm: raise NotImplementedError("word LM is not implemented") set_deterministic_pytorch(args) model, train_args = load_trained_model(args.model) assert isinstance(model, ASRInterface) model.eval() load_inputs_and_targets = LoadInputsAndTargets( mode="asr", load_output=False, sort_in_input_length=False, preprocess_conf=train_args.preprocess_conf if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) if args.rnnlm: lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) # NOTE: for a compatibility with less than 0.5.0 version models lm_model_module = getattr(lm_args, "model_module", "default") lm_class = dynamic_import_lm(lm_model_module, lm_args.backend) lm = lm_class(len(train_args.char_list), lm_args) torch_load(args.rnnlm, lm) lm.eval() else: lm = None if args.ngram_model: from espnet.nets.scorers.ngram import NgramFullScorer from espnet.nets.scorers.ngram import NgramPartScorer if args.ngram_scorer == "full": ngram = NgramFullScorer(args.ngram_model, train_args.char_list) else: ngram = NgramPartScorer(args.ngram_model, train_args.char_list) else: ngram = None scorers = model.scorers() scorers["lm"] = lm scorers["ngram"] = ngram scorers["length_bonus"] = LengthBonus(len(train_args.char_list)) weights = dict( decoder=1.0 - args.ctc_weight, ctc=args.ctc_weight, lm=args.lm_weight, ngram=args.ngram_weight, length_bonus=args.penalty, ) beam_search = BeamSearch( beam_size=args.beam_size, vocab_size=len(train_args.char_list), weights=weights, scorers=scorers, sos=model.sos, eos=model.eos, token_list=train_args.char_list, pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", ) # TODO(karita): make all scorers batchfied if args.batchsize == 1: non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: beam_search.__class__ = BatchBeamSearch logging.info("BatchBeamSearch implementation is selected.") else: logging.warning(f"As non-batch scorers {non_batch} are found, " f"fall back to non-batch implementation.") if args.ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") if args.ngpu == 1: device = "cuda" else: device = "cpu" dtype = getattr(torch, args.dtype) logging.info(f"Decoding device={device}, dtype={dtype}") model.to(device=device, dtype=dtype).eval() beam_search.to(device=device, dtype=dtype).eval() # read json data with open(args.recog_json, "r") as f: # "rb" content = f.read() if content.startswith( "Warning! You haven't set Python environment yet. Go to /content/espnet/tools and generate 'activate_python.sh'" ): train_json = json.loads( content[110:] )["utts"] # 110 is the number of characters for the above WARNING LINE. else: train_json = json.loads(content)["utts"] # json.load(f)["utts"] js = train_json # json.load(f)["utts"] new_js = {} with torch.no_grad(): for idx, name in enumerate(js.keys(), 1): logging.info("(%d/%d) decoding " + name, idx, len(js.keys())) batch = [(name, js[name])] feat = load_inputs_and_targets(batch)[0][0] enc = model.encode( torch.as_tensor(feat).to(device=device, dtype=dtype)) nbest_hyps = beam_search(x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio) nbest_hyps = [ h.asdict() for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)] ] new_js[name] = add_results_to_json(js[name], nbest_hyps, train_args.char_list) with open(args.result_label, "wb") as f: f.write( json.dumps({ "utts": new_js }, indent=4, ensure_ascii=False, sort_keys=True).encode("utf_8"))
def recog_v2(args): """Decode with custom models that implements ScorerInterface. Notes: The previous backend espnet.asr.pytorch_backend.asr.recog only supports E2E and RNNLM Args: args (namespace): The program arguments. See py:func:`espnet.bin.asr_recog.get_parser` for details """ logging.warning("experimental API for custom LMs is selected by --api v2") if args.batchsize > 1: raise NotImplementedError("batch decoding is not implemented") if args.streaming_mode is not None: raise NotImplementedError("streaming mode is not implemented") if args.word_rnnlm: raise NotImplementedError("word LM is not implemented") set_deterministic_pytorch(args) model, train_args = load_trained_model(args.model) assert isinstance(model, ASRInterface) model.eval() load_inputs_and_targets = LoadInputsAndTargets( mode='asr', load_output=False, sort_in_input_length=False, preprocess_conf=train_args.preprocess_conf if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={'train': False}) if args.rnnlm: lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) # NOTE: for a compatibility with less than 0.5.0 version models lm_model_module = getattr(lm_args, "model_module", "default") lm_class = dynamic_import_lm(lm_model_module, lm_args.backend) lm = lm_class(len(train_args.char_list), lm_args) torch_load(args.rnnlm, lm) lm.eval() else: lm = None scorers = model.scorers() scorers["lm"] = lm scorers["length_bonus"] = LengthBonus(len(train_args.char_list)) weights = dict(decoder=1.0 - args.ctc_weight, ctc=args.ctc_weight, lm=args.lm_weight, length_bonus=args.penalty) beam_search = BeamSearch( beam_size=args.beam_size, vocab_size=len(train_args.char_list), weights=weights, scorers=scorers, sos=model.sos, eos=model.eos, token_list=train_args.char_list, ) if args.ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") if args.ngpu == 1: device = "cuda" else: device = "cpu" dtype = getattr(torch, args.dtype) logging.info(f"Decoding device={device}, dtype={dtype}") model.to(device=device, dtype=dtype).eval() beam_search.to(device=device, dtype=dtype).eval() # read json data with open(args.recog_json, 'rb') as f: js = json.load(f)['utts'] new_js = {} with torch.no_grad(): for idx, name in enumerate(js.keys(), 1): logging.info('(%d/%d) decoding ' + name, idx, len(js.keys())) batch = [(name, js[name])] feat = load_inputs_and_targets(batch)[0][0] enc = model.encode( torch.as_tensor(feat).to(device=device, dtype=dtype)) print(enc.shape) print(model) nbest_hyps = beam_search(x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio) nbest_hyps = [ h.asdict() for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)] ] new_js[name] = add_results_to_json(js[name], nbest_hyps, train_args.char_list) with open(args.result_label, 'wb') as f: f.write( json.dumps({ 'utts': new_js }, indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8'))
def train(args): """Train with the given args. :param Namespace args: The program arguments :param type model_class: LMInterface class for training """ model_class = dynamic_import_lm(args.model_module, args.backend) assert issubclass(model_class, LMInterface), "model should implement LMInterface" # display torch version logging.info('torch version = ' + torch.__version__) set_deterministic_pytorch(args) # check cuda and cudnn availability if not torch.cuda.is_available(): logging.warning('cuda is not available') # get special label ids unk = args.char_list_dict['<unk>'] eos = args.char_list_dict['<eos>'] # read tokens as a sequence of sentences val, n_val_tokens, n_val_oovs = load_dataset(args.valid_label, args.char_list_dict, args.dump_hdf5_path) train, n_train_tokens, n_train_oovs = load_dataset(args.train_label, args.char_list_dict, args.dump_hdf5_path) logging.info('#vocab = ' + str(args.n_vocab)) logging.info('#sentences in the training data = ' + str(len(train))) logging.info('#tokens in the training data = ' + str(n_train_tokens)) logging.info('oov rate in the training data = %.2f %%' % (n_train_oovs / n_train_tokens * 100)) logging.info('#sentences in the validation data = ' + str(len(val))) logging.info('#tokens in the validation data = ' + str(n_val_tokens)) logging.info('oov rate in the validation data = %.2f %%' % (n_val_oovs / n_val_tokens * 100)) use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 # Create the dataset iterators batch_size = args.batchsize * max(args.ngpu, 1) if batch_size > args.batchsize: logging.info(f'batch size is automatically increased ({args.batchsize} -> {batch_size})') train_iter = ParallelSentenceIterator(train, batch_size, max_length=args.maxlen, sos=eos, eos=eos, shuffle=not use_sortagrad) val_iter = ParallelSentenceIterator(val, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False) logging.info('#iterations per epoch = ' + str(len(train_iter.batch_indices))) logging.info('#total iterations = ' + str(args.epoch * len(train_iter.batch_indices))) # Prepare an RNNLM model model = model_class(args.n_vocab, args) reporter = Reporter() if args.ngpu > 0: model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu))).cuda() gpu_id = 0 else: gpu_id = -1 setattr(model, "reporter", reporter) # Save model conf to json model_conf = args.outdir + '/model.json' with open(model_conf, 'wb') as f: logging.info('writing a model config file to ' + model_conf) f.write(json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8')) # Set up an optimizer if args.opt == 'sgd': optimizer = torch.optim.SGD(model.parameters(), lr=1.0) elif args.opt == 'adam': optimizer = torch.optim.Adam(model.parameters()) # FIXME: TOO DIRTY HACK setattr(optimizer, "target", reporter) setattr(optimizer, "serialize", lambda s: reporter.serialize(s)) updater = BPTTUpdater(train_iter, model, optimizer, gpu_id, gradclip=args.gradclip) trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.outdir) trainer.extend(LMEvaluator(val_iter, model, reporter, device=gpu_id)) trainer.extend(extensions.LogReport(postprocess=compute_perplexity, trigger=(args.report_interval_iters, 'iteration'))) trainer.extend(extensions.PrintReport( ['epoch', 'iteration', 'main/loss', 'perplexity', 'val_perplexity', 'elapsed_time'] ), trigger=(args.report_interval_iters, 'iteration')) trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters)) # Save best models trainer.extend(torch_snapshot(filename='snapshot.ep.{.updater.epoch}')) trainer.extend(snapshot_object(model, 'rnnlm.model.{.updater.epoch}')) # T.Hori: MinValueTrigger should be used, but it fails when resuming trainer.extend(MakeSymlinkToBestModel('validation/main/loss', 'rnnlm.model')) if use_sortagrad: trainer.extend(ShufflingEnabler([train_iter]), trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, 'epoch')) if args.resume: logging.info('resumed from %s' % args.resume) torch_resume(args.resume, trainer) set_early_stop(trainer, args, is_lm=True) if args.tensorboard_dir is not None and args.tensorboard_dir != "": writer = SummaryWriter(args.tensorboard_dir) trainer.extend(TensorboardLogger(writer), trigger=(args.report_interval_iters, 'iteration')) trainer.run() check_early_stop(trainer, args.epoch) # compute perplexity for test set if args.test_label: logging.info('test the best model') torch_load(args.outdir + '/rnnlm.model.best', model) test = read_tokens(args.test_label, args.char_list_dict) n_test_tokens, n_test_oovs = count_tokens(test, unk) logging.info('#sentences in the test data = ' + str(len(test))) logging.info('#tokens in the test data = ' + str(n_test_tokens)) logging.info('oov rate in the test data = %.2f %%' % (n_test_oovs / n_test_tokens * 100)) test_iter = ParallelSentenceIterator(test, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False) evaluator = LMEvaluator(test_iter, model, reporter, device=gpu_id) result = evaluator() compute_perplexity(result) logging.info(f"test perplexity: {result['perplexity']}")