def _main(args, output_file): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, stream=output_file, ) logger = logging.getLogger('fairseq_cli.generate') utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=eval(args.model_overrides), task=task, ) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=('tqdm' if not args.no_progress_bar else 'none'), ) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(models, args) # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(args) bpe = encoders.build_bpe(args) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True, extra_symbols_to_ignore={ generator.eos, }) src_str = decode_fn(src_str) if has_target: target_str = decode_fn(target_str) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str), file=output_file) if has_target: print('T-{}\t{}'.format(sample_id, target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, extra_symbols_to_ignore={ generator.eos, }) detok_hypo_str = decode_fn(hypo_str) if not args.quiet: score = hypo['score'] / math.log(2) # convert to base 2 # original hypothesis (after tokenization and BPE) print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file) # detokenized hypothesis print('D-{}\t{}\t{}'.format(sample_id, score, detok_hypo_str), file=output_file) print( 'P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), # convert from base e to base 2 hypo['positional_scores'].div_(math.log(2) ).tolist(), ))), file=output_file) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join([ '{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment ])), file=output_file) if args.print_step: print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file) if 'enc_selection' in hypo: print('Menc-{}\t{}'.format(sample_id, hypo['enc_selection']), file=output_file) if 'dec_selection' in hypo: print('Mdec-{}\t{}'.format(sample_id, hypo['dec_selection']), file=output_file) if args.print_attn_confidence: print('C-{}\t{}'.format(sample_id, hypo['enc_self_attn_conf']), file=output_file) if getattr(args, 'retain_iter_history', False): for step, h in enumerate(hypo['history']): _, h_str, _ = utils.post_process_prediction( hypo_tokens=h['tokens'].int().cpu(), src_str=src_str, alignment=None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=None, ) print('E-{}_{}\t{}'.format(sample_id, step, h_str), file=output_file) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) hypo_tokens = tgt_dict.encode_line( detok_hypo_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, detok_hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) progress.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info( 'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: if args.bpe and not args.sacrebleu: if args.remove_bpe: logger.warning( "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. 
Use --sacrebleu for standard 13a BLEU tokenization" ) else: logger.warning( "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization" ) logger.info('Generate {} with beam={}: {}'.format( args.gen_subset, args.beam, scorer.result_string())) return scorer
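# --- Hedged usage sketch (not part of the original file) -------------------
# _main() writes all S-/T-/H-/D-/P- lines to `output_file` and returns the
# scorer, so the entry point only has to decide whether that stream is stdout
# or a results file. A minimal wrapper in that spirit; the `options` helpers
# are the standard fairseq ones, but treat the exact flag names as assumptions
# if your fairseq version differs.
import os
import sys

from fairseq import options


def main(args):
    if getattr(args, 'results_path', None) is not None:
        os.makedirs(args.results_path, exist_ok=True)
        output_path = os.path.join(
            args.results_path, 'generate-{}.txt'.format(args.gen_subset))
        with open(output_path, 'w', buffering=1, encoding='utf-8') as h:
            return _main(args, h)
    return _main(args, sys.stdout)


def cli_main():
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)


if __name__ == '__main__':
    cli_main()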
def _main(args, output_file): logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, stream=output_file, ) logger = logging.getLogger("espresso.dump_posteriors") print_options_meaning_changes(args, logger) utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset split task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Load ensemble logger.info("loading model(s) from {}".format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=eval(args.model_overrides), task=task, suffix=getattr(args, "checkpoint_suffix", ""), ) # Load state prior for cross-entropy trained systems decoding if args.state_prior_file is not None: prior = torch.from_numpy(kaldi_io.read_vec_flt(args.state_prior_file)) else: prior = [] # Optimize ensemble for generation for model in models: model.make_generation_fast_() if args.fp16: model.half() if use_cuda: model.cuda() if isinstance(prior, list) and getattr(model, "state_prior", None) is not None: prior.append(model.state_prior.unsqueeze(0)) if isinstance(prior, list) and len(prior) > 0: prior = torch.cat(prior, 0).mean(0) # average priors across models prior = prior / prior.sum() # re-normalize elif isinstance(prior, list): prior = None if prior is not None: if args.fp16: prior = prior.half() if use_cuda: prior = prior.cuda() log_prior = prior.log() else: log_prior = None # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[ model.max_positions() if hasattr(model, "encoder") else (None, model.max_positions()) for model in models ]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=("tqdm" if not args.no_progress_bar else "none"), ) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(models, args) # Generate and dump num_sentences = 0 chunk_width = getattr(task, "chunk_width", None) lprobs_wspecifier = "ark:| copy-matrix ark:- ark:-" with kaldi_io.open_or_fd(lprobs_wspecifier, "wb") as f: if chunk_width is None: # normal dumping (i.e., no chunking) for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue gen_timer.start() lprobs, padding_mask = task.inference_step( generator, models, sample) if log_prior is not None: assert lprobs.size(-1) == log_prior.size(0) lprobs = lprobs - log_prior out_lengths = (~padding_mask).long().sum( dim=1).cpu() if padding_mask is not None else None num_processed_frames = sample["ntokens"] gen_timer.stop(num_processed_frames) num_sentences += sample["nsentences"] if out_lengths is not None: for i in range(sample["nsentences"]): length = out_lengths[i] kaldi_io.write_mat(f, lprobs[i, :length, :].cpu().numpy(), key=sample["utt_id"][i]) else: for i in range(sample["nsentences"]): kaldi_io.write_mat(f, lprobs[i, :, :].cpu().numpy(), key=sample["utt_id"][i]) else: # 
dumping chunks within the same utterance from left to right for sample in progress: # sample is actually a list of batches sample = utils.move_to_cuda(sample) if use_cuda else sample utt_id = sample[0]["utt_id"] id = sample[0]["id"] whole_lprobs = None for i, chunk_sample in enumerate(sample): if "net_input" not in chunk_sample: continue assert chunk_sample["utt_id"] == utt_id and ( chunk_sample["id"] == id).all() gen_timer.start() lprobs, _ = task.inference_step(generator, models, chunk_sample) if log_prior is not None: assert lprobs.size(-1) == log_prior.size(0) lprobs = lprobs - log_prior if whole_lprobs is None: whole_lprobs = lprobs.cpu() else: whole_lprobs = torch.cat((whole_lprobs, lprobs.cpu()), 1) num_processed_frames = chunk_sample["ntokens"] gen_timer.stop(num_processed_frames) if i == len(sample) - 1: num_sentences += len(utt_id) for j in range(len(utt_id)): truncated_length = models[0].output_lengths( task.dataset(args.gen_subset).src_sizes[id[j]] ) # length is after possible subsampling by the model mat = whole_lprobs[j, :truncated_length, :] kaldi_io.write_mat(f, mat.numpy(), key=utt_id[j]) logger.info( "Dumped {} utterances ({} frames) in {:.1f}s ({:.2f} sentences/s, {:.2f} frames/s)" .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) return
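# --- Hedged usage sketch (not part of the original file) -------------------
# The writer above streams each utterance's frame-level log-probabilities
# (already divided by the prior) through `copy-matrix`. If the wspecifier is
# instead pointed at an on-disk archive, e.g. "ark:lprobs.ark" (an assumed
# path, not one this script creates), the matrices can be read back per
# utterance with kaldi_io:
import kaldi_io
import numpy as np

for utt_id, lprobs in kaldi_io.read_mat_ark("ark:lprobs.ark"):
    # lprobs has shape (num_frames, num_states); a frame-level argmax is a
    # quick sanity check before handing the matrices to a Kaldi decoder.
    best_states = np.argmax(lprobs, axis=1)
    print(utt_id, lprobs.shape, best_states[:10])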
def eval_lm( models: List[fairseq.models.FairseqModel], source_dictionary: fairseq.data.Dictionary, batch_iterator: Iterable, post_process: Optional[str] = None, output_word_probs: bool = False, output_word_stats: bool = False, target_dictionary: Optional[fairseq.data.Dictionary] = None, softmax_batch: int = 0, remove_bos_token: bool = False, device: Optional[torch.device] = None, ): """ Args: models (List[~fairseq.models.FairseqModel]): list of models to evaluate. Models are essentially `nn.Module` instances, but must be compatible with fairseq's `SequenceScorer`. source_dictionary (~fairseq.data.Dictionary): dictionary for applying any relevant post processing or outputing word probs/stats. batch_iterator (Iterable): yield batches of data post_process (Optional[str]): post-process text by removing BPE, letter segmentation, etc. Valid options can be found in fairseq.data.utils.post_process, although not all options are implemented here. output_word_probs (Optional[bool]): output words and their predicted log probabilities output_word_stats (Optional[bool]): output word statistics such as word count and average probability target_dictionary (Optional[~fairseq.data.Dictionary]): output dictionary (defaults to *source_dictionary*) softmax_batch (Optional[bool]): if BxT is more than this, will batch the softmax over vocab to this amount of tokens, in order to fit into GPU memory remove_bos_token (Optional[bool]): if True, confirm that the first token is the beginning-of-sentence symbol (according to the relevant dictionary) and remove it from the output device (Optional[torch.device]): device to use for evaluation (defaults to device of first model parameter) """ if target_dictionary is None: target_dictionary = source_dictionary if device is None: device = next(models[0].parameters()).device gen_timer = StopwatchMeter() scorer = SequenceScorer(target_dictionary, softmax_batch) score_sum = 0.0 count = 0 if post_process is not None: if post_process in {"subword_nmt", "@@ "}: bpe_cont = post_process.rstrip() bpe_toks = { i for i in range(len(source_dictionary)) if source_dictionary[i].endswith(bpe_cont) } else: raise NotImplementedError( f"--post-process={post_process} is not implemented" ) bpe_len = len(bpe_cont) else: bpe_toks = None bpe_len = 0 word_stats = dict() for sample in batch_iterator: if "net_input" not in sample: continue sample = utils.move_to_cuda(sample, device=device) gen_timer.start() hypos = scorer.generate(models, sample) gen_timer.stop(sample["ntokens"]) for i, hypos_i in enumerate(hypos): hypo = hypos_i[0] sample_id = sample["id"][i] tokens = hypo["tokens"] tgt_len = tokens.numel() pos_scores = hypo["positional_scores"].float() if remove_bos_token: assert hypo["tokens"][0].item() == target_dictionary.bos() tokens = tokens[1:] pos_scores = pos_scores[1:] skipped_toks = 0 if bpe_toks is not None: for i in range(tgt_len - 1): if tokens[i].item() in bpe_toks: skipped_toks += 1 pos_scores[i + 1] += pos_scores[i] pos_scores[i] = 0 inf_scores = pos_scores.eq(float("inf")) | pos_scores.eq(float("-inf")) if inf_scores.any(): logger.info( "skipping tokens with inf scores:", target_dictionary.string(tokens[inf_scores.nonzero()]), ) pos_scores = pos_scores[(~inf_scores).nonzero()] score_sum += pos_scores.sum().cpu() count += pos_scores.numel() - skipped_toks if output_word_probs or output_word_stats: w = "" word_prob = [] is_bpe = False for i in range(len(tokens)): w_ind = tokens[i].item() w += source_dictionary[w_ind] if bpe_toks is not None and w_ind in bpe_toks: w = w[:-bpe_len] 
is_bpe = True else: word_prob.append((w, pos_scores[i].item())) next_prob = None ind = i + 1 while ind < len(tokens): if pos_scores[ind].item() != 0: next_prob = pos_scores[ind] break ind += 1 word_stats.setdefault(w, WordStat(w, is_bpe)).add( pos_scores[i].item(), next_prob ) is_bpe = False w = "" if output_word_probs: logger.info( str(int(sample_id)) + " " + ( "\t".join( "{} [{:2f}]".format(x[0], x[1]) for x in word_prob ) ) ) avg_nll_loss = ( -score_sum / count / math.log(2) if count > 0 else 0 ) # convert to base 2 logger.info( "Evaluated {:,} tokens in {:.1f}s ({:.2f} tokens/s)".format( gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg if gen_timer.avg > 0 else 0 ) ) if output_word_stats: for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True): logger.info(ws) return { "loss": avg_nll_loss, "perplexity": 2**avg_nll_loss, }
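# --- Hedged numeric check (not part of the original file) ------------------
# score_sum above is a sum of natural-log token probabilities, so dividing the
# average negative log-likelihood by math.log(2) converts it to bits, and the
# returned perplexity is just that loss exponentiated back. The values below
# are made up purely for illustration.
import math

token_logprobs = [-1.2, -0.3, -2.0, -0.7]        # natural-log probabilities
score_sum = sum(token_logprobs)
count = len(token_logprobs)

avg_nll_loss = -score_sum / count / math.log(2)  # bits per token
perplexity = 2 ** avg_nll_loss                   # same as exp(-score_sum / count)

assert abs(perplexity - math.exp(-score_sum / count)) < 1e-9
print('loss (base 2): {:.4f}, perplexity: {:.2f}'.format(avg_nll_loss, perplexity))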
def _main(args, output_file): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, stream=output_file, ) logger = logging.getLogger('fairseq_cli.generate') utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=eval(args.model_overrides), task=task, ) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=('tqdm' if not args.no_progress_bar else 'none'), ) # debug: ahmed def quantize(data, n, max_value=1): scale = ((2**(n) - 1) / 2) / torch.max(torch.abs(data)) # adaptive max #scale = ((2**(n)-1)/2)/max_value # static max (predetermined) return torch.round(scale * data) / scale # quantize model layer by layer to n-bit #print("#########################################") for name, param in model.named_parameters(): if param.requires_grad and ('weight' in name): layer = 'model.' 
+ name #fileName = 'model_wmt14.weights.layers' fileName = 'model_iwslt14.tokenized.de-en.weights.layers' with open(fileName) as f: layersList = f.readlines() layersNamesList = [layerName.rstrip('\n') for layerName in layersList] layer_max_dict = pickle.load(open("layer_max_dict.pkl", "rb")) n = 8 #PRANNOY (type=int) for layer in layersNamesList: print('----------') #print(model.encoder.layers[0].self_attn) print(layer) kernel = eval(layer) max_value = layer_max_dict[layer].item() kernel_q = quantize(kernel, n) # adaptive (on the fly) #kernel_q = quantize(kernel, 8, max_value) # static exec(layer + '=' + 'torch.nn.Parameter(kernel_q)') print(len((eval(layer)).unique())) """ # quantize model layer by layer to n-bit print("#########################################") #print(model.encoder.embed_tokens.weight.shape) fileName = 'model_print.keys.weights.layers' with open(fileName) as f: layersList = f.readlines() layersNamesList = [layerName.rstrip('\n') for layerName in layersList] for layer in layersNamesList: #print(vars(layer).shape) #print(model.encoder.embed_tokens.weight) #print(exec(layer)) #print(globals()[layer]) #print(eval(layer).shape) print('------------') print(layer) kernel = eval(layer) kernel_q = quantize(kernel) #eval(layer) = torch.nn.Parameter(kernel_q) exec(layer + '=' + 'torch.nn.Parameter(kernel_q)') print(len((eval(layer)).unique())) #print(model) #kernel = model.decoder.layers[3].fc1.weight #print(kernel.shape) #print(torch.max(torch.abs(kernel))) #print(kernel[0][0:3]) #print(len(set(model.decoder.layers[3].fc1.weight))) #kernel_q = quantize(kernel) #print(kernel_q[0][0:3]) #model.decoder.layers[3].fc1.weight = torch.nn.Parameter(kernel_q) #print(len((model.decoder.layers[3].fc1.weight).unique())) print("#########################################") """ # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(models, args) # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(args) bpe = encoders.build_bpe(args) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True, extra_symbols_to_ignore={ generator.eos, }) src_str = decode_fn(src_str) if has_target: target_str = decode_fn(target_str) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str), file=output_file) if has_target: print('T-{}\t{}'.format(sample_id, target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, extra_symbols_to_ignore={ generator.eos, }) detok_hypo_str = decode_fn(hypo_str) if not args.quiet: score = hypo['score'] / math.log(2) # convert to base 2 # original hypothesis (after tokenization and BPE) print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file) # detokenized hypothesis print('D-{}\t{}\t{}'.format(sample_id, score, detok_hypo_str), file=output_file) print( 'P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), # convert from base e to base 2 hypo['positional_scores'].div_(math.log(2) ).tolist(), ))), file=output_file) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join([ '{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment ])), file=output_file) if args.print_step: print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file) if getattr(args, 'retain_iter_history', False): for step, h in enumerate(hypo['history']): _, h_str, _ = utils.post_process_prediction( hypo_tokens=h['tokens'].int().cpu(), src_str=src_str, alignment=None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=None, ) print('E-{}_{}\t{}'.format(sample_id, step, h_str), file=output_file) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) hypo_tokens = tgt_dict.encode_line( detok_hypo_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, detok_hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) progress.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info( 'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: if args.bpe and not args.sacrebleu: if args.remove_bpe: logger.warning( "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" ) else: logger.warning( "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. 
Use --sacrebleu for standard 13a BLEU tokenization" ) logger.info('Generate {} with beam={}: {}'.format( args.gen_subset, args.beam, scorer.result_string())) # ahmed: logging with open("infer_BLEU.txt", "a") as myfile: myfile.write(scorer.result_string()) myfile.write("\n") return scorer
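# --- Hedged sketch (not part of the original file) -------------------------
# The quantization experiment above drives per-layer rounding through
# eval()/exec() on layer-name strings loaded from a file, plus a pickled dict
# of per-layer maxima. The same n-bit rounding (the "adaptive max" variant of
# quantize() above) can be applied directly over model.named_parameters();
# this is only a sketch and makes no claim about matching the original
# experiment's layer list.
import torch


def quantize_tensor(data: torch.Tensor, n: int) -> torch.Tensor:
    # Symmetric uniform quantization to roughly 2**n levels, scaled by the
    # tensor's own maximum absolute value.
    scale = ((2 ** n - 1) / 2) / torch.max(torch.abs(data))
    return torch.round(scale * data) / scale


def quantize_model_weights(model: torch.nn.Module, n: int = 8) -> None:
    # Replace every trainable weight tensor in-place with its quantized copy.
    with torch.no_grad():
        for name, param in model.named_parameters():
            if param.requires_grad and 'weight' in name:
                param.copy_(quantize_tensor(param, n))
                # param.unique().numel() is now bounded by the level count.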
class InferenceProcessor: def __init__(self, cfg: InferConfig) -> None: self.cfg = cfg self.task = tasks.setup_task(cfg.task) self.tgt_dict = self.task.target_dictionary models, saved_cfg = self.load_model_ensemble() self.models = models self.saved_cfg = saved_cfg self.task.load_dataset( self.cfg.dataset.gen_subset, task_cfg=saved_cfg.task, ) self.generator = Decoder(cfg.decoding.decoder, self.tgt_dict) self.gen_timer = StopwatchMeter() self.wps_meter = TimeMeter() self.num_sentences = 0 self.total_errors = 0 self.total_length = 0 self.hypo_words_file = None self.hypo_units_file = None self.ref_words_file = None self.ref_units_file = None self.progress_bar = self.build_progress_bar() def __enter__(self) -> "InferenceProcessor": if self.cfg.decoding.write_sentences: self.hypo_words_file = self.get_res_file("hypo.word") self.hypo_units_file = self.get_res_file("hypo.units") self.ref_words_file = self.get_res_file("ref.word") self.ref_units_file = self.get_res_file("ref.units") return self def __exit__(self, *exc) -> bool: if self.cfg.decoding.write_sentences: self.hypo_words_file.close() self.hypo_units_file.close() self.ref_words_file.close() self.ref_units_file.close() return False def __iter__(self) -> Any: for sample in self.progress_bar: if not self.cfg.common.cpu: sample = utils.move_to_cuda(sample) # Happens on the last batch. if "net_input" not in sample: continue yield sample def log(self, *args, **kwargs): self.progress_bar.log(*args, **kwargs) def print(self, *args, **kwargs): self.progress_bar.print(*args, **kwargs) def get_res_file(self, fname: str) -> None: if self.data_parallel_world_size > 1: fname = f"{fname}.{self.data_parallel_rank}" return open(fname, "w", buffering=1) def merge_shards(self) -> None: """Merges all shard files into shard 0, then removes shard suffix.""" shard_id = self.data_parallel_rank num_shards = self.data_parallel_world_size def merge_shards_with_root(fname: str) -> None: logger.info("Merging %s on shard %d", fname, shard_id) base_fpath = Path(f"{fname}.0") with open(base_fpath, "a") as out_file: for s in range(1, num_shards): shard_fpath = Path(f"{fname}.{s}") with open(shard_fpath, "r") as in_file: for line in in_file: out_file.write(line) shard_fpath.unlink() shutil.move(f"{fname}.0", fname) if shard_id == (0 % num_shards): merge_shards_with_root("hypo.word") if shard_id == (1 % num_shards): merge_shards_with_root("hypo.units") if shard_id == (2 % num_shards): merge_shards_with_root("ref.word") if shard_id == (3 % num_shards): merge_shards_with_root("ref.units") dist.barrier() def optimize_model(self, model: FairseqModel) -> None: gcfg = self.cfg.generation model.make_generation_fast_( beamable_mm_beam_size=None if gcfg.no_beamable_mm else gcfg.beam, need_attn=gcfg.print_alignment, ) if self.cfg.common.fp16: model.half() if not self.cfg.common.cpu: model.cuda() def load_model_ensemble( self) -> Tuple[List[FairseqModel], FairseqDataclass]: arg_overrides = ast.literal_eval(self.cfg.common_eval.model_overrides) models, saved_cfg = checkpoint_utils.load_model_ensemble( utils.split_paths(self.cfg.common_eval.path), arg_overrides=arg_overrides, task=self.task, suffix=self.cfg.checkpoint.checkpoint_suffix, strict=(self.cfg.checkpoint.checkpoint_shard_count == 1), num_shards=self.cfg.checkpoint.checkpoint_shard_count, ) for model in models: self.optimize_model(model) return models, saved_cfg def get_dataset_itr(self, disable_iterator_cache: bool = False) -> None: return self.task.get_batch_iterator( dataset=self.task.dataset(self.cfg.dataset.gen_subset), 
max_tokens=self.cfg.dataset.max_tokens, max_sentences=self.cfg.dataset.batch_size, max_positions=(sys.maxsize, sys.maxsize), ignore_invalid_inputs=self.cfg.dataset. skip_invalid_size_inputs_valid_test, required_batch_size_multiple=self.cfg.dataset. required_batch_size_multiple, seed=self.cfg.common.seed, num_shards=self.data_parallel_world_size, shard_id=self.data_parallel_rank, num_workers=self.cfg.dataset.num_workers, data_buffer_size=self.cfg.dataset.data_buffer_size, disable_iterator_cache=disable_iterator_cache, ).next_epoch_itr(shuffle=False) def build_progress_bar( self, epoch: Optional[int] = None, prefix: Optional[str] = None, default_log_format: str = "tqdm", ) -> BaseProgressBar: return progress_bar.progress_bar( iterator=self.get_dataset_itr(), log_format=self.cfg.common.log_format, log_interval=self.cfg.common.log_interval, epoch=epoch, prefix=prefix, tensorboard_logdir=self.cfg.common.tensorboard_logdir, default_log_format=default_log_format, ) @property def data_parallel_world_size(self): if self.cfg.distributed_training.distributed_world_size == 1: return 1 return distributed_utils.get_data_parallel_world_size() @property def data_parallel_rank(self): if self.cfg.distributed_training.distributed_world_size == 1: return 0 return distributed_utils.get_data_parallel_rank() def process_sentence( self, sample: Dict[str, Any], hypo: Dict[str, Any], sid: int, batch_id: int, ) -> Tuple[int, int]: speaker = None # Speaker can't be parsed from dataset. if "target_label" in sample: toks = sample["target_label"] else: toks = sample["target"] toks = toks[batch_id, :] # Processes hypothesis. hyp_pieces = self.tgt_dict.string(hypo["tokens"].int().cpu()) if "words" in hypo: hyp_words = " ".join(hypo["words"]) else: hyp_words = post_process(hyp_pieces, self.cfg.common_eval.post_process) # Processes target. target_tokens = utils.strip_pad(toks, self.tgt_dict.pad()) tgt_pieces = self.tgt_dict.string(target_tokens.int().cpu()) tgt_words = post_process(tgt_pieces, self.cfg.common_eval.post_process) if self.cfg.decoding.write_sentences: print(f"{hyp_pieces} ({speaker}-{sid})", file=self.hypo_units_file) print(f"{hyp_words} ({speaker}-{sid})", file=self.hypo_words_file) print(f"{tgt_pieces} ({speaker}-{sid})", file=self.ref_units_file) print(f"{tgt_words} ({speaker}-{sid})", file=self.ref_words_file) hyp_words, tgt_words = hyp_words.split(), tgt_words.split() return editdistance.eval(hyp_words, tgt_words), len(tgt_words) def process_sample(self, sample: Dict[str, Any]) -> None: self.gen_timer.start() hypos = self.task.inference_step( generator=self.generator, models=self.models, sample=sample, ) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) self.gen_timer.stop(num_generated_tokens) self.wps_meter.update(num_generated_tokens) for batch_id, sample_id in enumerate(sample["id"].tolist()): errs, length = self.process_sentence( sample=sample, sid=sample_id, batch_id=batch_id, hypo=hypos[batch_id][0], ) self.total_errors += errs self.total_length += length self.log({"wps": round(self.wps_meter.avg)}) if "nsentences" in sample: self.num_sentences += sample["nsentences"] else: self.num_sentences += sample["id"].numel() def log_generation_time(self) -> None: logger.info( "Processed %d sentences (%d tokens) in %.1fs %.2f " "sentences per second, %.2f tokens per second)", self.num_sentences, self.gen_timer.n, self.gen_timer.sum, self.num_sentences / self.gen_timer.sum, 1.0 / self.gen_timer.avg)
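# --- Hedged usage sketch (not part of the original file) -------------------
# InferenceProcessor is written to be driven as a context manager plus an
# iterator, so the calling code stays short. The wrapper name run_inference()
# and the merge guard below are assumptions; only the methods it calls come
# from the class above.
def run_inference(cfg) -> float:
    with InferenceProcessor(cfg) as processor:
        for sample in processor:
            processor.process_sample(sample)

        processor.log_generation_time()

        # Sharded result files only exist when sentences were written and
        # there is more than one data-parallel worker.
        if cfg.decoding.write_sentences and processor.data_parallel_world_size > 1:
            processor.merge_shards()

        errors, length = processor.total_errors, processor.total_length
        wer = 100.0 * errors / length if length > 0 else 0.0
        logger.info("WER: %.2f%%", wer)
        return wer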
def main(cfg: DictConfig, override_args=None, **unused_kwargs): if isinstance(cfg, Namespace): cfg = convert_namespace_to_omegaconf(cfg) utils.import_user_module(cfg.common) use_fp16 = cfg.common.fp16 use_cuda = torch.cuda.is_available() and not cfg.common.cpu if use_cuda: torch.cuda.set_device(cfg.distributed_training.device_id) if override_args is not None: overrides = vars(override_args) overrides.update(eval(getattr(override_args, "model_overrides", "{}"))) else: overrides = None logger.info(cfg) # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) # reduce tokens per sample by the required context window size cfg.task.tokens_per_sample -= cfg.eval_lm.context_window models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( [cfg.common_eval.path], arg_overrides=overrides, suffix=cfg.checkpoint.checkpoint_suffix, strict=(cfg.checkpoint.checkpoint_shard_count == 1), num_shards=cfg.checkpoint.checkpoint_shard_count, ) # Load dataset splits gen_subset = cfg.dataset.gen_subset task.load_dataset(gen_subset) dataset = task.dataset(gen_subset) if cfg.eval_lm.context_window > 0: dataset = LMContextWindowDataset( dataset=dataset, tokens_per_sample=cfg.task.tokens_per_sample, context_window=cfg.eval_lm.context_window, pad_idx=task.source_dictionary.pad(), ) logger.info("{} {} {} examples".format(cfg.task.data, gen_subset, len(dataset))) # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer) for model in models: if use_fp16: model.half() if use_cuda and not cfg.distributed_training.pipeline_model_parallel: model.cuda() model.prepare_for_inference_(cfg) assert len(models) > 0 logger.info( "num. model params: {}".format(sum(p.numel() for p in models[0].parameters())) ) itr = task.get_batch_iterator( dataset=dataset, max_tokens=cfg.dataset.max_tokens or 36000, max_sentences=cfg.dataset.batch_size, max_positions=utils.resolve_max_positions( *[model.max_positions() for model in models] ), ignore_invalid_inputs=True, num_shards=max( cfg.dataset.num_shards, cfg.distributed_training.distributed_world_size, ), shard_id=max( cfg.dataset.shard_id, cfg.distributed_training.distributed_rank, ), num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), ) gen_timer = StopwatchMeter() scorer = SequenceScorer(task.target_dictionary, cfg.eval_lm.softmax_batch) score_sum = 0.0 count = 0 if cfg.common_eval.remove_bpe is not None: if cfg.common_eval.remove_bpe == "sentencepiece": raise NotImplementedError else: bpe_cont = cfg.common_eval.remove_bpe.rstrip() bpe_toks = { i for i in range(len(task.source_dictionary)) if task.source_dictionary[i].endswith(bpe_cont) } bpe_len = len(bpe_cont) else: bpe_toks = None bpe_len = 0 word_stats = dict() wps_meter = TimeMeter() for sample in progress: if "net_input" not in sample: continue sample = utils.move_to_cuda(sample) if use_cuda else sample gen_timer.start() hypos = scorer.generate(models, sample) gen_timer.stop(sample["ntokens"]) for i, hypos_i in enumerate(hypos): hypo = hypos_i[0] sample_id = sample["id"][i] tokens = hypo["tokens"] tgt_len = tokens.numel() pos_scores = hypo["positional_scores"].float() if cfg.task.add_bos_token: assert hypo["tokens"][0].item() == task.target_dictionary.bos() tokens = tokens[1:] pos_scores = 
pos_scores[1:] skipped_toks = 0 if bpe_toks is not None: for i in range(tgt_len - 1): if tokens[i].item() in bpe_toks: skipped_toks += 1 pos_scores[i + 1] += pos_scores[i] pos_scores[i] = 0 inf_scores = pos_scores.eq(float("inf")) | pos_scores.eq(float("-inf")) if inf_scores.any(): logger.info( "skipping tokens with inf scores:", task.target_dictionary.string(tokens[inf_scores.nonzero()]), ) pos_scores = pos_scores[(~inf_scores).nonzero()] score_sum += pos_scores.sum().cpu() count += pos_scores.numel() - skipped_toks if cfg.eval_lm.output_word_probs or cfg.eval_lm.output_word_stats: w = "" word_prob = [] is_bpe = False for i in range(len(tokens)): w_ind = tokens[i].item() w += task.source_dictionary[w_ind] if bpe_toks is not None and w_ind in bpe_toks: w = w[:-bpe_len] is_bpe = True else: word_prob.append((w, pos_scores[i].item())) next_prob = None ind = i + 1 while ind < len(tokens): if pos_scores[ind].item() != 0: next_prob = pos_scores[ind] break ind += 1 word_stats.setdefault(w, WordStat(w, is_bpe)).add( pos_scores[i].item(), next_prob ) is_bpe = False w = "" if cfg.eval_lm.output_word_probs: logger.info( str(int(sample_id)) + " " + ( "\t".join( "{} [{:2f}]".format(x[0], x[1]) for x in word_prob ) ) ) wps_meter.update(sample["ntokens"]) progress.log({"wps": round(wps_meter.avg)}) avg_nll_loss = -score_sum / count / math.log(2) # convert to base 2 logger.info( "Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)".format( gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg ) ) logger.info( "Loss (base 2): {:.4f}, Perplexity: {:.2f}".format( avg_nll_loss, 2 ** avg_nll_loss ) ) if cfg.eval_lm.output_word_stats: for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True): logger.info(ws)
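# --- Hedged toy example (not part of the original file) --------------------
# Both this function and eval_lm() above fold subword scores into word scores
# the same way: a piece that ends with the BPE continuation marker passes its
# log-probability forward, so the finished word's score sits on its final
# subword. Pieces and scores below are made up purely for illustration.
pieces = ['new@@', 'spaper', 'is', 'out']    # "newspaper is out"
piece_scores = [-1.0, -0.5, -0.2, -0.3]      # per-piece log-probs
bpe_cont = '@@'

for i in range(len(pieces) - 1):
    if pieces[i].endswith(bpe_cont):
        piece_scores[i + 1] += piece_scores[i]
        piece_scores[i] = 0.0

word, word_probs = '', []
for piece, score in zip(pieces, piece_scores):
    if piece.endswith(bpe_cont):
        word += piece[:-len(bpe_cont)]       # still inside a word
    else:
        word += piece
        word_probs.append((word, score))
        word = ''

print(word_probs)   # [('newspaper', -1.5), ('is', -0.2), ('out', -0.3)]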
def _main(args, output_file): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, stream=output_file, ) logger = logging.getLogger('espresso.speech_recognize') if output_file is not sys.stdout: # also print to stdout logger.addHandler(logging.StreamHandler(sys.stdout)) print_options_meaning_changes(args, logger) utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset split task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionary dictionary = task.target_dictionary # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=eval(args.model_overrides), task=task, ) for i, m in enumerate(models): if hasattr(m, 'is_wordlm') and m.is_wordlm: # assume subword LM comes before word LM if isinstance(models[i - 1], FairseqLanguageModel): models[i - 1] = MultiLevelLanguageModel( m, models[i - 1], subwordlm_weight=args.subwordlm_weight, oov_penalty=args.oov_penalty, open_vocab=not args.disable_open_vocab, ) del models[i] logger.info('LM fusion with Multi-level LM') else: models[i] = TensorizedLookaheadLanguageModel( m, dictionary, oov_penalty=args.oov_penalty, open_vocab=not args.disable_open_vocab, ) logger.info('LM fusion with Look-ahead Word LM') # assume subword LM comes after E2E models elif i == len(models) - 1 and isinstance(m, FairseqLanguageModel): logger.info('LM fusion with Subword LM') if args.lm_weight != 0.0: logger.info('using LM fusion with lm-weight={:.2f}'.format( args.lm_weight)) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[ model.max_positions() if hasattr(model, 'encoder') else (None, model.max_positions()) for model in models ]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=('tqdm' if not args.no_progress_bar else 'none'), ) # Initialize generator if args.match_source_len: logger.warning( 'The option match_source_len is not applicable to speech recognition. Ignoring it.' 
) gen_timer = StopwatchMeter() generator = task.build_generator(args) # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(args) bpe = encoders.build_bpe(args) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x # Generate and compute WER scorer = wer.Scorer(dictionary, wer_output_filter=args.wer_output_filter) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step( generator, models, sample, prefix_tokens, lm_weight=args.lm_weight, ) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) # obtain nonpad mask of encoder output to plot attentions if args.print_alignment: net_input = sample['net_input'] src_tokens = net_input['src_tokens'] output_lengths = models[0].encoder.output_lengths( net_input['src_lengths']) nonpad_idxs = sequence_mask( output_lengths, models[0].encoder.output_lengths(src_tokens.size(1))) for i in range(len(sample['id'])): has_target = sample['target'] is not None utt_id = sample['utt_id'][i] # Retrieve the original sentences if has_target: target_str = sample['target_raw_text'][i] if not args.quiet: detok_target_str = decode_fn(target_str) print('T-{}\t{}'.format(utt_id, detok_target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_str = dictionary.string( hypo['tokens'].int().cpu(), bpe_symbol=None, extra_symbols_to_ignore={dictionary.pad()}, ) # not removing bpe at this point detok_hypo_str = decode_fn(hypo_str) if not args.quiet: score = hypo['score'] / math.log(2) # convert to base 2 print('H-{}\t{}\t{}'.format(utt_id, detok_hypo_str, score), file=output_file) # Score and obtain attention only the top hypothesis if j == 0: # src_len x tgt_len attention = hypo['attention'][nonpad_idxs[i]].float().cpu() \ if args.print_alignment and hypo['attention'] is not None else None if args.print_alignment and attention is not None: save_dir = os.path.join(args.results_path, 'attn_plots') os.makedirs(save_dir, exist_ok=True) plot_attention(attention, detok_hypo_str, utt_id, save_dir) scorer.add_prediction(utt_id, hypo_str) if has_target: scorer.add_evaluation(utt_id, target_str, hypo_str) wps_meter.update(num_generated_tokens) progress.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info( 'Recognized {} utterances ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. 
/ gen_timer.avg)) if args.print_alignment: logger.info('Saved attention plots in ' + save_dir) if has_target: scorer.add_ordered_utt_list(task.datasets[args.gen_subset].tgt.utt_ids) fn = 'decoded_char_results.txt' with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f: f.write(scorer.print_char_results()) logger.info('Decoded char results saved as ' + f.name) fn = 'decoded_results.txt' with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f: f.write(scorer.print_results()) logger.info('Decoded results saved as ' + f.name) if has_target: header = 'Recognize {} with beam={}: '.format(args.gen_subset, args.beam) fn = 'wer' with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f: res = 'WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format( *(scorer.wer())) logger.info(header + res) f.write(res + '\n') logger.info('WER saved in ' + f.name) fn = 'cer' with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f: res = 'CER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format( *(scorer.cer())) logger.info(' ' * len(header) + res) f.write(res + '\n') logger.info('CER saved in ' + f.name) fn = 'aligned_results.txt' with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f: f.write(scorer.print_aligned_results()) logger.info('Aligned results saved as ' + f.name) return scorer
def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate_multitask_classifier')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()

    num_sentences = 0
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        gen_timer.start()
        hypos = generate_probs(models, sample).cpu()
        gen_timer.stop(1)

        for i, sample_id in enumerate(sample['id'].tolist()):
            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, sample['auxiliary_target'][i][0]),
                      file=output_file)
                print('H-{}\t{}\t{}'.format(sample_id, hypos[i].argmax(), hypos[i]),
                      file=output_file)

        num_sentences += sample['nsentences']

    logger.info(
        'Predicted {} sentences in {:.1f}s ({:.2f} sentences/s)'.format(
            num_sentences, gen_timer.sum, num_sentences / gen_timer.sum))
def eval_bleu_score(cfg: DictConfig, model, split='test'): task = tasks.setup_task(cfg.task) model.prepare_for_inference_(cfg) task.load_dataset(split, task_cfg=cfg.task) # Set dictionaries try: src_dict = getattr(task, "source_dictionary", None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary align_dict = utils.load_align_dict(cfg.generation.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(split), max_tokens=cfg.dataset.max_tokens, max_sentences=cfg.dataset.batch_size, max_positions=utils.resolve_max_positions(task.max_positions(), *[model.max_positions()]), ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, seed=cfg.common.seed, num_shards=cfg.distributed_training.distributed_world_size, shard_id=cfg.distributed_training.distributed_rank, num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), ) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator([model], cfg.generation) # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(cfg.tokenizer) bpe = encoders.build_bpe(cfg.bpe) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x scorer = scoring.build_scorer(cfg.scoring, tgt_dict) use_cuda = torch.cuda.is_available() and not cfg.common.cpu has_target = True for sample in tqdm(progress): sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if cfg.generation.prefix_size > 0: prefix_tokens = sample["target"][:, :cfg.generation.prefix_size] constraints = None if "constraints" in sample: constraints = sample["constraints"] gen_timer.start() hypos = task.inference_step( generator, model, sample, prefix_tokens=prefix_tokens, constraints=constraints, ) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample["id"].tolist()): has_target = sample["target"] is not None # Remove padding if "src_tokens" in sample["net_input"]: src_tokens = utils.strip_pad( sample["net_input"]["src_tokens"][i, :], tgt_dict.pad()) else: src_tokens = None target_tokens = None if has_target: target_tokens = (utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu()) # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset(split).src.get_original_text(sample_id) target_str = task.dataset(split).tgt.get_original_text( sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) else: src_str = "" if has_target: target_str = tgt_dict.string( target_tokens, cfg.common_eval.post_process, escape_unk=True, extra_symbols_to_ignore= get_symbols_to_strip_from_output(generator), ) src_str = decode_fn(src_str) if has_target: target_str = decode_fn(target_str) # Process top predictions for j, hypo in enumerate(hypos[i][:cfg.generation.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo["tokens"].int().cpu(), src_str=src_str, alignment=hypo["alignment"], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=cfg.common_eval.post_process, extra_symbols_to_ignore=get_symbols_to_strip_from_output( generator), ) detok_hypo_str = decode_fn(hypo_str) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or cfg.common_eval.post_process is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) hypo_tokens = tgt_dict.encode_line( detok_hypo_str, add_if_not_exist=True) if hasattr(scorer, "add_string"): scorer.add_string(target_str, detok_hypo_str) else: scorer.add(target_tokens, hypo_tokens) if has_target: # use print to be consistent with other main outputs: S-, H-, T-, D- and so on print("Generate {} with beam={}: {}".format(split, cfg.generation.beam, scorer.result_string())) return scorer.score()
def eval_bleu_score(cfg: DictConfig, saved_cfg, models, gates, lms, use_cuda, num_of_heads): # Load dataset splits task = tasks.setup_task(cfg.task) # Set dictionaries try: src_dict = getattr(task, "source_dictionary", None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary head_mask = convert_gate_to_mask(gates, num_of_heads) head_mask = utils.move_to_cuda(head_mask) if use_cuda else head_mask models[0].apply_masks(head_mask) # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(cfg.generation.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(cfg.dataset.gen_subset), max_tokens=cfg.dataset.max_tokens, max_sentences=cfg.dataset.batch_size, max_positions=utils.resolve_max_positions( task.max_positions(), *[m.max_positions() for m in models]), ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, seed=cfg.common.seed, num_shards=cfg.distributed_training.distributed_world_size, shard_id=cfg.distributed_training.distributed_rank, num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), ) # Initialize generator gen_timer = StopwatchMeter() extra_gen_cls_kwargs = { "lm_model": lms[0], "lm_weight": cfg.generation.lm_weight } generator = task.build_generator(models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs) # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(cfg.tokenizer) bpe = encoders.build_bpe(cfg.bpe) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x scorer = scoring.build_scorer(cfg.scoring, tgt_dict) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if cfg.generation.prefix_size > 0: prefix_tokens = sample["target"][:, :cfg.generation.prefix_size] constraints = None if "constraints" in sample: constraints = sample["constraints"] gen_timer.start() hypos = task.inference_step( generator, models, sample, prefix_tokens=prefix_tokens, constraints=constraints, ) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample["id"].tolist()): has_target = sample["target"] is not None # Remove padding if "src_tokens" in sample["net_input"]: src_tokens = utils.strip_pad( sample["net_input"]["src_tokens"][i, :], tgt_dict.pad()) else: src_tokens = None target_tokens = None if has_target: target_tokens = (utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu()) # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( cfg.dataset.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( cfg.dataset.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) else: src_str = "" if has_target: target_str = tgt_dict.string( target_tokens, cfg.common_eval.post_process, escape_unk=True, extra_symbols_to_ignore= get_symbols_to_strip_from_output(generator), ) src_str = decode_fn(src_str) if has_target: target_str = decode_fn(target_str) # Process top predictions for j, hypo in enumerate(hypos[i][:cfg.generation.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo["tokens"].int().cpu(), src_str=src_str, alignment=hypo["alignment"], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=cfg.common_eval.post_process, extra_symbols_to_ignore=get_symbols_to_strip_from_output( generator), ) detok_hypo_str = decode_fn(hypo_str) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or cfg.common_eval.post_process is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) hypo_tokens = tgt_dict.encode_line( detok_hypo_str, add_if_not_exist=True) if hasattr(scorer, "add_string"): scorer.add_string(target_str, detok_hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) progress.log({"wps": round(wps_meter.avg)}) num_sentences += (sample["nsentences"] if "nsentences" in sample else sample["id"].numel()) return scorer.score()
def _main(args, output_file): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, stream=output_file, ) logger = logging.getLogger('fairseq_cli.predict') utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) # Fix seed for stochastic decoding if args.seed is not None and not args.no_seed_provided: np.random.seed(args.seed) utils.set_torch_seed(args.seed) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries src_dict = getattr(task, 'source_dictionary', None) label_dict = task.label_dictionary # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=eval(args.model_overrides), task=task, suffix=getattr(args, "checkpoint_suffix", ""), ) # Optimize ensemble for generation for model in models: model.prepare_for_inference_(args) if args.fp16: model.half() if use_cuda: model.cuda() # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=('tqdm' if not args.no_progress_bar else 'none'), ) # Initialize generator gen_timer = StopwatchMeter() # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(args) bpe = encoders.build_bpe(args) # label_fn = lambda label: label_dict.string( # [label + label_dict.nspecial] # ) def decode_fn(x): # decode tag if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x num_sentences = 0 num_correct = 0 wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue gen_timer.start() hypos = task.inference_step(models, sample, args.classification_head_name) num_correct += (hypos['labels'] == sample['target'].view(-1)).sum() num_generated_tokens = sample['ntokens'] gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): hypo = hypos['labels'][i] score = hypos['scores'][i] hypo_str = label_dict[hypo + label_dict.nspecial] has_target = sample['target'] is not None # Remove padding if 'src_tokens' in sample['net_input']: src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], src_dict.pad()) src_str = src_dict.string(src_tokens, args.remove_bpe) if has_target: label_str = label_dict[sample['target'][i] + label_dict.nspecial] src_str = decode_fn(src_str) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str), file=output_file) if has_target: print('T-{}\t{}'.format(sample_id, label_str), file=output_file) print('H-{}\t{}\t{:.3f}'.format(sample_id, hypo_str, score.item()), file=output_file) wps_meter.update(num_generated_tokens) progress.log({'wps': round(wps_meter.avg)}) num_sentences += sample[ 
"nsentences"] if "nsentences" in sample else sample['id'].numel() logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info( 'Predicted {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) logger.info('Accuracy {}/{}={}'.format(num_correct, num_sentences, num_correct / num_sentences))
def _main(args, output_file): for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, stream=output_file, ) logger = logging.getLogger("fastseq.optimizer.fairseq.generate") utils.import_user_module(args) if args.max_tokens is None and args.batch_size is None: args.max_tokens = 12000 logger.info(args) # Fix seed for stochastic decoding if args.seed is not None and not args.no_seed_provided: np.random.seed(args.seed) utils.set_torch_seed(args.seed) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) overrides = ast.literal_eval(args.model_overrides) # Load ensemble logger.info("loading model(s) from {}".format(args.path)) models, _ = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=overrides, task=task, suffix=getattr(args, "checkpoint_suffix", ""), strict=(args.checkpoint_shard_count == 1), num_shards=args.checkpoint_shard_count, ) if args.lm_path is not None: overrides["data"] = args.data try: lms, _ = checkpoint_utils.load_model_ensemble( [args.lm_path], arg_overrides=overrides, task=None, ) except: logger.warning( "Failed to load language model! Please make sure that the language model dict is the same as target dict and is located in the data dir ({})" .format(args.data)) raise assert len(lms) == 1 else: lms = [None] # Optimize ensemble for generation for model in chain(models, lms): if model is None: continue if args.fp16: model.half() if use_cuda and not args.pipeline_model_parallel: model.cuda() model.prepare_for_inference_(args) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.batch_size, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, data_buffer_size=args.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=("tqdm" if not args.no_progress_bar else "none"), ) # Initialize generator gen_timer = StopwatchMeter() extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": args.lm_weight} generator = task.build_generator(models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs) num_sentences = 0 data_queue = Queue() message_queue = JoinableQueue() p_list = [] for _ in range(args.postprocess_workers): p = PostProcess(args, task, data_queue, message_queue, generator) p_list.append(p) p.start() io_process = IOProcess(args, task, message_queue, output_file) io_process.start() if args.use_el_attn: task.transpose_enc_dec_kv_proj(models) wps_meter = TimeMeter() for sample in progress: cpu_sample = sample if 'net_input' not in sample: continue sample = utils.move_to_cuda(sample) if use_cuda else sample prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] constraints = None if "constraints" in sample: constraints = sample["constraints"] gen_timer.start() try: hypos = task.inference_step(generator, models, sample, prefix_tokens, constraints) except: logging.exception(sys.exc_info()[0]) for p in 
p_list: p.terminate() io_process.terminate() data_queue.close() message_queue.close() sys.exit(1) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) hypos = [h[:args.nbest] for h in hypos] hypos = move_to_cpu(hypos) if use_cuda else hypos data_queue.put((cpu_sample, hypos)) wps_meter.update(num_generated_tokens) progress.log({'wps': round(wps_meter.avg)}) num_sentences += (cpu_sample['nsentences'] if "nsentences" in cpu_sample else cpu_sample["id"].numel()) data_queue.put(GENERATE_FINISHED) for p in p_list: p.join() message_queue.put(GENERATE_FINISHED) io_process.join() sent_through = num_sentences / gen_timer.sum if num_sentences > 0 else 0 tokens_through = 1. / gen_timer.avg if num_sentences > 0 else 0 logger.info("NOTE: hypothesis and token scores are output in base 2") logger.info( "Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)" .format( num_sentences, gen_timer.n, gen_timer.sum, sent_through, tokens_through, )) return
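# The PostProcess / IOProcess machinery above is a standard producer/consumer layout:
# the main process puts (sample, hypos) pairs on a data queue, several post-process
# workers turn them into text and push lines onto a message queue, and a single IO
# process writes everything out; sentinel values shut the pipeline down. Below is a
# stripped-down sketch with illustrative names (plain strings stand in for fairseq
# samples); it is not the project's actual worker code.
from multiprocessing import Process, Queue

SENTINEL = "GENERATE_FINISHED"


def postprocess_worker(data_q, msg_q):
    while True:
        item = data_q.get()
        if item == SENTINEL:
            break
        msg_q.put(item.upper())  # stand-in for detokenisation / scoring


def io_worker(msg_q, path):
    with open(path, "w") as f:
        while True:
            line = msg_q.get()
            if line == SENTINEL:
                break
            f.write(line + "\n")


def run_demo(lines, n_workers=2, path="hypos_demo.txt"):
    # call this under an `if __name__ == "__main__":` guard (required on spawn platforms)
    data_q, msg_q = Queue(), Queue()
    workers = [Process(target=postprocess_worker, args=(data_q, msg_q)) for _ in range(n_workers)]
    io_proc = Process(target=io_worker, args=(msg_q, path))
    for p in workers + [io_proc]:
        p.start()
    for line in lines:
        data_q.put(line)
    for _ in workers:          # one sentinel per worker, mirroring GENERATE_FINISHED above
        data_q.put(SENTINEL)
    for p in workers:
        p.join()
    msg_q.put(SENTINEL)        # workers are done, now stop the IO process
    io_proc.join()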
def main(args): utils.import_user_module(args) if args.buffer_size < 1: args.buffer_size = 1 if args.max_tokens is None and args.max_sentences is None: args.max_sentences = 1 assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert not args.max_sentences or args.max_sentences <= args.buffer_size, \ '--max-sentences/--batch-size cannot be larger than --buffer-size' logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Setup task, e.g., translation task = tasks.setup_task(args) # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( args.path.split(os.pathsep), arg_overrides=eval(args.model_overrides), task=task, ) # Set dictionaries src_dict = task.source_dictionary tgt_dict = task.target_dictionary # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(args) bpe = encoders.build_bpe(args) def encode_fn(x): if tokenizer is not None: x = tokenizer.encode(x) if bpe is not None: x = bpe.encode(x) return x def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) max_positions = utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]) num_sentences = 0 if args.buffer_size > 1: logger.info('Sentence buffer size: %s', args.buffer_size) logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info('Type the input sentence and press return:') start_id = 0 for line in sys.stdin: inputs = [line.strip()] results = [] for batch in make_batches(inputs, args, task, max_positions, encode_fn): src_tokens = batch.src_tokens src_lengths = batch.src_lengths tgt_tokens = batch.tgt_tokens num_sentences += src_tokens[0].size(0) if use_cuda: if isinstance(src_tokens, list): src_tokens = [tokens.cuda() for tokens in src_tokens] src_lengths = [lengths.cuda() for lengths in src_lengths] else: src_tokens = src_tokens.cuda() src_lengths = src_lengths.cuda() sample = { 'net_input': { 'src_tokens': src_tokens, 'src_lengths': src_lengths, }, 'target': tgt_tokens, } gen_timer.start() translations = task.inference_step(generator, models, sample) num_generated_tokens = sum( len(h[0]['tokens']) for h in translations) gen_timer.stop(num_generated_tokens) for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)): src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad()) tgt_tokens_i = None if tgt_tokens is not None: tgt_tokens_i = utils.strip_pad(tgt_tokens[i, :], tgt_dict.pad()).int().cpu() results.append( (start_id + id, src_tokens_i, hypos, tgt_tokens_i)) # sort output to match input order for id, src_tokens, hypos, tgt_tokens in sorted(results, key=lambda x: x[0]): if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) 
print('S-{}\t{}'.format(id, src_str)) if tgt_tokens is not None: tgt_str = tgt_dict.string(tgt_tokens, args.remove_bpe, escape_unk=True) print('T-{}\t{}'.format(id, tgt_str)) # Process top predictions for j, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) hypo_str = decode_fn(hypo_str) score = hypo['score'] / math.log(2) # convert to base 2 print('H-{}\t{}\t{}'.format(id, score, hypo_str)) print('P-{}\t{}'.format( id, ' '.join( map( lambda x: '{:.4f}'.format(x), # convert from base e to base 2 hypo['positional_scores'].div_(math.log(2) ).tolist(), )))) if args.print_alignment: alignment_str = " ".join( ["{}-{}".format(src, tgt) for src, tgt in alignment]) print('A-{}\t{}'.format(id, alignment_str)) if args.print_step: print('I-{}\t{}'.format(id, hypo['steps'])) print('O-{}\t{}'.format(id, hypo['num_ops'])) if getattr(args, 'retain_iter_history', False): for step, h in enumerate(hypo['history']): _, h_str, _ = utils.post_process_prediction( hypo_tokens=h['tokens'].int().cpu(), src_str=src_str, alignment=None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=None, ) print('E-{}_{}\t{}'.format(id, step, h_str)) # Score only the top hypothesis if tgt_tokens is not None and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE tgt_tokens = tgt_dict.encode_line( tgt_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(tgt_str, hypo_str) else: scorer.add(tgt_tokens, hypo_tokens) sys.stdout.flush() # update running id counter start_id += len(inputs) logger.info( 'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if args.has_target: logger.info('Generate with beam={}: {}'.format(args.beam, scorer.result_string()))
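# Both the hypothesis score on the H- line and the positional scores on the P- line
# above are converted from natural log to log base 2 by dividing by ln(2). A quick
# numeric check of that conversion:
import math

score_nats = math.log(0.25)                 # log prob in nats, as produced by the model
score_bits = score_nats / math.log(2)       # -2.0: probability 0.25 == 2 ** -2
assert abs(score_bits + 2.0) < 1e-9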
def _main(cfg, output_file): logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=output_file, ) logger = logging.getLogger("espresso.dump_posteriors") print_options_meaning_changes(cfg, logger) utils.import_user_module(cfg.common) if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: cfg.dataset.max_tokens = 12000 logger.info(cfg) # Fix seed for stochastic decoding if cfg.common.seed is not None and not cfg.generation.no_seed_provided: np.random.seed(cfg.common.seed) utils.set_torch_seed(cfg.common.seed) use_cuda = torch.cuda.is_available() and not cfg.common.cpu task = tasks.setup_task(cfg.task) overrides = ast.literal_eval(cfg.common_eval.model_overrides) # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) models, saved_cfg = checkpoint_utils.load_model_ensemble( utils.split_paths(cfg.common_eval.path), arg_overrides=overrides, task=task, suffix=cfg.checkpoint.checkpoint_suffix, strict=(cfg.checkpoint.checkpoint_shard_count == 1), num_shards=cfg.checkpoint.checkpoint_shard_count, ) # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) # Load state prior for cross-entropy trained systems decoding if cfg.generation.state_prior_file is not None: prior = torch.from_numpy( kaldi_io.read_vec_flt(cfg.generation.state_prior_file)) else: prior = [] # Optimize ensemble for generation for model in models: if model is None: continue if cfg.common.fp16: model.half() if use_cuda and not cfg.distributed_training.pipeline_model_parallel: model.cuda() model.prepare_for_inference_(cfg) if isinstance(prior, list) and getattr(model, "state_prior", None) is not None: prior.append(model.state_prior.unsqueeze(0)) if isinstance(prior, list) and len(prior) > 0: prior = torch.cat(prior, 0).mean(0) # average priors across models prior = prior / prior.sum() # re-normalize elif isinstance(prior, list): prior = None if prior is not None: if cfg.common.fp16: prior = prior.half() if use_cuda: prior = prior.cuda() log_prior = prior.log() else: log_prior = None # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(cfg.dataset.gen_subset), max_tokens=cfg.dataset.max_tokens, max_sentences=cfg.dataset.batch_size, max_positions=utils.resolve_max_positions( task.max_positions(), *[m.max_positions() for m in models]), ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, seed=cfg.common.seed, num_shards=cfg.distributed_training.distributed_world_size, shard_id=cfg.distributed_training.distributed_rank, num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), ) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(models, cfg.generation) # Generate and dump num_sentences = 0 chunk_width = getattr(task, "chunk_width", None) lprobs_wspecifier = "ark:| copy-matrix ark:- ark:-" with kaldi_io.open_or_fd(lprobs_wspecifier, "wb") as f: if chunk_width is None: # normal dumping (i.e., no chunking) for sample in progress: sample = 
utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue gen_timer.start() lprobs, padding_mask = task.inference_step( generator, models, sample) if log_prior is not None: assert lprobs.size(-1) == log_prior.size(0) lprobs = lprobs - log_prior out_lengths = ((~padding_mask).long().sum( dim=1).cpu() if padding_mask is not None else None) num_processed_frames = sample["ntokens"] gen_timer.stop(num_processed_frames) num_sentences += (sample["nsentences"] if "nsentences" in sample else sample["id"].numel()) if out_lengths is not None: for i in range(sample["nsentences"]): length = out_lengths[i] kaldi_io.write_mat( f, lprobs[i, :length, :].cpu().numpy(), key=sample["utt_id"][i], ) else: for i in range(sample["nsentences"]): kaldi_io.write_mat(f, lprobs[i, :, :].cpu().numpy(), key=sample["utt_id"][i]) else: # dumping chunks within the same utterance from left to right for sample in progress: # sample is actually a list of batches sample = utils.move_to_cuda(sample) if use_cuda else sample utt_id = sample[0]["utt_id"] id = sample[0]["id"] whole_lprobs = None for i, chunk_sample in enumerate(sample): if "net_input" not in chunk_sample: continue assert (chunk_sample["utt_id"] == utt_id and (chunk_sample["id"] == id).all()) gen_timer.start() lprobs, _ = task.inference_step(generator, models, chunk_sample) if log_prior is not None: assert lprobs.size(-1) == log_prior.size(0) lprobs = lprobs - log_prior if whole_lprobs is None: whole_lprobs = lprobs.cpu() else: whole_lprobs = torch.cat((whole_lprobs, lprobs.cpu()), 1) num_processed_frames = chunk_sample["ntokens"] gen_timer.stop(num_processed_frames) if i == len(sample) - 1: num_sentences += len(utt_id) for j in range(len(utt_id)): truncated_length = models[0].output_lengths( task.dataset( cfg.dataset.gen_subset).src_sizes[id[j]] ) # length is after possible subsampling by the model mat = whole_lprobs[j, :truncated_length, :] kaldi_io.write_mat(f, mat.numpy(), key=utt_id[j]) logger.info( "Dumped {:,} utterances ({} frames) in {:.1f}s ({:.2f} sentences/s, {:.2f} frames/s)" .format( num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1.0 / gen_timer.avg, )) return
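# Subtracting the log state prior from the network's log posteriors, as done in both
# dumping branches above, turns posteriors into scaled log likelihoods -- the usual
# hybrid-ASR conversion log p(x|s) = log p(s|x) - log p(s) + const. Toy example with
# made-up numbers:
import torch

log_posteriors = torch.tensor([[0.7, 0.2, 0.1]]).log()   # p(s|x) for one frame
log_prior = torch.tensor([0.5, 0.3, 0.2]).log()          # p(s), averaged over models above
scaled_loglik = log_posteriors - log_prior               # what gets written with kaldi_io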
def main(parsed_args, **unused_kwargs): assert parsed_args.path is not None, '--path required for evaluation!' if torch.cuda.is_available() and not parsed_args.cpu: torch.cuda.set_device(parsed_args.device_id) utils.import_user_module(parsed_args) logger.info(parsed_args) use_cuda = torch.cuda.is_available() and not parsed_args.cpu task = tasks.setup_task(parsed_args) # Load ensemble logger.info('loading model(s) from {}'.format(parsed_args.path)) models, args = checkpoint_utils.load_model_ensemble( parsed_args.path.split(os.pathsep), arg_overrides=eval(parsed_args.model_overrides), task=task, suffix=getattr(parsed_args, "checkpoint_suffix", ""), ) for arg in vars(parsed_args).keys(): if arg not in { 'self_target', 'future_target', 'past_target', 'tokens_per_sample', 'output_size_dictionary', 'add_bos_token', }: setattr(args, arg, getattr(parsed_args, arg)) # reduce tokens per sample by the required context window size args.tokens_per_sample -= args.context_window task = tasks.setup_task(args) # Load dataset splits task.load_dataset(args.gen_subset) dataset = task.dataset(args.gen_subset) if args.context_window > 0: dataset = LMContextWindowDataset( dataset=dataset, tokens_per_sample=args.tokens_per_sample, context_window=args.context_window, pad_idx=task.source_dictionary.pad(), ) logger.info('{} {} {} examples'.format(args.data, args.gen_subset, len(dataset))) # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer) for model in models: model.prepare_for_inference_(args) if args.fp16: model.half() if use_cuda: model.cuda() assert len(models) > 0 logger.info('num. model params: {}'.format( sum(p.numel() for p in models[0].parameters()))) itr = task.get_batch_iterator( dataset=dataset, max_tokens=args.max_tokens or 36000, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( *[model.max_positions() for model in models]), ignore_invalid_inputs=True, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=('tqdm' if not args.no_progress_bar else 'none'), ) gen_timer = StopwatchMeter() scorer = SequenceScorer(task.target_dictionary, args.softmax_batch) score_sum = 0. 
count = 0 if args.remove_bpe is not None: if args.remove_bpe == 'sentencepiece': raise NotImplementedError else: bpe_cont = args.remove_bpe.rstrip() bpe_toks = { i for i in range(len(task.source_dictionary)) if task.source_dictionary[i].endswith(bpe_cont) } bpe_len = len(bpe_cont) else: bpe_toks = None bpe_len = 0 word_stats = dict() wps_meter = TimeMeter() for sample in progress: if 'net_input' not in sample: continue sample = utils.move_to_cuda(sample) if use_cuda else sample gen_timer.start() hypos = scorer.generate(models, sample) gen_timer.stop(sample['ntokens']) for i, hypos_i in enumerate(hypos): hypo = hypos_i[0] sample_id = sample['id'][i] tokens = hypo['tokens'] tgt_len = tokens.numel() pos_scores = hypo['positional_scores'].float() if getattr(args, 'add_bos_token', False): assert hypo['tokens'][0].item() == task.target_dictionary.bos() tokens = tokens[1:] pos_scores = pos_scores[1:] skipped_toks = 0 if bpe_toks is not None: for i in range(tgt_len - 1): if tokens[i].item() in bpe_toks: skipped_toks += 1 pos_scores[i + 1] += pos_scores[i] pos_scores[i] = 0 inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq( float('-inf')) if inf_scores.any(): logger.info( 'skipping tokens with inf scores:', task.target_dictionary.string( tokens[inf_scores.nonzero()])) pos_scores = pos_scores[(~inf_scores).nonzero()] score_sum += pos_scores.sum().cpu() count += pos_scores.numel() - skipped_toks if args.output_word_probs or args.output_word_stats: w = '' word_prob = [] is_bpe = False for i in range(len(tokens)): w_ind = tokens[i].item() w += task.source_dictionary[w_ind] if bpe_toks is not None and w_ind in bpe_toks: w = w[:-bpe_len] is_bpe = True else: word_prob.append((w, pos_scores[i].item())) next_prob = None ind = i + 1 while ind < len(tokens): if pos_scores[ind].item() != 0: next_prob = pos_scores[ind] break ind += 1 word_stats.setdefault(w, WordStat(w, is_bpe)).add( pos_scores[i].item(), next_prob) is_bpe = False w = '' if args.output_word_probs: logger.info( str(int(sample_id)) + " " + ('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob))) wps_meter.update(sample['ntokens']) progress.log({'wps': round(wps_meter.avg)}) avg_nll_loss = -score_sum / count / math.log(2) # convert to base 2 logger.info('Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format( gen_timer.n, gen_timer.sum, 1. / gen_timer.avg)) logger.info('Loss (base 2): {:.4f}, Perplexity: {:.2f}'.format( avg_nll_loss, 2**avg_nll_loss)) if args.output_word_stats: for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True): logger.info(ws)
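# The final perplexity above follows directly from the accumulated token scores: the
# average negative log likelihood in nats is converted to bits per token by dividing
# by ln(2), and perplexity is 2 raised to that value. Small numeric example:
import math

score_sum = -450.0                                   # summed log probs (nats) over scored tokens
count = 100                                          # number of scored tokens
avg_nll_bits = -score_sum / count / math.log(2)      # ~ 6.49 bits per token
perplexity = 2 ** avg_nll_bits                       # ~ 90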
def generate(cfg: UnsupGenerateConfig, models, saved_cfg, use_cuda): task = tasks.setup_task(cfg.fairseq.task) saved_cfg.task.labels = cfg.fairseq.task.labels task.load_dataset(cfg.fairseq.dataset.gen_subset, task_cfg=saved_cfg.task) # Set dictionary tgt_dict = task.target_dictionary logger.info( "| {} {} {} examples".format( cfg.fairseq.task.data, cfg.fairseq.dataset.gen_subset, len(task.dataset(cfg.fairseq.dataset.gen_subset)), ) ) # Load dataset (possibly sharded) itr = get_dataset_itr(cfg, task) # Initialize generator gen_timer = StopwatchMeter() def build_generator(cfg: UnsupGenerateConfig): w2l_decoder = cfg.w2l_decoder if w2l_decoder == DecoderType.VITERBI: from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder return W2lViterbiDecoder(cfg, task.target_dictionary) elif w2l_decoder == DecoderType.KENLM: from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder return W2lKenLMDecoder(cfg, task.target_dictionary) elif w2l_decoder == DecoderType.FAIRSEQ: from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder return W2lFairseqLMDecoder(cfg, task.target_dictionary) elif w2l_decoder == DecoderType.KALDI: from examples.speech_recognition.kaldi.kaldi_decoder import KaldiDecoder assert cfg.kaldi_decoder_config is not None return KaldiDecoder( cfg.kaldi_decoder_config, cfg.beam, ) else: raise NotImplementedError( "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment but found " + str(w2l_decoder) ) generator = build_generator(cfg) kenlm = None fairseq_lm = None if cfg.lm_model is not None: import kenlm kenlm = kenlm.Model(cfg.lm_model) num_sentences = 0 if cfg.results_path is not None and not os.path.exists(cfg.results_path): os.makedirs(cfg.results_path) res_files = prepare_result_files(cfg) errs_t = 0 lengths_hyp_t = 0 lengths_hyp_unit_t = 0 lengths_t = 0 count = 0 num_feats = 0 all_hyp_pieces = [] all_hyp_words = [] num_symbols = ( len([s for s in tgt_dict.symbols if not s.startswith("madeup")]) - tgt_dict.nspecial ) targets = None if cfg.targets is not None: tgt_path = os.path.join( cfg.fairseq.task.data, cfg.fairseq.dataset.gen_subset + "." 
+ cfg.targets ) if os.path.exists(tgt_path): with open(tgt_path, "r") as f: targets = f.read().splitlines() viterbi_transcript = None if cfg.viterbi_transcript is not None and len(cfg.viterbi_transcript) > 0: logger.info(f"loading viterbi transcript from {cfg.viterbi_transcript}") with open(cfg.viterbi_transcript, "r") as vf: viterbi_transcript = vf.readlines() viterbi_transcript = [v.rstrip().split() for v in viterbi_transcript] gen_timer.start() start = 0 end = len(itr) hypo_futures = None if cfg.w2l_decoder == DecoderType.KALDI: logger.info("Extracting features") hypo_futures = [] samples = [] with progress_bar.build_progress_bar(cfg.fairseq.common, itr) as t: for i, sample in enumerate(t): if "net_input" not in sample or i < start or i >= end: continue if "padding_mask" not in sample["net_input"]: sample["net_input"]["padding_mask"] = None hypos, num_feats = gen_hypos( generator, models, num_feats, sample, task, use_cuda ) hypo_futures.append(hypos) samples.append(sample) if cfg.debug: break itr = list(zip(hypo_futures, samples)) start = 0 end = len(itr) logger.info("Finished extracting features") with progress_bar.build_progress_bar(cfg.fairseq.common, itr) as t: for i, sample in enumerate(t): if i < start or i >= end: continue if hypo_futures is not None: hypos, sample = sample hypos = [h.result() for h in hypos] else: if "net_input" not in sample: continue hypos, num_feats = gen_hypos( generator, models, num_feats, sample, task, use_cuda ) for i, sample_id in enumerate(sample["id"].tolist()): if targets is not None: target_tokens = targets[sample_id] elif "target" in sample or "target_label" in sample: toks = ( sample["target"][i, :] if "target_label" not in sample else sample["target_label"][i, :] ) target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu() else: target_tokens = None # Process top predictions ( errs, length_hyp, length, hyp_pieces, hyp_words, ) = process_predictions( cfg, hypos[i], tgt_dict, target_tokens, res_files, ) errs_t += errs lengths_hyp_t += length_hyp lengths_hyp_unit_t += ( len(hyp_pieces) if len(hyp_pieces) > 0 else len(hyp_words) ) lengths_t += length count += 1 all_hyp_pieces.append(hyp_pieces) all_hyp_words.append(hyp_words) num_sentences += ( sample["nsentences"] if "nsentences" in sample else sample["id"].numel() ) lm_score_sum = 0 if kenlm is not None: if cfg.unit_lm: lm_score_sum = sum(kenlm.score(w) for w in all_hyp_pieces) else: lm_score_sum = sum(kenlm.score(w) for w in all_hyp_words) elif fairseq_lm is not None: lm_score_sum = sum(fairseq_lm.score([h.split() for h in all_hyp_words])[0]) vt_err_t = 0 vt_length_t = 0 if viterbi_transcript is not None: unit_hyps = [] if cfg.targets is not None and cfg.lexicon is not None: lex = {} with open(cfg.lexicon, "r") as lf: for line in lf: items = line.rstrip().split() lex[items[0]] = items[1:] for h in all_hyp_pieces: hyp_ws = [] for w in h.split(): assert w in lex, w hyp_ws.extend(lex[w]) unit_hyps.append(hyp_ws) else: unit_hyps.extend([h.split() for h in all_hyp_words]) vt_err_t = sum( editdistance.eval(vt, h) for vt, h in zip(viterbi_transcript, unit_hyps) ) vt_length_t = sum(len(h) for h in viterbi_transcript) if res_files is not None: for r in res_files.values(): r.close() gen_timer.stop(lengths_hyp_t) return GenResult( count, errs_t, gen_timer, lengths_hyp_unit_t, lengths_hyp_t, lengths_t, lm_score_sum, num_feats, num_sentences, num_symbols, vt_err_t, vt_length_t, )
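# Error counts in this function are accumulated as raw edit distances (errs_t, vt_err_t)
# against reference lengths, and only the caller turns them into a rate. A minimal
# illustration using the same `editdistance` package:
import editdistance

ref = "the cat sat on the mat".split()
hyp = "the cat sit on mat".split()
errors = editdistance.eval(ref, hyp)       # 2: one substitution + one deletion
wer = 100.0 * errors / len(ref)            # ~ 33.3 %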
def transcribe(self, wav_files): process_dir = uuid.uuid1().hex process_dir = os.path.join(self.temp_path, process_dir) os.makedirs(process_dir) self.args.data = process_dir self.args.gen_subset = 'test' self.args.results_path = process_dir copy2(self.args.w2vec_dict, process_dir) test_words = os.path.join(process_dir, 'test.wrd') test_letters = os.path.join(process_dir, 'test.ltr') test_map = os.path.join(process_dir, 'test.tsv') paths = [os.path.abspath(d) for d in wav_files] for i in range(0, len(paths)): audio_info = soundfile.info(paths[i]) frames = audio_info.frames paths[i] = paths[i] + '\t' + str(frames) words = ['THIS IS A SAMPLE'] * len(paths) letters = [d.replace(' ', '|') for d in words] letters = [' '.join(list(d)) + ' |' for d in letters] with open(test_words, 'w') as f: f.write('\n'.join(words)) with open(test_letters, 'w') as f: f.write('\n'.join(letters)) with open(test_map, 'w') as f: f.write('\n') f.write('\n'.join(paths)) args = self.args if args.max_tokens is None and args.batch_size is None: args.max_tokens = 4000000 use_cuda = torch.cuda.is_available() and not args.cpu task = tasks.setup_task(args) if self.state is None: state = checkpoint_utils.load_checkpoint_to_cpu(args.path, None) state['cfg']['model']['w2v_path'] = self.pretrain_model state['cfg']['generation']['beam'] = self.beam_size self.state = state else: state = self.state if self.models is None: models, saved_cfg = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=ast.literal_eval(args.model_overrides), task=task, suffix=args.checkpoint_suffix, strict=(args.checkpoint_shard_count == 1), num_shards=args.checkpoint_shard_count, state=state, ) self.models, self.saved_cfg = models, saved_cfg else: models, saved_cfg = self.models, self.saved_cfg optimize_models(args, use_cuda, models) task.load_dataset(args.gen_subset, task_cfg=saved_cfg.task) # Set dictionary tgt_dict = task.target_dictionary # hack to pass transitions to W2lDecoder if args.criterion == "asg_loss": raise NotImplementedError("asg_loss is currently not supported") # trans = criterions[0].asg.trans.data # args.asg_transitions = torch.flatten(trans).tolist() # Load dataset (possibly sharded) itr = get_dataset_itr(args, task, models) # Initialize generator gen_timer = StopwatchMeter() def build_generator(args): w2l_decoder = getattr(args, "w2l_decoder", None) if w2l_decoder == "viterbi": from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder return W2lViterbiDecoder(args, task.target_dictionary) elif w2l_decoder == "kenlm": from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder return W2lKenLMDecoder(args, task.target_dictionary) elif w2l_decoder == "fairseqlm": from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder return W2lFairseqLMDecoder(args, task.target_dictionary) else: print( "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment" ) # please do not touch this unless you test both generate.py and infer.py with audio_pretraining task if self.generator is None: generator = build_generator(args) else: generator = self.generator if args.load_emissions: generator = ExistingEmissionsDecoder( generator, np.load(args.load_emissions, allow_pickle=True)) num_sentences = 0 if args.results_path is not None and not os.path.exists( args.results_path): os.makedirs(args.results_path) max_source_pos = (utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ) if max_source_pos is not None: 
max_source_pos = max_source_pos[0] if max_source_pos is not None: max_source_pos = max_source_pos[0] - 1 if args.dump_emissions: emissions = {} if args.dump_features: features = {} models[0].bert.proj = None else: res_files = prepare_result_files(args) errs_t = 0 lengths_t = 0 with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample["target"][:, :args.prefix_size] gen_timer.start() if args.dump_emissions: with torch.no_grad(): encoder_out = models[0](**sample["net_input"]) emm = models[0].get_normalized_probs(encoder_out, log_probs=True) emm = emm.transpose(0, 1).cpu().numpy() for i, id in enumerate(sample["id"]): emissions[id.item()] = emm[i] continue elif args.dump_features: with torch.no_grad(): encoder_out = models[0](**sample["net_input"]) feat = encoder_out["encoder_out"].transpose( 0, 1).cpu().numpy() for i, id in enumerate(sample["id"]): padding = (encoder_out["encoder_padding_mask"] [i].cpu().numpy() if encoder_out["encoder_padding_mask"] is not None else None) features[id.item()] = (feat[i], padding) continue hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample["id"].tolist()): speaker = None # id = task.dataset(args.gen_subset).ids[int(sample_id)] id = sample_id toks = (sample["target"][i, :] if "target_label" not in sample else sample["target_label"][i, :]) target_tokens = utils.strip_pad( toks, tgt_dict.pad()).int().cpu() # Process top predictions errs, length = process_predictions( args, hypos[i], None, tgt_dict, target_tokens, res_files, speaker, id, ) errs_t += errs lengths_t += length wps_meter.update(num_generated_tokens) t.log({"wps": round(wps_meter.avg)}) num_sentences += (sample["nsentences"] if "nsentences" in sample else sample["id"].numel()) wer = None if args.dump_emissions: emm_arr = [] for i in range(len(emissions)): emm_arr.append(emissions[i]) np.save(args.dump_emissions, emm_arr) elif args.dump_features: feat_arr = [] for i in range(len(features)): feat_arr.append(features[i]) np.save(args.dump_features, feat_arr) else: if lengths_t > 0: wer = errs_t * 100.0 / lengths_t hypo_file = [ file for file in os.listdir(process_dir) if 'hypo.word' in file ][0] hypo_file = os.path.join(process_dir, hypo_file) with open(hypo_file) as f: hypos = f.read().splitlines() for i in range(0, len(hypos)): words = ' '.join(hypos[i].split()[:-1]) idx_ = hypos[i].split()[-1].split('-')[1][:-1] hypos[i] = (words, int(idx_)) hypos = sorted(hypos, key=lambda x: x[1]) hypos = [h[0] for h in hypos] os.system('rm -rf ' + process_dir) return hypos
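# transcribe() recovers its results by parsing the generated hypo.word file, where each
# line ends with an "(<name>-<index>)" token identifying the input utterance. The
# parsing and re-sorting done above is equivalent to this standalone snippet on toy
# file content:
lines = [
    "HELLO WORLD (None-1)",
    "GOOD MORNING (None-0)",
]
parsed = []
for line in lines:
    words = " ".join(line.split()[:-1])
    idx = int(line.split()[-1].split("-")[1][:-1])   # strip the trailing ")"
    parsed.append((words, idx))
hypos = [words for words, _ in sorted(parsed, key=lambda x: x[1])]
# -> ["GOOD MORNING", "HELLO WORLD"], i.e. back in input order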
def main(args, task=None, model_state=None): check_args(args) if args.max_tokens is None and args.batch_size is None: args.max_tokens = 4000000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu logger.info("| decoding with criterion {}".format(args.criterion)) # Load ensemble if args.load_emissions: models, criterions = [], [] else: logger.info("| loading model(s) from {}".format(args.path)) models, criterions, task = load_models_and_criterions( args.path, data_path=args.data, arg_overrides=eval(args.model_overrides), # noqa task=task, model_state=model_state, ) optimize_models(args, use_cuda, models) # Load dataset splits task.load_dataset(args.gen_subset) # Set dictionary tgt_dict = task.target_dictionary logger.info("| {} {} {} examples".format( args.data, args.gen_subset, len(task.dataset(args.gen_subset)))) # hack to pass transitions to W2lDecoder if args.criterion == "asg_loss": trans = criterions[0].asg.trans.data args.asg_transitions = torch.flatten(trans).tolist() # Load dataset (possibly sharded) itr = get_dataset_itr(args, task, models) # Initialize generator gen_timer = StopwatchMeter() def build_generator(args): w2l_decoder = getattr(args, "w2l_decoder", None) if w2l_decoder == "viterbi": from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder return W2lViterbiDecoder(args, task.target_dictionary) elif w2l_decoder == "kenlm": from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder return W2lKenLMDecoder(args, task.target_dictionary) elif w2l_decoder == "fairseqlm": from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder return W2lFairseqLMDecoder(args, task.target_dictionary) else: print( "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment" ) # please do not touch this unless you test both generate.py and infer.py with audio_pretraining task generator = build_generator(args) if args.load_emissions: generator = ExistingEmissionsDecoder( generator, np.load(args.load_emissions, allow_pickle=True)) logger.info("loaded emissions from " + args.load_emissions) num_sentences = 0 if args.results_path is not None and not os.path.exists(args.results_path): os.makedirs(args.results_path) max_source_pos = (utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ) if max_source_pos is not None: max_source_pos = max_source_pos[0] if max_source_pos is not None: max_source_pos = max_source_pos[0] - 1 if args.dump_emissions: emissions = {} if args.dump_features: features = {} models[0].bert.proj = None else: res_files = prepare_result_files(args) errs_t = 0 lengths_t = 0 with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample["target"][:, :args.prefix_size] gen_timer.start() if args.dump_emissions: with torch.no_grad(): encoder_out = models[0](**sample["net_input"]) emm = models[0].get_normalized_probs(encoder_out, log_probs=True) emm = emm.transpose(0, 1).cpu().numpy() for i, id in enumerate(sample["id"]): emissions[id.item()] = emm[i] continue elif args.dump_features: with torch.no_grad(): encoder_out = models[0](**sample["net_input"]) feat = encoder_out["encoder_out"].transpose( 0, 1).cpu().numpy() for i, id in enumerate(sample["id"]): padding = (encoder_out["encoder_padding_mask"][i].cpu( ).numpy() if encoder_out["encoder_padding_mask"] is 
not None else None) features[id.item()] = (feat[i], padding) continue hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample["id"].tolist()): speaker = None # id = task.dataset(args.gen_subset).ids[int(sample_id)] id = sample_id toks = (sample["target"][i, :] if "target_label" not in sample else sample["target_label"][i, :]) target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu() # Process top predictions errs, length = process_predictions( args, hypos[i], None, tgt_dict, target_tokens, res_files, speaker, id, ) errs_t += errs lengths_t += length wps_meter.update(num_generated_tokens) t.log({"wps": round(wps_meter.avg)}) num_sentences += (sample["nsentences"] if "nsentences" in sample else sample["id"].numel()) wer = None if args.dump_emissions: emm_arr = [] for i in range(len(emissions)): emm_arr.append(emissions[i]) np.save(args.dump_emissions, emm_arr) logger.info( f"saved {len(emissions)} emissions to {args.dump_emissions}") elif args.dump_features: feat_arr = [] for i in range(len(features)): feat_arr.append(features[i]) np.save(args.dump_features, feat_arr) logger.info(f"saved {len(features)} emissions to {args.dump_features}") else: if lengths_t > 0: wer = errs_t * 100.0 / lengths_t logger.info(f"WER: {wer}") logger.info("| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}" "sentences/s, {:.2f} tokens/s)".format( num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1.0 / gen_timer.avg, )) logger.info("| Generate {} with beam={}".format( args.gen_subset, args.beam)) return task, wer
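# The --dump-emissions / --dump-features branches above collect one array per utterance
# keyed by sample id and then flatten the dict back into dataset order before saving.
# A minimal equivalent with dummy arrays (object dtype because utterances differ in
# length); the file name here is illustrative only:
import numpy as np

emissions = {1: np.zeros((4, 10)), 0: np.ones((7, 10))}        # id -> (frames, classes)
emm_arr = [emissions[i] for i in range(len(emissions))]         # restore order 0..N-1
np.save("emissions_demo.npy", np.array(emm_arr, dtype=object))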
def main(args, config=None, init_distributed=False): utils.import_user_module(args) experiment = None if config: experiment = ExistingExperiment( api_key=config["api_key"], previous_experiment=config["experiment_key"], auto_output_logging=None, ) assert ( args.max_tokens is not None or args.max_sentences is not None ), "Must specify batch size either with --max-tokens or --max-sentences" # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) print(args) if experiment: experiment.log_parameters(vars(args), prefix="Device {} :: ".format( args.device_id)) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print("| model {}, criterion {}".format(args.arch, criterion.__class__.__name__)) print("| num. model params: {} (num. trained: {})".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) if experiment: experiment.log_parameters( { "criterion": criterion.__class__.__name__, "num. model params": sum(p.numel() for p in model.parameters()), "num. trained params": sum(p.numel() for p in model.parameters() if p.requires_grad), }, prefix="Device {} :: ".format(args.device_id), ) # Build trainer trainer = Trainer(args, task, model, criterion) print("| training on {} GPUs".format(args.distributed_world_size)) print("| max tokens per GPU = {} and max sentences per GPU = {}".format( args.max_tokens, args.max_sentences)) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(",") while (lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update): # train for one epoch train(args, trainer, task, epoch_itr, experiment) if (not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0): valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets, experiment) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) reload_dataset = ":" in getattr(args, "data", "") # sharded data: get train iterator for next epoch epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset) train_meter.stop() print("| done training in {:.1f} seconds".format(train_meter.sum)) if experiment: experiment.log_metrics( { "valid_loss": valid_losses[0], "lr": lr }, prefix="Device {} ".format(args.device_id), )
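# The while-loop above keeps training until any one of three budgets is exhausted: the
# learning rate annealed below --min-lr, the epoch count reached --max-epoch, or the
# update count reached --max-update. The control predicate, stripped of fairseq
# specifics:
import math


def should_keep_training(lr, epoch, num_updates,
                         min_lr=1e-9, max_epoch=math.inf, max_update=math.inf):
    return lr > min_lr and epoch < max_epoch and num_updates < max_update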
def _main(cfg: DictConfig, output_file): logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=output_file, ) logger = logging.getLogger("fairseq_cli.generate") utils.import_user_module(cfg.common) if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: cfg.dataset.max_tokens = 12000 logger.info(cfg) # Fix seed for stochastic decoding if cfg.common.seed is not None and not cfg.generation.no_seed_provided: np.random.seed(cfg.common.seed) utils.set_torch_seed(cfg.common.seed) use_cuda = torch.cuda.is_available() and not cfg.common.cpu # Load dataset splits task = tasks.setup_task(cfg.task) # Set dictionaries try: src_dict = getattr(task, "source_dictionary", None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary overrides = ast.literal_eval(cfg.common_eval.model_overrides) # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) models, saved_cfg = checkpoint_utils.load_model_ensemble( utils.split_paths(cfg.common_eval.path), arg_overrides=overrides, task=task, suffix=cfg.checkpoint.checkpoint_suffix, strict=(cfg.checkpoint.checkpoint_shard_count == 1), num_shards=cfg.checkpoint.checkpoint_shard_count, ) # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) if cfg.generation.lm_path is not None: overrides["data"] = cfg.task.data try: lms, _ = checkpoint_utils.load_model_ensemble( [cfg.generation.lm_path], arg_overrides=overrides, task=None) except: logger.warning( f"Failed to load language model! Please make sure that the language model dict is the same " f"as target dict and is located in the data dir ({cfg.task.data})" ) raise assert len(lms) == 1 else: lms = [None] # Optimize ensemble for generation for model in chain(models, lms): if model is None: continue if cfg.common.fp16: model.half() if use_cuda and not cfg.distributed_training.pipeline_model_parallel: model.cuda() model.prepare_for_inference_(cfg) # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(cfg.generation.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(cfg.dataset.gen_subset), max_tokens=cfg.dataset.max_tokens, max_sentences=cfg.dataset.batch_size, max_positions=utils.resolve_max_positions( task.max_positions(), *[m.max_positions() for m in models]), ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, seed=cfg.common.seed, num_shards=cfg.distributed_training.distributed_world_size, shard_id=cfg.distributed_training.distributed_rank, num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), ) # Initialize generator gen_timer = StopwatchMeter() extra_gen_cls_kwargs = { "lm_model": lms[0], "lm_weight": cfg.generation.lm_weight } generator = task.build_generator(models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs) # Handle tokenization and BPE tokenizer = task.build_tokenizer(cfg.tokenizer) bpe = 
task.build_bpe(cfg.bpe) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x scorer = scoring.build_scorer(cfg.scoring, tgt_dict) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if cfg.generation.prefix_size > 0: prefix_tokens = sample["target"][:, :cfg.generation.prefix_size] constraints = None if "constraints" in sample: constraints = sample["constraints"] gen_timer.start() hypos = task.inference_step( generator, models, sample, prefix_tokens=prefix_tokens, constraints=constraints, ) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample["id"].tolist()): has_target = sample["target"] is not None # Remove padding if "src_tokens" in sample["net_input"]: src_tokens = utils.strip_pad( sample["net_input"]["src_tokens"][i, :], tgt_dict.pad()) else: src_tokens = None target_tokens = None if has_target: target_tokens = (utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu()) # Either retrieve the original sentences or regenerate them from tokens. if align_dict is not None: src_str = task.dataset( cfg.dataset.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( cfg.dataset.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) else: src_str = "" if has_target: target_str = tgt_dict.string( target_tokens, cfg.common_eval.post_process, escape_unk=True, extra_symbols_to_ignore= get_symbols_to_strip_from_output(generator), ) src_str = decode_fn(src_str) if has_target: target_str = decode_fn(target_str) if not cfg.common_eval.quiet: if src_dict is not None: print("S-{}\t{}".format(sample_id, src_str), file=output_file) if has_target: print("T-{}\t{}".format(sample_id, target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][:cfg.generation.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo["tokens"].int().cpu(), src_str=src_str, alignment=hypo["alignment"], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=cfg.common_eval.post_process, extra_symbols_to_ignore=get_symbols_to_strip_from_output( generator), ) detok_hypo_str = decode_fn(hypo_str) if not cfg.common_eval.quiet: score = hypo["score"] / math.log(2) # convert to base 2 # original hypothesis (after tokenization and BPE) print( "H-{}\t{}\t{}".format(sample_id, score, hypo_str), file=output_file, ) # detokenized hypothesis print( "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str), file=output_file, ) print( "P-{}\t{}".format( sample_id, " ".join( map( lambda x: "{:.4f}".format(x), # convert from base e to base 2 hypo["positional_scores"].div_(math.log(2) ).tolist(), )), ), file=output_file, ) if cfg.generation.print_alignment == "hard": print( "A-{}\t{}".format( sample_id, " ".join([ "{}-{}".format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment ]), ), file=output_file, ) if cfg.generation.print_alignment == "soft": print( "A-{}\t{}".format( sample_id, " ".join([ ",".join(src_probs) for src_probs in alignment ]), ), file=output_file, ) if cfg.generation.print_step: print( "I-{}\t{}".format(sample_id, hypo["steps"]), file=output_file, ) if cfg.generation.retain_iter_history: for step, h in enumerate(hypo["history"]): _, h_str, _ = 
utils.post_process_prediction( hypo_tokens=h["tokens"].int().cpu(), src_str=src_str, alignment=None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=None, ) print( "E-{}_{}\t{}".format(sample_id, step, h_str), file=output_file, ) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or cfg.common_eval.post_process is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) hypo_tokens = tgt_dict.encode_line( detok_hypo_str, add_if_not_exist=True) if hasattr(scorer, "add_string"): scorer.add_string(target_str, detok_hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) progress.log({"wps": round(wps_meter.avg)}) num_sentences += (sample["nsentences"] if "nsentences" in sample else sample["id"].numel()) logger.info("NOTE: hypothesis and token scores are output in base 2") logger.info( "Translated {:,} sentences ({:,} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)" .format( num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1.0 / gen_timer.avg, )) if has_target: if cfg.bpe and not cfg.generation.sacrebleu: if cfg.common_eval.post_process: logger.warning( "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" ) else: logger.warning( "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization" ) # use print to be consistent with other main outputs: S-, H-, T-, D- and so on print( "Generate {} with beam={}: {}".format(cfg.dataset.gen_subset, cfg.generation.beam, scorer.result_string()), file=output_file, ) return scorer
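# get_symbols_to_strip_from_output() is used above but not defined in this file. In
# fairseq it is a small helper along the following lines (reproduced from memory as an
# assumption, not taken from this document): if the generator declares extra output
# symbols to strip, return that set, otherwise strip only EOS.
def get_symbols_to_strip_from_output(generator):
    if hasattr(generator, "symbols_to_strip_from_output"):
        return generator.symbols_to_strip_from_output
    return {generator.eos}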
def main(args, task=None, model_state=None): check_args(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 4000000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu if task is None: # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) logger.info("| {} {} {} examples".format( args.data, args.gen_subset, len(task.dataset(args.gen_subset)))) all_trans = [] if 'audio' in args.task: """ tasks that load tsv data trans_path: raw trans (before bpe) """ trans_path = os.path.join(args.data, "{}.word".format(args.gen_subset)) with open(trans_path, "r") as f: for line in f: all_trans.append(line) # Set dictionary tgt_dict = task.target_dictionary logger.info("| decoding with criterion {}".format(args.criterion)) # Load ensemble logger.info("| loading model(s) from {}".format(args.path)) models, criterions, _ = load_models_and_criterions( args.path, data_path=args.data, arg_overrides=eval(args.model_overrides), # noqa task=task, model_state=model_state, ) optimize_models(args, use_cuda, models) # Load dataset (possibly sharded) itr = get_dataset_itr(args, task, models) # Initialize generator gen_timer = StopwatchMeter() generator = CIF_BERT_Decoder(args, task.target_dictionary) num_sentences = 0 if args.results_path is not None and not os.path.exists(args.results_path): os.makedirs(args.results_path) res_files = prepare_result_files(args) errs_t = 0 lengths_t = 0 with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample["target"][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample["id"].tolist()): speaker = None # id = task.dataset(args.gen_subset).ids[int(sample_id)] id = sample_id toks = sample["target"][ i, :] if 'target_label' not in sample else sample[ "target_label"][i, :] target_tokens = (utils.strip_pad(toks, tgt_dict.pad()).int().cpu()) trans = all_trans[id] if all_trans else task.dataset( args.gen_subset).ids[sample_id][1]['output']['text'].strip( ) # Process top predictions errs, length = process_predictions(args, hypos[i], None, tgt_dict, target_tokens, res_files, speaker, id, trans) errs_t += errs lengths_t += length wps_meter.update(num_generated_tokens) t.log({"wps": round(wps_meter.avg)}) num_sentences += sample[ "nsentences"] if "nsentences" in sample else sample[ "id"].numel() wer = None if lengths_t > 0: wer = errs_t * 100.0 / lengths_t logger.info(f"WER: {wer}") logger.info("| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}" "sentences/s, {:.2f} tokens/s)".format( num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1.0 / gen_timer.avg, )) logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam)) return task, wer
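# all_trans above is simply the raw (pre-BPE) transcript file for the split, read line
# by line so that the i-th dataset example maps to the i-th transcript. A roughly
# equivalent, self-contained version of that loading step:
import os


def load_raw_transcripts(data_dir, gen_subset):
    trans_path = os.path.join(data_dir, "{}.word".format(gen_subset))
    with open(trans_path, "r") as f:
        return [line.rstrip("\n") for line in f]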
def _main(args, output_file):
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=os.environ.get("LOGLEVEL", "INFO").upper(),
        stream=output_file,
    )
    logger = logging.getLogger("espresso.speech_recognize")
    if output_file is not sys.stdout:  # also print to stdout
        logger.addHandler(logging.StreamHandler(sys.stdout))

    print_options_meaning_changes(args, logger)

    utils.import_user_module(args)

    if args.max_tokens is None and args.batch_size is None:
        args.max_tokens = 12000
    logger.info(args)

    # Fix seed for stochastic decoding
    if args.seed is not None and not args.no_seed_provided:
        np.random.seed(args.seed)
        utils.set_torch_seed(args.seed)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset split
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionary
    dictionary = task.target_dictionary

    overrides = ast.literal_eval(args.model_overrides)

    # Load ensemble
    logger.info("loading model(s) from {}".format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=overrides,
        task=task,
        suffix=getattr(args, "checkpoint_suffix", ""),
        strict=(args.checkpoint_shard_count == 1),
        num_shards=args.checkpoint_shard_count,
    )

    if args.lm_path is not None:
        overrides["data"] = args.data
        try:
            lms, _ = checkpoint_utils.load_model_ensemble(
                utils.split_paths(args.lm_path),
                arg_overrides=overrides,
                task=None,
            )
        except Exception:
            logger.warning(
                f"Failed to load language model! Please make sure that the language model dict is the same "
                f"as the target dict and is located in the data dir ({args.data})"
            )
            raise
        assert len(lms) == 1 or len(lms) == 2  # Multi-level LM expects two LMs
    else:
        lms = [None]

    for i, m in enumerate(lms):
        if m is None:
            continue
        if hasattr(m, "is_wordlm") and m.is_wordlm:
            # assume subword LM comes before word LM
            if i > 0 and isinstance(lms[i - 1], FairseqLanguageModel):
                lms[i - 1] = MultiLevelLanguageModel(
                    m,
                    lms[i - 1],
                    subwordlm_weight=args.subwordlm_weight,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                del lms[i]
                logger.info("LM fusion with Multi-level LM")
            else:
                lms[i] = TensorizedLookaheadLanguageModel(
                    m,
                    dictionary,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                logger.info("LM fusion with Look-ahead Word LM")
        else:
            assert isinstance(m, FairseqLanguageModel)
            logger.info("LM fusion with Subword LM")
    if args.lm_weight != 0.0:
        logger.info("using LM fusion with lm-weight={:.2f}".format(args.lm_weight))

    # Optimize ensemble for generation
    for model in chain(models, lms):
        if model is None:
            continue
        if args.fp16:
            model.half()
        if use_cuda and not args.pipeline_model_parallel:
            model.cuda()
        model.prepare_for_inference_(args)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.batch_size,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[
                model.max_positions() if hasattr(model, "encoder")
                else (None, model.max_positions())
                for model in models
            ],
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        data_buffer_size=args.data_buffer_size,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=("tqdm" if not args.no_progress_bar else "none"),
    )

    # Initialize generator
    if args.match_source_len:
        logger.warning(
            "The option match_source_len is not applicable to speech recognition. Ignoring it."
        )
    gen_timer = StopwatchMeter()
    extra_gen_cls_kwargs = {
        "lm_model": lms[0],
        "lm_weight": args.lm_weight,
        "eos_factor": args.eos_factor,
    }
    args.score_reference = False  # not applicable for ASR
    temp_val = args.print_alignment
    args.print_alignment = False  # not applicable for ASR
    generator = task.build_generator(models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs)
    args.print_alignment = temp_val

    # Handle tokenization and BPE
    tokenizer = task.build_tokenizer(args)
    bpe = task.build_bpe(args)

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Generate and compute WER
    scorer = wer.Scorer(dictionary, wer_output_filter=args.wer_output_filter)

    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if "net_input" not in sample:
            continue

        prefix_tokens = None
        if args.prefix_size > 0:
            prefix_tokens = sample["target"][:, : args.prefix_size]

        constraints = None
        if "constraints" in sample:
            constraints = sample["constraints"]

        gen_timer.start()
        hypos = task.inference_step(
            generator, models, sample, prefix_tokens=prefix_tokens, constraints=constraints
        )
        num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        # obtain nonpad mask of encoder output to plot attentions
        if args.print_alignment:
            net_input = sample["net_input"]
            src_tokens = net_input["src_tokens"]
            output_lengths = models[0].encoder.output_lengths(net_input["src_lengths"])
            nonpad_idxs = sequence_mask(
                output_lengths, models[0].encoder.output_lengths(src_tokens.size(1))
            )

        for i in range(len(sample["id"])):
            has_target = sample["target"] is not None
            utt_id = sample["utt_id"][i]

            # Retrieve the original sentences
            if has_target:
                target_str = sample["target_raw_text"][i]
                if not args.quiet:
                    detok_target_str = decode_fn(target_str)
                    print("T-{}\t{}".format(utt_id, detok_target_str), file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][: args.nbest]):
                hypo_str = dictionary.string(
                    hypo["tokens"].int().cpu(),
                    bpe_symbol=None,
                    extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
                )  # not removing bpe at this point
                detok_hypo_str = decode_fn(hypo_str)
                if not args.quiet:
                    score = hypo["score"] / math.log(2)  # convert to base 2
                    print("H-{}\t{}\t{}".format(utt_id, detok_hypo_str, score), file=output_file)

                # Score and obtain attention only for the top hypothesis
                if j == 0:
                    # src_len x tgt_len
                    attention = (
                        hypo["attention"][nonpad_idxs[i]].float().cpu()
                        if args.print_alignment and hypo["attention"] is not None
                        else None
                    )
                    if args.print_alignment and attention is not None:
                        save_dir = os.path.join(args.results_path, "attn_plots")
                        os.makedirs(save_dir, exist_ok=True)
                        plot_attention(attention, detok_hypo_str, utt_id, save_dir)
                    scorer.add_prediction(utt_id, hypo_str)
                    if has_target:
                        scorer.add_evaluation(utt_id, target_str, hypo_str)

        wps_meter.update(num_generated_tokens)
        progress.log({"wps": round(wps_meter.avg)})
        num_sentences += (
            sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
        )

    logger.info("NOTE: hypothesis and token scores are output in base 2")
    logger.info(
        "Recognized {} utterances ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format(
            num_sentences,
            gen_timer.n,
            gen_timer.sum,
            num_sentences / gen_timer.sum,
            1.0 / gen_timer.avg,
        )
    )
    if args.print_alignment:
        logger.info("Saved attention plots in " + save_dir)

    if has_target:
        scorer.add_ordered_utt_list(task.datasets[args.gen_subset].tgt.utt_ids)

    fn = "decoded_char_results.txt"
    with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
        f.write(scorer.print_char_results())
        logger.info("Decoded char results saved as " + f.name)
    fn = "decoded_results.txt"
    with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
        f.write(scorer.print_results())
        logger.info("Decoded results saved as " + f.name)

    if has_target:
        header = "Recognize {} with beam={}: ".format(args.gen_subset, args.beam)
        fn = "wer"
        with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
            res = "WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%".format(*(scorer.wer()))
            logger.info(header + res)
            f.write(res + "\n")
            logger.info("WER saved in " + f.name)
        fn = "cer"
        with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
            res = "CER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%".format(*(scorer.cer()))
            logger.info(" " * len(header) + res)
            f.write(res + "\n")
            logger.info("CER saved in " + f.name)
        fn = "aligned_results.txt"
        with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
            f.write(scorer.print_aligned_results())
            logger.info("Aligned results saved as " + f.name)

    return scorer
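# --- Illustrative sketch (not part of the original script) ---
# scorer.wer() / scorer.cer() above come from espresso's wer.Scorer, and the
# format strings expect four percentages. Assuming the scorer tracks raw
# alignment counts, the arithmetic behind such a tuple would be:

def _error_breakdown(num_sub: int, num_ins: int, num_del: int, num_ref_tokens: int):
    """Return (overall, substitution, insertion, deletion) error rates in percent."""
    if num_ref_tokens == 0:
        return 0.0, 0.0, 0.0, 0.0
    sub = 100.0 * num_sub / num_ref_tokens
    ins = 100.0 * num_ins / num_ref_tokens
    dele = 100.0 * num_del / num_ref_tokens
    return sub + ins + dele, sub, ins, dele

# e.g. 3 substitutions, 1 insertion, 2 deletions over 100 reference words
# -> WER=6.00%, Sub=3.00%, Ins=1.00%, Del=2.00%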
def _main(cfg: DictConfig, output_file):
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=os.environ.get("LOGLEVEL", "INFO").upper(),
        stream=output_file,
    )
    logger = logging.getLogger("fairseq_cli.predict")

    utils.import_user_module(cfg.common)

    if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None:
        cfg.dataset.max_tokens = 12000
    logger.info(cfg)

    # Fix seed for stochastic decoding
    if cfg.common.seed is not None and not cfg.generation.no_seed_provided:
        np.random.seed(cfg.common.seed)
        utils.set_torch_seed(cfg.common.seed)

    use_cuda = torch.cuda.is_available() and not cfg.common.cpu

    # Load dataset splits
    task = tasks.setup_task(cfg.task)
    task.load_dataset(cfg.dataset.gen_subset)

    # Set dictionaries
    src_dict = getattr(task, "source_dictionary", None)
    tag_dict = task.tag_dictionary

    overrides = ast.literal_eval(cfg.common_eval.model_overrides)

    # Load ensemble
    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
    models, saved_cfg = checkpoint_utils.load_model_ensemble(
        utils.split_paths(cfg.common_eval.path),
        arg_overrides=overrides,
        task=task,
        suffix=cfg.checkpoint.checkpoint_suffix,
        strict=(cfg.checkpoint.checkpoint_shard_count == 1),
        num_shards=cfg.checkpoint.checkpoint_shard_count,
    )

    # Optimize ensemble for generation
    for model in models:
        model.prepare_for_inference_(cfg)
        if cfg.common.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(cfg.dataset.gen_subset),
        max_tokens=cfg.dataset.max_tokens,
        max_sentences=cfg.dataset.batch_size,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
        seed=cfg.common.seed,
        num_shards=cfg.distributed_training.distributed_world_size,
        shard_id=cfg.distributed_training.distributed_rank,
        num_workers=cfg.dataset.num_workers,
        data_buffer_size=cfg.dataset.data_buffer_size,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=cfg.common.log_format,
        log_interval=cfg.common.log_interval,
        default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(cfg.tokenizer)
    bpe = encoders.build_bpe(cfg.bpe)

    def decode_fn(x):
        # decode tag
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    num_sentences = 0
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if "net_input" not in sample:
            continue

        gen_timer.start()
        hypos = task.inference_step(models, sample, cfg.task.tagging_head_name)
        num_generated_tokens = sample["ntokens"]
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample["id"].tolist()):
            hypo = hypos[i]
            # can't convert a cuda:0 device type tensor to numpy; Tensor.cpu() must be
            # called first, so hypo is expected to already be on the host here
            hypo_tokens = np.array(hypo) + tag_dict.nspecial
            hypo_str = tag_dict.string(hypo_tokens)
            has_target = sample["target"] is not None

            # Remove padding
            if "src_tokens" in sample["net_input"]:
                src_tokens = utils.strip_pad(
                    sample["net_input"]["src_tokens"][i, :], src_dict.pad()
                )
                src_str = src_dict.string(src_tokens, cfg.common_eval.post_process)
                assert len(hypo) == src_tokens.numel()

            src_str = decode_fn(src_str)

            if has_target:
                tag_offset = 1
                tag_tokens = (
                    utils.strip_pad(sample["target"][i, :], src_dict.pad()).int().cpu()
                    - tag_offset
                    + tag_dict.nspecial
                )
                tag_str = tag_dict.string(tag_tokens)
                tag_str = decode_fn(tag_str)

            if not cfg.common_eval.quiet:
                if src_dict is not None:
                    print("S-{}\t{}".format(sample_id, src_str), file=output_file)
                if has_target:
                    print("T-{}\t{}".format(sample_id, tag_str), file=output_file)
                print("H-{}\t{}".format(sample_id, hypo_str), file=output_file)

        wps_meter.update(num_generated_tokens)
        progress.log({"wps": round(wps_meter.avg)})
        num_sentences += (
            sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
        )

    logger.info("NOTE: hypothesis and token scores are output in base 2")
    logger.info(
        "Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format(
            num_sentences,
            gen_timer.n,
            gen_timer.sum,
            num_sentences / gen_timer.sum,
            1.0 / gen_timer.avg,
        )
    )
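# --- Illustrative sketch (not part of the original script) ---
# Both the model's predicted tag indices and the stored targets above live in a
# different index space than the tag dictionary: fairseq dictionaries reserve the
# first `nspecial` ids for special symbols (bos/pad/eos/unk), and the targets here
# additionally carry a +1 offset. The shifts (+ nspecial, and - tag_offset + nspecial)
# map both back into dictionary ids before tag_dict.string() renders label strings.
# The values below (nspecial=4, tag_offset=1) are only illustrative.

def _to_dict_ids(pred_indices, target_ids, nspecial=4, tag_offset=1):
    """Map 0-based predictions and offset targets into dictionary index space."""
    hypo_ids = [p + nspecial for p in pred_indices]            # e.g. 0 -> 4
    tag_ids = [t - tag_offset + nspecial for t in target_ids]  # e.g. 1 -> 4
    return hypo_ids, tag_ids

# _to_dict_ids([0, 2, 1], [1, 3, 2]) == ([4, 6, 5], [4, 6, 5]),
# i.e. a correct prediction and its target land on the same dictionary id.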