def generate_main(data_dir, extra_flags=None):
    generate_parser = options.get_generation_parser()
    generate_args = options.parse_args_and_arch(
        generate_parser,
        [
            data_dir,
            '--path', os.path.join(data_dir, 'checkpoint_last.pt'),
            '--beam', '3',
            '--batch-size', '64',
            '--max-len-b', '5',
            '--gen-subset', 'valid',
            '--no-progress-bar',
            '--print-alignment',
        ] + (extra_flags or []),
    )

    # evaluate model in batch mode
    generate.main(generate_args)

    # evaluate model interactively
    generate_args.buffer_size = 0
    generate_args.max_sentences = None
    orig_stdin = sys.stdin
    sys.stdin = StringIO('h e l l o\n')
    interactive.main(generate_args)
    sys.stdin = orig_stdin
                ))

                if args.print_alignment:
                    print('A-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(lambda x: str(utils.item(x)), alignment))
                    ))

                # Score only the top hypothesis
                if has_target and i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, tgt_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))


if __name__ == '__main__':
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
def cli_main():
    parser = options.get_generation_parser()
    parser = add_asr_eval_argument(parser)
    args = options.parse_args_and_arch(parser)
    main(args)
from fairseq import data, options, tasks, utils

# Parse command-line arguments for generation
parser = options.get_generation_parser(default_task='len_pre_transformer')
args = options.parse_args_and_arch(parser)

# Setup task
task = tasks.setup_task(args)

# Load model
print('| loading model from {}'.format(args.path))
models, _model_args = utils.load_ensemble_for_inference([args.path], task)
model = models[0]

src_path = "/n/home05/simonx/scratchlfs/zhenyu/length_predic_transforemr/diff_len/data-process/test.de-en.de"
tgt_path = "/n/home05/simonx/scratchlfs/zhenyu/length_predic_transforemr/diff_len/data-process/test.de-en.en"
src_r = open(src_path, 'r', encoding='UTF-8')
tgt_r = open(tgt_path, 'r', encoding='UTF-8')
src_text = src_r.readline()
tgt_text = tgt_r.readline().strip()

total_num = 0
acc = 0
while total_num < 100:
    sentence = src_text

    # Tokenize into characters
    chars = ' '.join(list(sentence.strip()))
    tokens = task.source_dictionary.encode_line(
        chars, add_if_not_exist=False,
def make_parser():
    parser = options.get_generation_parser()
    parser = add_asr_eval_argument(parser)
    return parser
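# Usage sketch (illustrative, not part of the original source): make_parser()
# feeds options.parse_args_and_arch, which resolves model/task-specific args
# before decoding. The data path and checkpoint below are hypothetical
# placeholders, and main(args) is assumed to be defined alongside make_parser().
if __name__ == '__main__':
    parser = make_parser()
    args = options.parse_args_and_arch(
        parser, input_args=['data-bin/asr', '--path', 'checkpoints/checkpoint_best.pt'])
    main(args)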
def score_bw(args):
    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    if args.score_model2 is not None:
        if args.backwards2:
            scorer2_src = args.target_lang
            scorer2_tgt = args.source_lang
        else:
            scorer2_src = args.source_lang
            scorer2_tgt = args.target_lang

    rerank1_is_gen = (
        args.gen_model == args.score_model1 and args.source_prefix_frac is None
    )
    rerank2_is_gen = (
        args.gen_model == args.score_model2 and args.source_prefix_frac is None
    )

    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )

    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    if args.right_to_left1:
        rerank_data1 = right_to_left_preprocessed_dir
    elif args.backwards1:
        rerank_data1 = backwards_preprocessed_dir
    else:
        rerank_data1 = left_to_right_preprocessed_dir

    gen_param = ["--batch-size", str(128), "--score-reference", "--gen-subset", "train"]
    if not rerank1_is_gen and not os.path.isfile(score1_file):
        print("STEP 4: score the translations for model 1")

        model_param1 = [
            "--path", args.score_model1,
            "--source-lang", scorer1_src,
            "--target-lang", scorer1_tgt,
        ]
        gen_model1_param = [rerank_data1] + gen_param + model_param1

        gen_parser = options.get_generation_parser()
        input_args = options.parse_args_and_arch(gen_parser, gen_model1_param)

        with open(score1_file, "w") as f:
            with redirect_stdout(f):
                generate.main(input_args)

    if (
        args.score_model2 is not None
        and not os.path.isfile(score2_file)
        and not rerank2_is_gen
    ):
        print("STEP 4: score the translations for model 2")

        if args.right_to_left2:
            rerank_data2 = right_to_left_preprocessed_dir
        elif args.backwards2:
            rerank_data2 = backwards_preprocessed_dir
        else:
            rerank_data2 = left_to_right_preprocessed_dir

        model_param2 = [
            "--path", args.score_model2,
            "--source-lang", scorer2_src,
            "--target-lang", scorer2_tgt,
        ]
        gen_model2_param = [rerank_data2] + gen_param + model_param2

        gen_parser = options.get_generation_parser()
        input_args = options.parse_args_and_arch(gen_parser, gen_model2_param)

        with open(score2_file, "w") as f:
            with redirect_stdout(f):
                generate.main(input_args)
def cli_main():
    parser = options.get_generation_parser(interactive=False)
    parser.add_argument('--no-print', action='store_true')
    parser.add_argument('--truncate-size', default=512, type=int)
    args = options.parse_args_and_arch(parser)
    distributed_utils.call_main(args, main)
def _fairseq_generate(complex_filepath,
                      output_pred_filepath,
                      checkpoint_paths,
                      complex_dictionary_path,
                      simple_dictionary_path,
                      beam=5,
                      hypothesis_num=1,
                      lenpen=1.,
                      diverse_beam_groups=None,
                      diverse_beam_strength=0.5,
                      sampling=False,
                      batch_size=128):
    # exp_dir must contain checkpoints/checkpoint_best.pt, and dict.{complex,simple}.txt
    # First copy input complex file to exp_dir and create dummy simple file
    tmp_dir = Path(tempfile.mkdtemp())
    new_complex_filepath = tmp_dir / 'tmp.complex-simple.complex'
    dummy_simple_filepath = tmp_dir / 'tmp.complex-simple.simple'
    shutil.copy(complex_filepath, new_complex_filepath)
    shutil.copy(complex_filepath, dummy_simple_filepath)
    shutil.copy(complex_dictionary_path, tmp_dir / 'dict.complex.txt')
    shutil.copy(simple_dictionary_path, tmp_dir / 'dict.simple.txt')

    generate_parser = options.get_generation_parser()
    args = [
        tmp_dir,
        '--path', ':'.join([str(path) for path in checkpoint_paths]),
        '--beam', beam,
        '--nbest', hypothesis_num,
        '--lenpen', lenpen,
        '--diverse-beam-groups', diverse_beam_groups if diverse_beam_groups is not None else -1,
        '--diverse-beam-strength', diverse_beam_strength,
        '--batch-size', batch_size,
        '--raw-text',
        '--print-alignment',
        '--gen-subset', 'tmp',
        # We don't want to reload pretrained embeddings
        '--model-overrides', {
            'encoder_embed_path': None,
            'decoder_embed_path': None
        },
    ]
    if sampling:
        args.extend([
            '--sampling',
            '--sampling-topk', 10,
        ])
    args = [str(arg) for arg in args]
    generate_args = options.parse_args_and_arch(generate_parser, args)
    out_filepath = tmp_dir / 'generation.out'
    with log_stdout(out_filepath, mute_stdout=True):
        # evaluate model in batch mode
        generate.main(generate_args)

    # Retrieve translations
    def parse_all_hypotheses(out_filepath):
        hypotheses_dict = defaultdict(list)
        for line in yield_lines(out_filepath):
            match = re.match(r'^H-(\d+)\t-?\d+\.\d+\t(.*)$', line)
            if match:
                sample_id, hypothesis = match.groups()
                hypotheses_dict[int(sample_id)].append(hypothesis)
        # Sort in original order
        return [hypotheses_dict[i] for i in range(len(hypotheses_dict))]

    all_hypotheses = parse_all_hypotheses(out_filepath)
    predictions = [hypotheses[hypothesis_num - 1] for hypotheses in all_hypotheses]
    write_lines(predictions, output_pred_filepath)
    os.remove(dummy_simple_filepath)
    os.remove(new_complex_filepath)
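# Usage sketch (illustrative, not part of the original source): every path
# below is a hypothetical placeholder; the dictionaries and checkpoint must
# exist as described in the comment at the top of _fairseq_generate.
_fairseq_generate(
    complex_filepath='data/test.complex',
    output_pred_filepath='data/test.pred',
    checkpoint_paths=[Path('exp_dir/checkpoints/checkpoint_best.pt')],
    complex_dictionary_path='exp_dir/dict.complex.txt',
    simple_dictionary_path='exp_dir/dict.simple.txt',
    beam=5,
    hypothesis_num=1,
)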
from torch.utils.data import DataLoader
from tqdm import tqdm
import sacrebleu

logger = logging.getLogger()


def get_symbols_to_strip_from_output(generator):
    if hasattr(generator, 'symbols_to_strip_from_output'):
        return generator.symbols_to_strip_from_output
    else:
        return {generator.eos}


if __name__ == "__main__":
    # Parse command-line arguments for generation
    parser = options.get_generation_parser(default_task="no_context_tag")
    parser.add_argument('--output', default='outputs/pred.txt')
    parser.add_argument('--beam_size', default=5, type=int)
    args = options.parse_args_and_arch(parser)
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task
    task = tasks.setup_task(args)
    task.load_dataset('test')
    dataset = task.datasets['test']
    task.tokenizer = dataset.dictionary.model
    dataloader = DataLoader(dataset, batch_size=1, collate_fn=dataset.collater, shuffle=False)
    dictionary = dataset.dictionary

    # Load model
def __init__(self,
             data_dir,
             checkpoint_path,
             batch_size=25,
             constrained_decoding=False):
    self.constrained_decoding = constrained_decoding
    self.parser = options.get_generation_parser(interactive=True)
    # buffer_size is currently not used but we just initialize it to batch
    # size + 1 to avoid any assertion errors.
    if self.constrained_decoding:
        self.parser.set_defaults(
            path=checkpoint_path,
            remove_bpe="subword_nmt",
            num_workers=-1,
            constraints="ordered",
            batch_size=batch_size,
            buffer_size=batch_size + 1,
        )
    else:
        self.parser.set_defaults(
            path=checkpoint_path,
            remove_bpe="subword_nmt",
            num_workers=-1,
            batch_size=batch_size,
            buffer_size=batch_size + 1,
        )
    args = options.parse_args_and_arch(self.parser, input_args=[data_dir])
    # We are explicitly setting src_lang and tgt_lang here. Generally the
    # data_dir we pass contains {split}-{src_lang}-{tgt_lang}.*.idx files from
    # which fairseq infers the src and tgt langs (if these are not passed).
    # In deployment we don't use any idx files and only store the SRC and TGT
    # dictionaries.
    args.source_lang = "SRC"
    args.target_lang = "TGT"
    args.skip_invalid_size_inputs_valid_test = False
    # We have custom architectures in this folder and we will let fairseq
    # import them.
    args.user_dir = "src/model_configs"
    self.cfg = convert_namespace_to_omegaconf(args)
    utils.import_user_module(self.cfg.common)

    if self.cfg.interactive.buffer_size < 1:
        self.cfg.interactive.buffer_size = 1
    if self.cfg.dataset.max_tokens is None and self.cfg.dataset.batch_size is None:
        self.cfg.dataset.batch_size = 1

    assert (
        not self.cfg.generation.sampling
        or self.cfg.generation.nbest == self.cfg.generation.beam
    ), "--sampling requires --nbest to be equal to --beam"
    assert (
        not self.cfg.dataset.batch_size
        or self.cfg.dataset.batch_size <= self.cfg.interactive.buffer_size
    ), "--batch-size cannot be larger than --buffer-size"

    # Fix seed for stochastic decoding
    # if self.cfg.common.seed is not None and not self.cfg.generation.no_seed_provided:
    #     np.random.seed(self.cfg.common.seed)
    #     utils.set_torch_seed(self.cfg.common.seed)

    # if not self.constrained_decoding:
    #     self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu
    # else:
    #     self.use_cuda = False

    self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu

    # Setup task, e.g., translation
    self.task = tasks.setup_task(self.cfg.task)

    # Load ensemble
    overrides = ast.literal_eval(self.cfg.common_eval.model_overrides)
    self.models, self._model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(self.cfg.common_eval.path),
        arg_overrides=overrides,
        task=self.task,
        suffix=self.cfg.checkpoint.checkpoint_suffix,
        strict=(self.cfg.checkpoint.checkpoint_shard_count == 1),
        num_shards=self.cfg.checkpoint.checkpoint_shard_count,
    )

    # Set dictionaries
    self.src_dict = self.task.source_dictionary
    self.tgt_dict = self.task.target_dictionary

    # Optimize ensemble for generation
    for model in self.models:
        if model is None:
            continue
        if self.cfg.common.fp16:
            model.half()
        if (self.use_cuda
                and not self.cfg.distributed_training.pipeline_model_parallel):
            model.cuda()
        model.prepare_for_inference_(self.cfg)

    # Initialize generator
    self.generator = self.task.build_generator(self.models, self.cfg.generation)

    # Handle tokenization and BPE
    self.tokenizer = self.task.build_tokenizer(self.cfg.tokenizer)
    self.bpe = self.task.build_bpe(self.cfg.bpe)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(self.cfg.generation.replace_unk)

    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in self.models])
def cli_main():
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    scorer = main(args)
    return scorer
def cli_main():
    parser = options.get_generation_parser(
        default_task="speech_recognition_espresso")
    args = options.parse_args_and_arch(parser)
    assert args.results_path is not None, "please specify --results-path"
    main(args)
def main():
    script_parser = argparse.ArgumentParser(
        description='Computes greedy completion, single-token prediction, and corresponding targets.')
    script_parser.add_argument('--data-dir', type=str, required=True)
    script_parser.add_argument('--base-dir', type=str, required=True)
    script_parser.add_argument('--eval-mode', choices=['all', 'completion', 'singletoken'], default='all')
    script_parser.add_argument('--data-prefix-length', type=int, default=50, help='Length of prefix')
    script_parser.add_argument('--batch-size-completions', type=int, default=128)
    script_parser.add_argument('--batch-size-single-prediction', type=int, default=1024)
    script_parser.add_argument('--completion-length', type=int, default=500,
                               help='The length of each generated sequence, not counting the prefix length')
    script_parser.add_argument('--model-path', type=str, required=True,
                               help='The path to the folder with checkpoints')
    script_parser.add_argument('--save-path', type=str, required=True)
    script_parser.add_argument('--ckpt', choices=['best', 'last', 'all', 'step', 'epoch'], default='best')
    script_parser.add_argument('--ckpt-step', type=str, default=None)
    script_parser.add_argument('--ckpt-epoch', type=str, default=None)
    script_parser.add_argument('--data-split', choices=['train', 'valid', 'test'], default='valid')
    script_parser.add_argument('--num-samples', type=int, default=-1)
    script_parser.add_argument('--beam-size', type=int, default=1)
    script_parser.add_argument('--beam-ngram-block', type=int, default=0)
    script_parser.add_argument('--topp', type=float, default=0.0)
    script_parser.add_argument('--topk', type=int, default=1)
    script_parser.add_argument('--singletoken-topk', type=int, default=1)
    script_parser.add_argument('--singletoken-topp', type=float, default=0.0)
    high_level_args = script_parser.parse_args()

    if high_level_args.ckpt == 'last':
        checkpoints = glob(os.path.join(high_level_args.model_path, 'checkpoint_last.pt'))
    elif high_level_args.ckpt == 'best':
        checkpoints = glob(os.path.join(high_level_args.model_path, 'checkpoint_best.pt'))
    elif high_level_args.ckpt == 'step':
        checkpoints = glob(os.path.join(
            high_level_args.model_path,
            'checkpoint_*_{}.pt'.format(high_level_args.ckpt_step)))
    elif high_level_args.ckpt == 'epoch':
        checkpoints = glob(os.path.join(
            high_level_args.model_path,
            'checkpoint{}.pt'.format(high_level_args.ckpt_epoch)))
    elif high_level_args.ckpt == 'all':
        checkpoints = glob(os.path.join(high_level_args.model_path, 'checkpoint*'))

    print("Evaluating {} checkpoints.".format(len(checkpoints)))

    for i, checkpoint in enumerate(checkpoints):
        if high_level_args.eval_mode in ['all', 'completion']:
            num_tokens = high_level_args.data_prefix_length * high_level_args.batch_size_completions
            FAIRSEQ_OPTS = "--data {} \
                --task language_modeling_with_generation \
                --path {} \
                --tokens-per-sample {} \
                --max-tokens {} \
                --sample-break-mode none \
                --gen-subset {} \
                --user-dir {}".format(
                high_level_args.data_dir, checkpoint, num_tokens, num_tokens,
                high_level_args.data_split,
                os.path.join(high_level_args.base_dir, 'fairseq/custom'))
            sys.argv = shlex.split(FAIRSEQ_OPTS)
            parser = options.get_generation_parser()
            args = options.parse_args_and_arch(parser)
            args.add_bos_token = False
            args.skip_invalid_size_inputs_valid_test = False

            task, model, generator, itr, step = evaluate_utils.load(args)
            task.dictionary.eos_index = len(task.dictionary) - 1
            task.dictionary.eos_word = task.dictionary.symbols[-1]

            fairseq_generator = sequence_generator.SequenceGenerator(
                tgt_dict=task.dictionary,
                beam_size=high_level_args.beam_size,
                no_repeat_ngram_size=high_level_args.beam_ngram_block,
                max_len_b=high_level_args.completion_length + high_level_args.data_prefix_length,
            )

            filename_suffix = '_{}__st_{}__spl_{}__pfx_{}__cmpl_{}__bs_cmpl_{}__bs_sprd_{}__bms_{}__bnb_{}__tpk_{}__tpp_{}__sttpk_{}__sttpp_{}__ckst_{}__ckep_{}__ckpt_{}'.format(
                os.path.basename(os.path.normpath(high_level_args.model_path)),
                step, high_level_args.data_split,
                high_level_args.data_prefix_length,
                high_level_args.completion_length,
                high_level_args.batch_size_completions,
                high_level_args.batch_size_single_prediction,
                high_level_args.beam_size, high_level_args.beam_ngram_block,
                high_level_args.topk, high_level_args.topp,
                high_level_args.singletoken_topk, high_level_args.singletoken_topp,
                high_level_args.ckpt_step, high_level_args.ckpt_epoch,
                high_level_args.ckpt)

            completions, gen_metrics, actual_metrics = evaluate_utils.generate_completions(
                model, generator, fairseq_generator, itr,
                high_level_args.data_prefix_length,
                high_level_args.completion_length,
                topk=high_level_args.topk,
                beam_size=high_level_args.beam_size,
                num_samples=high_level_args.num_samples,
                topp=high_level_args.topp)

            completion_tokens = [[task.dictionary[i] for i in sample] for sample in completions]
            completion_text = [' '.join(ts) for ts in completion_tokens]

            # dump generation to text file
            completion_output_filename = os.path.join(
                high_level_args.save_path,
                'completions_{}.txt'.format(filename_suffix))
            with open(completion_output_filename, 'w') as f:
                for line in completion_text:
                    f.write(line)
                    f.write('\n')
            print("\tcompletions output file: %s" % completion_output_filename)

        if high_level_args.eval_mode in ['all', 'singletoken']:
            num_tokens = high_level_args.batch_size_single_prediction
            FAIRSEQ_OPTS = "--data {} \
                --task language_modeling_with_generation \
                --path {} \
                --tokens-per-sample {} \
                --max-tokens {} \
                --sample-break-mode none \
                --gen-subset {} \
                --user-dir {}".format(
                high_level_args.data_dir, checkpoint, num_tokens, num_tokens,
                high_level_args.data_split,
                os.path.join(high_level_args.base_dir, 'fairseq/custom'))
            sys.argv = shlex.split(FAIRSEQ_OPTS)
            parser = options.get_generation_parser()
            args = options.parse_args_and_arch(parser)
            args.add_bos_token = False
            args.skip_invalid_size_inputs_valid_test = False

            task, model, generator, itr, step = evaluate_utils.load(args)

            single_predicted_tokens, target_tokens, metrics = evaluate_utils.eval_single_token_prediction(
                model, itr, task.target_dictionary,
                singletoken_topk=high_level_args.singletoken_topk,
                singletoken_topp=high_level_args.singletoken_topp)

            subset_metrics = {}
            subset_data = high_level_args.data_split
            for metric_name, value in metrics.items():
                subset_metrics[f'{subset_data}/{metric_name}'] = value
            subset_metrics['checkpoint_step'] = step

            filename_suffix = '_{}__st_{}__spl_{}__pfx_{}__cmpl_{}__bs_cmpl_{}__bs_sprd_{}__bms_{}__bnb_{}__tpk_{}__tpp_{}__sttpk_{}__sttpp_{}__ckst_{}__ckep_{}__ckpt_{}'.format(
                os.path.basename(os.path.normpath(high_level_args.model_path)),
                step, high_level_args.data_split,
                high_level_args.data_prefix_length,
                high_level_args.completion_length,
                high_level_args.batch_size_completions,
                high_level_args.batch_size_single_prediction,
                high_level_args.beam_size, high_level_args.beam_ngram_block,
                high_level_args.topk, high_level_args.topp,
                high_level_args.singletoken_topk, high_level_args.singletoken_topp,
                high_level_args.ckpt_step, high_level_args.ckpt_epoch,
                high_level_args.ckpt)

            single_token_predictions_filename = os.path.join(
                high_level_args.save_path,
                "single_token_predictions_{}.txt".format(filename_suffix))
            pkl_filename = os.path.join(
                high_level_args.save_path,
                "metrics_{}.pkl".format(filename_suffix))
            pickle.dump(subset_metrics, open(pkl_filename, 'wb'))

            with open(single_token_predictions_filename, 'w') as f:
                for single_predicted_tokens_sublist in single_predicted_tokens:
                    _single_token_text = [
                        task.dictionary[i] for i in single_predicted_tokens_sublist
                    ]
                    f.write(' '.join(_single_token_text))
                    f.write('\n')

            target_filename = os.path.join(
                high_level_args.save_path,
                "targets_{}.txt".format(filename_suffix))
            with open(target_filename, 'w') as f:
                for target_tokens_sublist in target_tokens:
                    _target_text = [
                        task.dictionary[i] for i in target_tokens_sublist
                    ]
                    f.write(' '.join(_target_text))
                    f.write('\n')
def __init__(self,
             data_path="./data/processed",
             checkpoint_path="./checkpoints/zhen_mass_pre-training.pt",
             task='xmasked_seq2seq',
             user_dir='mass',
             s='zh', t='en',
             langs='en,zh',
             mt_steps='zh-en',
             source_langs='zh',
             target_langs='en',
             beam=5,
             use_cuda=1):
    self.parser = options.get_generation_parser(interactive=True)
    self.parser.set_defaults(path=checkpoint_path, task=task, user_dir=user_dir,
                             s=s, t=t,
                             source_langs=source_langs, target_langs=target_langs,
                             langs=langs, mt_steps=mt_steps, beam=beam)
    self.use_cuda = use_cuda

    self.args = options.parse_args_and_arch(self.parser, input_args=[data_path])
    self.args.user_dir = user_dir
    self.args.s = s
    self.args.t = t
    self.args.langs = langs
    self.args.mt_steps = mt_steps
    self.args.source_langs = source_langs
    self.args.target_langs = target_langs
    self.args.remove_bpe = '@@ '
    # self.args, _ = self.parser.parse_known_args([data_path])

    utils.import_user_module(self.args)

    if self.args.buffer_size < 1:
        self.args.buffer_size = 1
    if self.args.max_tokens is None and self.args.max_sentences is None:
        self.args.max_sentences = 1

    assert not self.args.sampling or self.args.nbest == self.args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(self.args)
    # self.use_cuda = torch.cuda.is_available() and not self.args.cpu

    # Setup task, e.g., translation
    self.task = tasks.setup_task(self.args)

    # Load ensemble
    print('| loading model(s) from {}'.format(self.args.path))
    self.models, self._model_args = checkpoint_utils.load_model_ensemble(
        self.args.path.split(':'),
        arg_overrides=eval(self.args.model_overrides),
        task=self.task,
    )

    # Set dictionaries
    self.src_dict = self.task.source_dictionary
    self.tgt_dict = self.task.target_dictionary

    # Optimize ensemble for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=self.args.print_alignment,
        )
        if self.args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()

    # Initialize generator
    self.generator = self.task.build_generator(self.args)

    # Hack to support GPT-2 BPE
    if self.args.remove_bpe == 'gpt2':
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        self.decoder = get_encoder(
            'fairseq/gpt2_bpe/encoder.json',
            'fairseq/gpt2_bpe/vocab.bpe',
        )
        self.encode_fn = lambda x: ' '.join(map(str, self.decoder.encode(x)))
    else:
        self.decoder = None
        self.encode_fn = lambda x: x

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(self.args.replace_unk)

    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in self.models])

    if self.args.buffer_size > 1:
        print('| Sentence buffer size:', self.args.buffer_size)
def cli_main():
    parser = options.get_generation_parser()
    parser.add_argument("--print-attention", action="store_true",
                        help="print attention matrix as jsonline")
    args = options.parse_args_and_arch(parser)
    main(args)
def sari_validate(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask,
                  epoch_itr, subsets: List[str]) -> List[Optional[float]]:
    from pathlib import Path
    from access.resources.paths import get_data_filepath
    from access.utils.helpers import read_lines
    from access.preprocessors import load_preprocessors, ComposedPreprocessor
    from easse.report import get_all_scores
    from fairseq.data import encoders
    from fairseq_cli.interactive import buffered_read, make_batches
    from fairseq_cli.generate import get_symbols_to_strip_from_output
    from fairseq.token_generation_constraints import pack_constraints, unpack_constraints
    import tempfile

    use_cuda = torch.cuda.is_available() and not cfg.common.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(cfg.task)

    # TODO: Choose parameters for the preprocessors?
    # Read the preprocessors from a pickle file
    # preprocessors = load_preprocessors(Path(cfg.task.data).parent)
    # composed_preprocessor = ComposedPreprocessor(preprocessors)
    # Get the path to turkcorpus.valid.complex
    complex_filepath = get_data_filepath('turkcorpus', 'valid', 'complex')
    # make temp dir
    # encoded_complex_filepath = tempfile.mkstemp()[1]
    # encoded_pred_filepath = tempfile.mkstemp()[1]
    pred_filepath = tempfile.mkstemp()[1]
    # use preprocessors to encode complex file
    # composed_preprocessor.encode_file(complex_filepath, encoded_complex_filepath)

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        trainer.get_model().max_positions(),
    )
    parser = options.get_generation_parser(interactive=True)
    # TODO: Take args from fairseq_generate
    gen_args = options.parse_args_and_arch(
        parser, input_args=['/dummy_data', '--beam', '2'])

    # Initialize generator
    generator = task.build_generator([trainer.model], gen_args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(cfg.tokenizer)
    bpe = encoders.build_bpe(cfg.bpe)

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    align_dict = utils.load_align_dict(cfg.generation.replace_unk)

    with open(pred_filepath, 'w') as f:
        start_id = 0
        for inputs in buffered_read(complex_filepath, buffer_size=9999):
            results = []
            for batch in make_batches(inputs, cfg, task, max_positions, encode_fn):
                bsz = batch.src_tokens.size(0)
                src_tokens = batch.src_tokens
                src_lengths = batch.src_lengths
                constraints = batch.constraints
                if use_cuda:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()
                    if constraints is not None:
                        constraints = constraints.cuda()

                sample = {
                    "net_input": {
                        "src_tokens": src_tokens,
                        "src_lengths": src_lengths,
                    },
                }
                translations = task.inference_step(
                    generator, [trainer.model], sample, constraints=constraints)
                list_constraints = [[] for _ in range(bsz)]
                if cfg.generation.constraints:
                    list_constraints = [unpack_constraints(c) for c in constraints]
                for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
                    src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
                    constraints = list_constraints[i]
                    results.append((
                        start_id + id,
                        src_tokens_i,
                        hypos,
                        {
                            "constraints": constraints,
                        },
                    ))

            # sort output to match input order
            for id_, src_tokens, hypos, info in sorted(results, key=lambda x: x[0]):
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, cfg.common_eval.post_process)
                    for constraint in info["constraints"]:
                        pass

                # Process top predictions
                for hypo in hypos[:min(len(hypos), cfg.generation.nbest)]:
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo["tokens"].int().cpu(),
                        src_str=src_str,
                        alignment=hypo["alignment"],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=cfg.common_eval.post_process,
                        extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
                    )
                    # detokenized hypothesis
                    detok_hypo_str = decode_fn(hypo_str)
                    f.write(f'{detok_hypo_str}\n')
                    if cfg.generation.print_alignment:
                        alignment_str = " ".join(
                            ["{}-{}".format(src, tgt) for src, tgt in alignment])

            # update running id_ counter
            start_id += len(inputs)

    # composed_preprocessor.decode_file(encoded_pred_filepath, pred_filepath)
    ref_filepaths = [
        get_data_filepath('turkcorpus', 'valid', 'simple.turk', i)
        for i in range(8)
    ]
    scores = get_all_scores(
        read_lines(complex_filepath), read_lines(pred_filepath),
        [read_lines(ref_filepath) for ref_filepath in ref_filepaths])
    print(f'num_updates={trainer.get_num_updates()}')
    print(f'ts_scores={scores}')
    sari = scores['SARI']
    if not hasattr(trainer, 'best_sari'):
        trainer.best_sari = 0
    if not hasattr(trainer, 'n_validations_since_best'):
        trainer.n_validations_since_best = 0
    if sari > trainer.best_sari:
        trainer.best_sari = sari
        trainer.n_validations_since_best = 0
    else:
        trainer.n_validations_since_best += 1
        print(f'SARI did not improve for {trainer.n_validations_since_best} validations')
        # Does not work because the scheduler will reset it to the previous value every time
        # trainer.optimizer.set_lr(0.75 * trainer.optimizer.get_lr())
        if trainer.n_validations_since_best >= cfg.validations_before_sari_early_stopping:
            print(f'Early stopping because SARI did not improve for '
                  f'{trainer.n_validations_since_best} validations')
            trainer.early_stopping = True

    def is_abort(epoch_itr, best_sari):
        if epoch_itr.epoch >= 2 and best_sari < 19:
            return True
        if epoch_itr.epoch >= 5 and best_sari < 22:
            return True
        if epoch_itr.epoch >= 10 and best_sari < 25:
            return True
        return False

    # if is_abort(epoch_itr, best_sari):
    #     print(f'Early stopping because best SARI is too low ({best_sari:.2f}) after {epoch_itr.epoch} epochs.')
    #     # Remove the checkpoint directory as we got nothing interesting
    #     shutil.rmtree(args.save_dir)
    #     # TODO: Abort

    return [-sari]
def cli_main():
    parser = options.get_generation_parser(interactive=True)
    parser.add_argument("--output-file", type=str, required=True)
    args = options.parse_args_and_arch(parser)
    main(args)
def __init__(self, data_path, checkpoint_path, beam, nbest):
    self.parser = options.get_generation_parser(interactive=True)
    self.parser.set_defaults(path=checkpoint_path,
                             remove_bpe=None,
                             dataset_impl="lazy",
                             num_workers=5)
    self.args = options.parse_args_and_arch(self.parser, input_args=[data_path])
    self.args.beam = beam
    self.args.nbest = nbest
    utils.import_user_module(self.args)

    if self.args.buffer_size < 1:
        self.args.buffer_size = 1
    if self.args.max_tokens is None and self.args.max_sentences is None:
        self.args.max_sentences = 1

    assert not self.args.sampling or self.args.nbest == self.args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    self.use_cuda = torch.cuda.is_available() and not self.args.cpu
    self.task = tasks.setup_task(self.args)
    self.models, self._model_args = checkpoint_utils.load_model_ensemble(
        self.args.path.split(':'),
        arg_overrides=eval(self.args.model_overrides),
        task=self.task,
    )
    self.src_dict = self.task.source_dictionary
    self.tgt_dict = self.task.target_dictionary

    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=self.args.print_alignment,
        )
        if self.args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()

    self.generator = self.task.build_generator(self.args)

    if self.args.remove_bpe == 'gpt2':
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        self.decoder = get_encoder(
            'fairseq/gpt2_bpe/encoder.json',
            'fairseq/gpt2_bpe/vocab.bpe',
        )
        self.encode_fn = lambda x: ' '.join(map(str, self.decoder.encode(x)))
    else:
        self.decoder = None
        self.encode_fn = lambda x: x

    self.align_dict = utils.load_align_dict(self.args.replace_unk)
    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in self.models])
def gen_and_reprocess_nbest(args):
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert args.right_to_left1 is False, "prefix length not compatible with right to left models"
        assert args.right_to_left2 is False, "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = os.path.join(os.path.dirname(__file__)) + "/rerank_data/" + args.data_dir_name
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
        backwards_preprocessed_dir, lm_preprocessed_dir = \
        rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                     args.gen_model_name, args.shard_id, args.num_shards,
                                     args.sampling, args.prefix_len, args.target_prefix_frac,
                                     args.source_prefix_frac)

    assert not (args.right_to_left1 and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2 and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None and args.target_prefix_frac is not None), \
        "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None
    rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprocessed nbest list for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model1_name,
                                                 target_prefix_frac=args.target_prefix_frac,
                                                 source_prefix_frac=args.source_prefix_frac,
                                                 backwards=args.backwards1)
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model2_name,
                                                     target_prefix_frac=args.target_prefix_frac,
                                                     source_prefix_frac=args.source_prefix_frac,
                                                     backwards=args.backwards2)

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list
    else:
        if not os.path.isfile(predictions_bpe_file):
            print("STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            param1 = [args.data,
                      "--path", args.gen_model,
                      "--shard-id", str(args.shard_id),
                      "--num-shards", str(args.num_shards),
                      "--nbest", str(args.num_rescore),
                      "--batch-size", str(args.batch_size),
                      "--beam", str(args.num_rescore),
                      "--max-sentences", str(args.num_rescore),
                      "--gen-subset", args.gen_subset,
                      "--source-lang", args.source_lang,
                      "--target-lang", args.target_lang]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            with open(predictions_bpe_file, 'w') as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file,
                                                  bpe_symbol=args.remove_bpe,
                                                  nbest=using_nbest,
                                                  prefix_len=args.prefix_len,
                                                  target_prefix_frac=args.target_prefix_frac)

    if args.diff_bpe:
        rerank_utils.write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                                       gen_output.no_bpe_target,
                                       pre_gen + "/source_gen_bpe." + args.source_lang,
                                       pre_gen + "/target_gen_bpe." + args.target_lang,
                                       pre_gen + "/reference_gen_bpe." + args.target_lang)
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = ["-c", bitext_bpe,
                         "--input", pre_gen + "/source_gen_bpe." + args.source_lang,
                         "--output", pre_gen + "/rescore_data." + args.source_lang]
        bpe_tgt_param = ["-c", bitext_bpe,
                         "--input", pre_gen + "/target_gen_bpe." + args.target_lang,
                         "--output", pre_gen + "/rescore_data." + args.target_lang]

        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param,
                        shell=False)
        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_tgt_param,
                        shell=False)

    if (not os.path.isfile(score1_file) and not rerank1_is_gen) or \
            (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen):
        print("STEP 2: process the output of generate.py so we have clean text files with the translations")

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = rescore_file + "target_prefix_frac" + str(args.target_prefix_frac)
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = rescore_file + "source_prefix_frac" + str(args.source_prefix_frac)

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                               pre_gen + rescore_file + "." + args.source_lang,
                                               pre_gen + rescore_file + "." + args.target_lang,
                                               pre_gen + "/reference_file", bpe_symbol=args.remove_bpe)
            if args.prefix_len is not None:
                bw_rescore_file = prefix_len_rescore_file
                rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                               pre_gen + prefix_len_rescore_file + "." + args.source_lang,
                                               pre_gen + prefix_len_rescore_file + "." + args.target_lang,
                                               pre_gen + "/reference_file", prefix_len=args.prefix_len,
                                               bpe_symbol=args.remove_bpe)
            elif args.target_prefix_frac is not None:
                bw_rescore_file = target_prefix_frac_rescore_file
                rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                               pre_gen + target_prefix_frac_rescore_file + "." + args.source_lang,
                                               pre_gen + target_prefix_frac_rescore_file + "." + args.target_lang,
                                               pre_gen + "/reference_file", bpe_symbol=args.remove_bpe,
                                               target_prefix_frac=args.target_prefix_frac)
            else:
                bw_rescore_file = rescore_file

            if args.source_prefix_frac is not None:
                fw_rescore_file = source_prefix_frac_rescore_file
                rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                               pre_gen + source_prefix_frac_rescore_file + "." + args.source_lang,
                                               pre_gen + source_prefix_frac_rescore_file + "." + args.target_lang,
                                               pre_gen + "/reference_file", bpe_symbol=args.remove_bpe,
                                               source_prefix_frac=args.source_prefix_frac)
            else:
                fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                           pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                                           pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                                           pre_gen + "/right_to_left_reference_file",
                                           right_to_left=True, bpe_symbol=args.remove_bpe)

        print("STEP 3: binarize the translations")
        if not args.right_to_left1 or args.score_model2 is not None and not args.right_to_left2 or not rerank1_is_gen:
            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = ["--source-lang", scorer1_src,
                                       "--target-lang", scorer1_tgt,
                                       "--trainpref", pre_gen + bw_rescore_file,
                                       "--srcdict", bw_dict + "/dict." + scorer1_src + ".txt",
                                       "--tgtdict", bw_dict + "/dict." + scorer1_tgt + ".txt",
                                       "--destdir", backwards_preprocessed_dir]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = ["--source-lang", scorer1_src,
                                "--target-lang", scorer1_tgt,
                                "--trainpref", pre_gen + fw_rescore_file,
                                "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                                "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                                "--destdir", left_to_right_preprocessed_dir]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = ["--source-lang", scorer1_src,
                                "--target-lang", scorer1_tgt,
                                "--trainpref", pre_gen + "/right_to_left_rescore_data",
                                "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                                "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                                "--destdir", right_to_left_preprocessed_dir]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
def load_model():
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    return load_model1(args)
            scores_per_target_length[len(target)].append(local_score)

    print(f"Localism {np.mean(predictions_equal)}")

    with open("trace_localism.txt", 'w') as f:
        for pairs, score in all_pairs:
            f.write("------------------------------\n")
            for s, t in pairs:
                f.write("{} -> {}\n".format(s, t))
            f.write("{}".format(score))

    # # UNCOMMENT TO COMPUTE LOCALISM SCORE PER INPUT / TARGET LENGTH
    # for key in sorted(list(scores_per_input_length.keys())):
    #     scores = scores_per_input_length[key]
    #     print(f"Input length {key}, {len(scores)}, localism score {sum(scores)/len(scores)}.")
    # for key in sorted(list(scores_per_target_length.keys())):
    #     scores = scores_per_target_length[key]
    #     print(f"Target length {key}, localism score {sum(scores) / len(scores)}.")


if __name__ == '__main__':
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)

    Batch = namedtuple('Batch', 'srcs tokens lengths')
    Translation = namedtuple('Translation', 'src_str hypos pos_scores alignments')

    main(args)
from fairseq import checkpoint_utils, data, options, tasks

# Parse command-line arguments for generation
parser = options.get_generation_parser(default_task='simple_classification')
args = options.parse_args_and_arch(parser)

# Setup task
task = tasks.setup_task(args)

# Load model
print('| loading model from {}'.format(
    '/Users/ashisharora/espresso/fairseq_cli/checkpoints/checkpoint_best.pt'))
models, _model_args = checkpoint_utils.load_model_ensemble(
    ['/Users/ashisharora/espresso/fairseq_cli/checkpoints/checkpoint_best.pt'],
    task=task)
model = models[0]

# while True:
#     sentence = input('\nInput: ')
#     chars = ' '.join(list(sentence.strip()))
#     tokens = task.source_dictionary.encode_line(
#         chars, add_if_not_exist=False,
#     )

chars = 'W i l l i a m'
tokens = task.source_dictionary.encode_line(
    chars, add_if_not_exist=False,
)

chars = 'W i l l i a m'
tokens2 = task.source_dictionary.encode_line(
    chars, add_if_not_exist=False,
        wps_meter.update(src_tokens.size(0))
        t.log({'wps': round(wps_meter.avg)})
        num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))

    # output the result
    return result


if __name__ == '__main__':
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    print(args.path)
    result = main(args)

    ref_path = "result/lts.s2s.test.s"
    reference = read_sequences(ref_path)
    inference = [r.strip().split() for r in result]
    compare(reference, inference)

    print('If remove stress:')
    remove_stress(reference)
    remove_stress(inference)
    wer = compare(reference, inference)
    model_name = args.path.split('/')[-2]
from fairseq import checkpoint_utils, data, options, tasks
import torch

# Parse command-line arguments for generation
parser = options.get_generation_parser(default_task='multilingual_translation')
args = options.parse_args_and_arch(parser)

# Setup task
task = tasks.setup_task(args)
for valid_sub_split in args.gen_subset.split(','):
    task.load_dataset(valid_sub_split, combine=True, epoch=0)

# Load model
print('| loading model from {}'.format(args.path))
models, _model_args = checkpoint_utils.load_model_ensemble([args.path], task=task)
model = models[0]

M = model.models['de-en'].encoder.M
N = model.models['de-en'].encoder.N
no_langs = N.size(0)

lang2idx = task.lang2idx
idx2lang = {}
for keys in lang2idx.keys():
    idx2lang[lang2idx[keys]] = keys

lang2idx2idx = model.models['de-en'].encoder.lang2idx2idx
idx2idx2lang = {i.item(): idx for idx, i in enumerate(lang2idx2idx)}
# import pdb; pdb.set_trace()
def cli_main():
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
                print('A-{}\t{}'.format(
                    sample_id,
                    ' '.join(map(lambda x: str(utils.item(x)), alignment))))

            # Score only the top hypothesis
            if has_target and i == 0:
                if align_dict is not None or args.remove_bpe is not None:
                    # Convert back to tokens for evaluation with unk replacement and/or without BPE
                    target_tokens = tokenizer.Tokenizer.tokenize(
                        target_str, tgt_dict, add_if_not_exist=True)
                scorer.add(target_tokens, hypo_tokens)

        wps_meter.update(src_tokens.size(0))
        t.log({'wps': round(wps_meter.avg)})
        num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
          .format(num_sentences, gen_timer.n, gen_timer.sum,
                  num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))


if __name__ == '__main__':
    parser = options.get_generation_parser(style_transfer=True)
    args = options.parse_args_and_arch(parser)
    main(args)
def cli_main():
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    main(args)
    initial_model = os.path.join(args.ckpt_dir, args.initial_model)
    for (idx, file) in enumerate(files):
        if file == initial_model:
            start_idx = idx

    bleu_ptn = r'BLEU4\s=\s([\d\.]+?),'
    for x in range(start_idx, len(files)):
        ckpt_file = files[x]
        args.path = ckpt_file
        # Note: simply calling single_model_main here brings a mysterious memory
        # error, so we shell out to generate.py as a brute-force workaround instead.
        single_model_main(args)
        print('python {}/generate.py {} --path {}'.format(
            args.fairseq_dir, obtain_sys_argv(), ckpt_file))
        pl_process = subprocess.Popen(
            'python {}/generate.py {} --path {}'.format(
                args.fairseq_dir, obtain_sys_argv(), ckpt_file),
            shell=True, stdout=subprocess.PIPE)
        pl_output = pl_process.stdout.read()
        bleu_match = re.search(bleu_ptn, str(pl_output))
        if bleu_match:
            bleu_score = bleu_match.group(1)
            print(ckpt_file, bleu_score)
        sys.stdout.flush()
        time.sleep(15)


if __name__ == '__main__':
    parser = options.get_generation_parser(seq=True)
    args = options.parse_args_and_arch(parser)
    main(args)
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size):
        indices = []
        results = []
        for batch, batch_indices in make_batches(inputs, args, task, max_positions):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments):
                print(hypo)
                print(pos_scores)
                if align is not None:
                    print(align)


if __name__ == '__main__':
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    main(args)
def cli_main():
    parser = options.get_generation_parser(default_task="speech_recognition_hybrid")
    args = options.parse_args_and_arch(parser)
    main(args)
def model_fn(model_dir):
    model_name = 'checkpoint_best.pt'
    model_path = os.path.join(model_dir, model_name)

    logger.info('Loading the model')
    with open(model_path, 'rb') as f:
        model_info = torch.load(f, map_location=torch.device('cpu'))

    # Will be overridden by model_info['args'] - need to keep for pre-trained models
    parser = options.get_generation_parser(interactive=True)
    # get args for FairSeq by converting the hyperparameters as if they were command-line arguments
    argv_copy = copy.deepcopy(sys.argv)
    # remove the modifications we did in the command-line arguments
    sys.argv[1:] = ['--path', model_path, model_dir]
    args = options.parse_args_and_arch(parser)
    # restore previous command-line args
    sys.argv = argv_copy

    saved_args = model_info['args']
    for key, value in vars(saved_args).items():
        setattr(args, key, value)

    args.data = [model_dir]
    print(args)

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info('Current device: {}'.format(device))
    model_paths = [os.path.join(model_dir, model_name)]
    models, model_args = utils.load_ensemble_for_inference(
        model_paths, task, model_arg_overrides={})

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models, tgt_dict,
        beam_size=args.beam,
        minlen=args.min_len,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        sampling_temperature=args.sampling_temperature,
        diverse_beam_groups=args.diverse_beam_groups,
        diverse_beam_strength=args.diverse_beam_strength,
    )

    if device.type == 'cuda':
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    # align_dict = utils.load_align_dict(args.replace_unk)
    align_dict = utils.load_align_dict(None)

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    return dict(
        translator=translator,
        task=task,
        max_positions=max_positions,
        align_dict=align_dict,
        tgt_dict=tgt_dict,
        args=args,
        device=device,
    )
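# Usage sketch (illustrative, not part of the original source): loading the
# bundle the way a serving entry point would. '/opt/ml/model' is the standard
# SageMaker model directory and is used here only as a placeholder; it must
# contain checkpoint_best.pt plus the fairseq dictionaries.
model_bundle = model_fn('/opt/ml/model')
print(model_bundle['args'].beam)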
def initialize(self,
               data_dir=_data_dir,
               model_path=_model,
               user_dir=_user_dir,
               task='xmasked_seq2seq',
               s_lang='en',
               t_lang='zh',
               beam=5,
               cpu=False,
               align_dict=None,
               bpe_codes=_bpe_codes_en,
               tokenizer=True):
    self.parser = options.get_generation_parser(interactive=True)
    self.src, self.tgt = s_lang, t_lang

    # generate args
    input_args = [data_dir, '--path', model_path]
    if cpu:
        input_args.append('--cpu')
    if user_dir:
        input_args.append('--user-dir')
        input_args.append(user_dir)
    if task:
        input_args.append('--task')
        input_args.append(task)
    if align_dict:
        input_args.append('--replace-unk')
        input_args.append(align_dict)
    input_args.append('--langs')
    input_args.append('{},{}'.format(s_lang, t_lang))
    input_args.append('--source-langs')
    input_args.append(s_lang)
    input_args.append('--target-langs')
    input_args.append(t_lang)
    input_args.append('-s')
    input_args.append(s_lang)
    input_args.append('-t')
    input_args.append(t_lang)
    input_args.append('--beam')
    input_args.append(str(beam))
    input_args.append('--remove-bpe')

    self.bpe = BPE(open(bpe_codes, 'r'))
    self.tokenizer = tokenizer

    self.args = options.parse_args_and_arch(self.parser, input_args=input_args)

    # initialize model
    utils.import_user_module(self.args)

    if self.args.buffer_size < 1:
        self.args.buffer_size = 1
    if self.args.max_tokens is None and self.args.max_sentences is None:
        self.args.max_sentences = 1

    assert not self.args.sampling or self.args.nbest == self.args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    self.use_cuda = torch.cuda.is_available() and not self.args.cpu

    # Setup task, e.g., translation
    self.task = tasks.setup_task(self.args)

    # Load ensemble
    self.models, _model_args = checkpoint_utils.load_model_ensemble(
        self.args.path.split(':'),
        arg_overrides=eval(self.args.model_overrides),
        task=self.task,
    )

    # Set dictionaries
    self.src_dict = self.task.source_dictionary
    self.tgt_dict = self.task.target_dictionary

    # Optimize ensemble for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=self.args.print_alignment,
        )
        if self.args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()

    # Initialize generator
    self.generator = self.task.build_generator(self.args)

    def encode_fn(x):
        if tokenizer:
            x = tokenize(x, is_zh=(s_lang == 'zh'))
        if bpe_codes:
            x = self.bpe.process_line(x)
        return x

    # Hack to support GPT-2 BPE
    if self.args.remove_bpe == 'gpt2':
        pass
    else:
        self.decoder = None
        # self.encode_fn = lambda x: x
        self.encode_fn = encode_fn

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(self.args.replace_unk)

    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in self.models])