def _generate_adversarial_inputs(adv_trainer, args, task, adv_split):
    """Run the adversarial attack over the dataset."""
    # Keep track of the generated sentences.
    # Initialize with empty strings.
    adversarial_sentences = [""] * len(task.dataset(adv_split))

    # Initialize iterator
    itr = create_iterator(args, adv_trainer, task, adv_split)
    num_sentences = 0
    adversarial_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        # Keep more detailed timing when invoked from benchmark
        if "keep_detailed_timing" in args:
            adv_timer = pytorch_translate_utils.BucketStopwatchMeter(
                args.increment, args.max_length, args.samples_per_length
            )
        else:
            adv_timer = StopwatchMeter()

        for attack_info in adversarial_attack_iterator(
            t, adv_trainer, task, adv_split, adv_timer, args.reverse_source
        ):
            if not args.quiet:
                print(f"S-{attack_info.sample_id}\t{attack_info.src_str}")
                print(f"A-{attack_info.sample_id}\t{attack_info.adv_str}")
            # Keep track of everything
            adversarial_sentences[attack_info.sample_id] = attack_info.adv_str
            adversarial_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": attack_info.sample_id,
                        "src_str": attack_info.src_str,
                        "target_str": attack_info.target_str,
                        "adv_str": attack_info.adv_str,
                    }
                )
            )
            wps_meter.update(attack_info.src_tokens.size(0))
            num_sentences += 1
            log_mid_attack_stats(t, adv_trainer)

    # If applicable, save the adversarial inputs to the output file,
    # e.g. for external evaluation.
    if getattr(args, "adversarial_output_file", False):
        with open(args.adversarial_output_file, "w") as out_file:
            for adv_str in adversarial_sentences:
                print(adv_str, file=out_file)

    return num_sentences, adv_timer, adversarial_samples
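
# A minimal usage sketch for _generate_adversarial_inputs above. `setup_attack`
# is a hypothetical stand-in for whatever code builds the task and adversarial
# trainer in this repo, and `args.gen_subset` assumes the usual fairseq-style
# split flag; only the call/return signature is taken from the function itself.
def example_attack_and_save(args):
    task, adv_trainer = setup_attack(args)  # hypothetical setup helper
    num_sentences, adv_timer, adversarial_samples = _generate_adversarial_inputs(
        adv_trainer=adv_trainer, args=args, task=task, adv_split=args.gen_subset
    )
    # Assumes the fairseq StopwatchMeter interface, where .sum is total seconds.
    print(f"| attacked {num_sentences} sentences in {adv_timer.sum:.1f}s")
    return adversarial_samples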
def _generate_score(models, args, dataset, dataset_split, optimize=True):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path)))

    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
            )

    # Initialize generator
    model_weights = None
    if args.model_weights:
        model_weights = [float(w.strip()) for w in args.model_weights.split(",")]
    use_char_source = isinstance(models[0], char_source_model.CharSourceModel)
    # Use a different sequence generator in the multisource setting
    if getattr(args, "source_ensembling", False):
        translator_class = multisource_decode.MultiSourceSequenceGenerator
    else:
        translator_class = beam_decode.SequenceGenerator
    translator = translator_class(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.length_penalty,
        unk_reward=args.unk_reward,
        word_reward=args.word_reward,
        model_weights=model_weights,
        use_char_source=use_char_source,
    )
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations.
    # Initialize with empty translations and zero probability scores.
    translated_sentences = [""] * len(dataset.splits[dataset_split])
    translated_scores = [0.0] * len(dataset.splits[dataset_split])

    # Generate and compute BLEU score
    scorer = bleu.Scorer(
        dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk()
    )
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        dataset_split,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=(args.skip_invalid_size_inputs_valid_test),
    )
    if args.num_shards > 1:
        if args.shard_id < 0 or args.shard_id >= args.num_shards:
            raise ValueError("--shard-id must be between 0 and num_shards")
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        # Keep more detailed timing when invoked from benchmark
        if "keep_detailed_timing" in args:
            gen_timer = pytorch_translate_utils.BucketStopwatchMeter(
                args.increment, args.max_length, args.samples_per_length
            )
        else:
            gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1 if pytorch_translate_data.is_multilingual(args) else 0,
        )
        if pytorch_translate_data.is_multilingual(args):
            first_best_translations = _iter_first_best_multilingual
        else:
            first_best_translations = _iter_first_best_bilingual
        for trans_info in first_best_translations(
            args, dataset, dataset_split, translations, align_dict
        ):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id,
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file,
    # e.g. for external evaluation.
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    return scorer, num_sentences, gen_timer, translation_samples
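
# A hedged sketch of consuming the values returned by _generate_score above,
# assuming fairseq's bleu.Scorer (result_string()) and StopwatchMeter
# (n / sum / avg) interfaces. It mirrors the summary line fairseq's generate.py
# prints, but is illustrative rather than code from this repo.
def example_report_score(scorer, num_sentences, gen_timer):
    print(
        "| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)".format(
            num_sentences, gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg
        )
    )
    print("| BLEU: {}".format(scorer.result_string()))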
def _generate_score(models, args, task, dataset_split, optimize=True):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path.split(":"))))

    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                need_attn=True,
            )

    translator = build_sequence_generator(args, task, models)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations.
    # Initialize with empty translations and zero probability scores.
    translated_sentences = [""] * len(task.dataset(dataset_split))
    translated_scores = [0.0] * len(task.dataset(dataset_split))

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    itr = get_eval_itr(args, models, task, dataset_split)

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        # Keep more detailed timing when invoked from benchmark
        if "keep_detailed_timing" in args:
            gen_timer = pytorch_translate_utils.BucketStopwatchMeter(
                args.increment, args.max_length, args.samples_per_length
            )
        else:
            gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1 if pytorch_translate_data.is_multilingual(args) else 0,
        )
        if pytorch_translate_data.is_multilingual(args):
            first_best_translations = _iter_first_best_multilingual
        else:
            first_best_translations = _iter_first_best_bilingual
        for trans_info in first_best_translations(
            args, task, dataset_split, translations, align_dict
        ):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id,
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file,
    # e.g. for external evaluation.
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    return scorer, num_sentences, gen_timer, translation_samples
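
# An illustrative helper for persisting the translation_samples list of
# OrderedDicts built above as JSON for offline inspection. The helper name and
# default output path are assumptions, not part of this repo.
def example_dump_translation_samples(translation_samples, path="translation_samples.json"):
    import json  # local import to keep the sketch self-contained

    with open(path, "w") as f:
        json.dump(translation_samples, f, ensure_ascii=False, indent=2)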
def _generate_score(models, args, dataset, dataset_split):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path)))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
        )

    # Initialize generator
    model_weights = None
    if args.model_weights:
        model_weights = [float(w.strip()) for w in args.model_weights.split(",")]
    use_char_source = isinstance(models[0], char_source_model.CharSourceModel)
    translator = beam_decode.SequenceGenerator(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        word_reward=args.word_reward,
        model_weights=model_weights,
        use_char_source=use_char_source,
    )
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations.
    # Initialize with empty translations.
    translated_sentences = [""] * len(dataset.splits[dataset_split])

    # Generate and compute BLEU score
    scorer = bleu.Scorer(
        dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk()
    )
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        dataset_split,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=(args.skip_invalid_size_inputs_valid_test),
    )
    if args.num_shards > 1:
        if args.shard_id < 0 or args.shard_id >= args.num_shards:
            raise ValueError("--shard-id must be between 0 and num_shards")
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        # Keep more detailed timing when invoked from benchmark
        if "keep_detailed_timing" in args:
            gen_timer = pytorch_translate_utils.BucketStopwatchMeter(
                args.increment, args.max_length, args.samples_per_length
            )
        else:
            gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
        )
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[dataset_split].src.get_original_text(sample_id)
                target_str = dataset.splits[dataset_split].dst.get_original_text(
                    sample_id
                )
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(
                    target_tokens, args.remove_bpe, escape_unk=True
                )

            if not args.quiet:
                print(f"S-{sample_id}\t{src_str}")
                print(f"T-{sample_id}\t{target_str}")

            # Process top predictions
            for i, hypo in enumerate(hypos[: min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo["tokens"].int().cpu(),
                    src_str=src_str,
                    alignment=hypo["alignment"].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    print(f"H-{sample_id}\t{hypo['score']}\t{hypo_str}")
                    print(
                        "A-{}\t{}".format(
                            sample_id,
                            " ".join(map(lambda x: str(utils.item(x)), alignment)),
                        )
                    )

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement
                        # and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, dataset.dst_dict, add_if_not_exist=True
                        )
                    scorer.add(target_tokens, hypo_tokens)
                    translated_sentences[sample_id] = hypo_str

            wps_meter.update(src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file,
    # e.g. for external evaluation.
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    return scorer, num_sentences, gen_timer
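
# The --num-shards/--shard-id handling above splits the evaluation iterator
# across workers round-robin. data.sharded_iterator itself lives in fairseq;
# this toy re-implementation only illustrates the behavior and is not used by
# the code above.
def example_sharded_iterator(iterable, num_shards, shard_id):
    for i, item in enumerate(iterable):
        if i % num_shards == shard_id:
            yield item

# e.g. list(example_sharded_iterator(range(10), num_shards=3, shard_id=1)) == [1, 4, 7]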