Example #1
def generate_main(data_dir, extra_flags=None):
    generate_parser = options.get_generation_parser()
    generate_args = options.parse_args_and_arch(
        generate_parser,
        [
            data_dir,
            '--path', os.path.join(data_dir, 'checkpoint_last.pt'),
            '--beam', '3',
            '--batch-size', '64',
            '--max-len-b', '5',
            '--gen-subset', 'valid',
            '--no-progress-bar',
            '--print-alignment',
        ] + (extra_flags or []),
    )

    # evaluate model in batch mode
    generate.main(generate_args)

    # evaluate model interactively
    generate_args.buffer_size = 0
    generate_args.max_sentences = None
    orig_stdin = sys.stdin
    sys.stdin = StringIO('h e l l o\n')
    interactive.main(generate_args)
    sys.stdin = orig_stdin
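A minimal usage sketch (the data directory and the extra flag below are hypothetical; generate_main expects the directory to contain a binarized dataset and checkpoint_last.pt):

if __name__ == '__main__':
    # hypothetical path and flag, for illustration only
    generate_main('data-bin/iwslt14.tokenized.de-en', extra_flags=['--cpu'])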
Example #2
                    ))

                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(lambda x: str(utils.item(x)), alignment))
                        ))

                # Score only the top hypothesis
                if has_target and i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, tgt_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))


if __name__ == '__main__':
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
Example #3
def cli_main():
    parser = options.get_generation_parser()
    parser = add_asr_eval_argument(parser)
    args = options.parse_args_and_arch(parser)
    main(args)
from fairseq import data, options, tasks, utils

# Parse command-line arguments for generation
parser = options.get_generation_parser(default_task='len_pre_transformer')
args = options.parse_args_and_arch(parser)

# Setup task
task = tasks.setup_task(args)

# Load model
print('| loading model from {}'.format(args.path))
models, _model_args = utils.load_ensemble_for_inference([args.path], task)
model = models[0]

src_path = "/n/home05/simonx/scratchlfs/zhenyu/length_predic_transforemr/diff_len/data-process/test.de-en.de"
tgt_path = "/n/home05/simonx/scratchlfs/zhenyu/length_predic_transforemr/diff_len/data-process/test.de-en.en"
src_r = open(src_path,'r',encoding='UTF-8')
tgt_r = open(tgt_path,'r',encoding='UTF-8')

src_text = src_r.readline()
tgt_text = tgt_r.readline().strip()

total_num = 0
acc = 0
while total_num < 100:
    sentence = src_text

    # Tokenize into characters
    chars = ' '.join(list(sentence.strip()))
    tokens = task.source_dictionary.encode_line(
        chars, add_if_not_exist=False,
Example #5
def make_parser():
    parser = options.get_generation_parser()
    parser = add_asr_eval_argument(parser)
    return parser
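A hedged sketch of how make_parser is typically wired into an entry point, mirroring the cli_main pattern used elsewhere on this page (main and add_asr_eval_argument are assumed to be defined in the same module):

def cli_main():
    parser = make_parser()
    args = options.parse_args_and_arch(parser)
    main(args)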
Example #6
def score_bw(args):
    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    if args.score_model2 is not None:
        if args.backwards2:
            scorer2_src = args.target_lang
            scorer2_tgt = args.source_lang
        else:
            scorer2_src = args.source_lang
            scorer2_tgt = args.target_lang

    rerank1_is_gen = (
        args.gen_model == args.score_model1 and args.source_prefix_frac is None
    )
    rerank2_is_gen = (
        args.gen_model == args.score_model2 and args.source_prefix_frac is None
    )

    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )

    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    if args.right_to_left1:
        rerank_data1 = right_to_left_preprocessed_dir
    elif args.backwards1:
        rerank_data1 = backwards_preprocessed_dir
    else:
        rerank_data1 = left_to_right_preprocessed_dir

    gen_param = ["--batch-size", str(128), "--score-reference", "--gen-subset", "train"]
    if not rerank1_is_gen and not os.path.isfile(score1_file):
        print("STEP 4: score the translations for model 1")

        model_param1 = [
            "--path",
            args.score_model1,
            "--source-lang",
            scorer1_src,
            "--target-lang",
            scorer1_tgt,
        ]
        gen_model1_param = [rerank_data1] + gen_param + model_param1

        gen_parser = options.get_generation_parser()
        input_args = options.parse_args_and_arch(gen_parser, gen_model1_param)

        with open(score1_file, "w") as f:
            with redirect_stdout(f):
                generate.main(input_args)

    if (
        args.score_model2 is not None
        and not os.path.isfile(score2_file)
        and not rerank2_is_gen
    ):
        print("STEP 4: score the translations for model 2")

        if args.right_to_left2:
            rerank_data2 = right_to_left_preprocessed_dir
        elif args.backwards2:
            rerank_data2 = backwards_preprocessed_dir
        else:
            rerank_data2 = left_to_right_preprocessed_dir

        model_param2 = [
            "--path",
            args.score_model2,
            "--source-lang",
            scorer2_src,
            "--target-lang",
            scorer2_tgt,
        ]
        gen_model2_param = [rerank_data2] + gen_param + model_param2

        gen_parser = options.get_generation_parser()
        input_args = options.parse_args_and_arch(gen_parser, gen_model2_param)

        with open(score2_file, "w") as f:
            with redirect_stdout(f):
                generate.main(input_args)
Example #7
def cli_main():
    parser = options.get_generation_parser(interactive=False)
    parser.add_argument('--no-print', action='store_true')
    parser.add_argument('--truncate-size', default=512, type=int)
    args = options.parse_args_and_arch(parser)
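    # call_main handles distributed setup and dispatches main(args);
    # with several GPUs configured it is expected to run one worker per device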
    distributed_utils.call_main(args, main)
Example #8
def _fairseq_generate(complex_filepath,
                      output_pred_filepath,
                      checkpoint_paths,
                      complex_dictionary_path,
                      simple_dictionary_path,
                      beam=5,
                      hypothesis_num=1,
                      lenpen=1.,
                      diverse_beam_groups=None,
                      diverse_beam_strength=0.5,
                      sampling=False,
                      batch_size=128):
    # exp_dir must contain checkpoints/checkpoint_best.pt, and dict.{complex,simple}.txt
    # First copy input complex file to exp_dir and create dummy simple file
    tmp_dir = Path(tempfile.mkdtemp())
    new_complex_filepath = tmp_dir / 'tmp.complex-simple.complex'
    dummy_simple_filepath = tmp_dir / 'tmp.complex-simple.simple'
    shutil.copy(complex_filepath, new_complex_filepath)
    shutil.copy(complex_filepath, dummy_simple_filepath)
    shutil.copy(complex_dictionary_path, tmp_dir / 'dict.complex.txt')
    shutil.copy(simple_dictionary_path, tmp_dir / 'dict.simple.txt')
    generate_parser = options.get_generation_parser()
    args = [
        tmp_dir,
        '--path',
        ':'.join([str(path) for path in checkpoint_paths]),
        '--beam',
        beam,
        '--nbest',
        hypothesis_num,
        '--lenpen',
        lenpen,
        '--diverse-beam-groups',
        diverse_beam_groups if diverse_beam_groups is not None else -1,
        '--diverse-beam-strength',
        diverse_beam_strength,
        '--batch-size',
        batch_size,
        '--raw-text',
        '--print-alignment',
        '--gen-subset',
        'tmp',
        # We don't want to reload pretrained embeddings
        '--model-overrides',
        {
            'encoder_embed_path': None,
            'decoder_embed_path': None
        },
    ]
    if sampling:
        args.extend([
            '--sampling',
            '--sampling-topk',
            10,
        ])
    args = [str(arg) for arg in args]
    generate_args = options.parse_args_and_arch(generate_parser, args)
    out_filepath = tmp_dir / 'generation.out'
    with log_stdout(out_filepath, mute_stdout=True):
        # evaluate model in batch mode
        generate.main(generate_args)
    # Retrieve translations

    def parse_all_hypotheses(out_filepath):
        hypotheses_dict = defaultdict(list)
        for line in yield_lines(out_filepath):
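            # generate.py writes hypotheses as lines of the form 'H-<sample_id>\t<score>\t<text>';
            # the regex below extracts the sample id and the hypothesis text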
            match = re.match(r'^H-(\d+)\t-?\d+\.\d+\t(.*)$', line)
            if match:
                sample_id, hypothesis = match.groups()
                hypotheses_dict[int(sample_id)].append(hypothesis)
        # Sort in original order
        return [hypotheses_dict[i] for i in range(len(hypotheses_dict))]

    all_hypotheses = parse_all_hypotheses(out_filepath)
    predictions = [
        hypotheses[hypothesis_num - 1] for hypotheses in all_hypotheses
    ]
    write_lines(predictions, output_pred_filepath)
    os.remove(dummy_simple_filepath)
    os.remove(new_complex_filepath)
Example #9
from torch.utils.data import DataLoader
from tqdm import tqdm
import sacrebleu

logger = logging.getLogger()

def get_symbols_to_strip_from_output(generator):
    if hasattr(generator, 'symbols_to_strip_from_output'):
        return generator.symbols_to_strip_from_output
    else:
        return {generator.eos}

if __name__ == "__main__":

    # Parse command-line arguments for generation
    parser = options.get_generation_parser(default_task="no_context_tag")
    parser.add_argument('--output', default='outputs/pred.txt')
    parser.add_argument('--beam_size', default=5)
    args = options.parse_args_and_arch(parser)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task
    task = tasks.setup_task(args)
    task.load_dataset('test')
    dataset = task.datasets['test']
    task.tokenizer = dataset.dictionary.model
    dataloader = DataLoader(dataset, batch_size=1, collate_fn=dataset.collater, shuffle=False)
    dictionary = dataset.dictionary

    # Load model
    def __init__(self,
                 data_dir,
                 checkpoint_path,
                 batch_size=25,
                 constrained_decoding=False):

        self.constrained_decoding = constrained_decoding
        self.parser = options.get_generation_parser(interactive=True)
        # buffer_size is currently not used but we just initialize it to batch
        # size + 1 to avoid any assertion errors.
        if self.constrained_decoding:
            self.parser.set_defaults(
                path=checkpoint_path,
                remove_bpe="subword_nmt",
                num_workers=-1,
                constraints="ordered",
                batch_size=batch_size,
                buffer_size=batch_size + 1,
            )
        else:
            self.parser.set_defaults(
                path=checkpoint_path,
                remove_bpe="subword_nmt",
                num_workers=-1,
                batch_size=batch_size,
                buffer_size=batch_size + 1,
            )
        args = options.parse_args_and_arch(self.parser, input_args=[data_dir])
        # we are explicitly setting src_lang and tgt_lang here
        # generally the data_dir we pass contains {split}-{src_lang}-{tgt_lang}.*.idx files from
        # which fairseq infers the src and tgt langs (if these are not passed). In deployment we don't
        # use any idx files and only store the SRC and TGT dictionaries.
        args.source_lang = "SRC"
        args.target_lang = "TGT"

        args.skip_invalid_size_inputs_valid_test = False

        # we have custom architectures in this folder and we will let fairseq
        # import them
        args.user_dir = "src/model_configs"
        self.cfg = convert_namespace_to_omegaconf(args)

        utils.import_user_module(self.cfg.common)

        if self.cfg.interactive.buffer_size < 1:
            self.cfg.interactive.buffer_size = 1
        if self.cfg.dataset.max_tokens is None and self.cfg.dataset.batch_size is None:
            self.cfg.dataset.batch_size = 1

        assert (not self.cfg.generation.sampling
                or self.cfg.generation.nbest == self.cfg.generation.beam
                ), "--sampling requires --nbest to be equal to --beam"
        assert (not self.cfg.dataset.batch_size or
                self.cfg.dataset.batch_size <= self.cfg.interactive.buffer_size
                ), "--batch-size cannot be larger than --buffer-size"

        # Fix seed for stochastic decoding
        # if self.cfg.common.seed is not None and not self.cfg.generation.no_seed_provided:
        #     np.random.seed(self.cfg.common.seed)
        #     utils.set_torch_seed(self.cfg.common.seed)

        # if not self.constrained_decoding:
        #     self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu
        # else:
        #     self.use_cuda = False

        self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu

        # Setup task, e.g., translation
        self.task = tasks.setup_task(self.cfg.task)

        # Load ensemble
        overrides = ast.literal_eval(self.cfg.common_eval.model_overrides)
        self.models, self._model_args = checkpoint_utils.load_model_ensemble(
            utils.split_paths(self.cfg.common_eval.path),
            arg_overrides=overrides,
            task=self.task,
            suffix=self.cfg.checkpoint.checkpoint_suffix,
            strict=(self.cfg.checkpoint.checkpoint_shard_count == 1),
            num_shards=self.cfg.checkpoint.checkpoint_shard_count,
        )

        # Set dictionaries
        self.src_dict = self.task.source_dictionary
        self.tgt_dict = self.task.target_dictionary

        # Optimize ensemble for generation
        for model in self.models:
            if model is None:
                continue
            if self.cfg.common.fp16:
                model.half()
            if (self.use_cuda and
                    not self.cfg.distributed_training.pipeline_model_parallel):
                model.cuda()
            model.prepare_for_inference_(self.cfg)

        # Initialize generator
        self.generator = self.task.build_generator(self.models,
                                                   self.cfg.generation)

        # Handle tokenization and BPE
        self.tokenizer = self.task.build_tokenizer(self.cfg.tokenizer)
        self.bpe = self.task.build_bpe(self.cfg.bpe)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(
            self.cfg.generation.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in self.models])
Example #11
def cli_main():
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    scorer = main(args)
    return scorer
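Because main returns the scorer here, a caller can inspect the metrics directly; a minimal sketch, assuming the scorer exposes result_string() as in the other examples on this page:

if __name__ == '__main__':
    scorer = cli_main()
    if scorer is not None:
        print(scorer.result_string())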
Example #12
def cli_main():
    parser = options.get_generation_parser(
        default_task="speech_recognition_espresso")
    args = options.parse_args_and_arch(parser)
    assert args.results_path is not None, "please specify --results-path"
    main(args)
Example #13
def main():
    script_parser = argparse.ArgumentParser(
        description=
        'Computes greedy completion, single-token prediction, and corresponding targets.'
    )
    script_parser.add_argument('--data-dir', type=str, required=True)
    script_parser.add_argument('--base-dir', type=str, required=True)
    script_parser.add_argument('--eval-mode',
                               choices=['all', 'completion', 'singletoken'],
                               default='all')
    script_parser.add_argument('--data-prefix-length',
                               type=int,
                               default=50,
                               help='Length of prefix')
    script_parser.add_argument('--batch-size-completions',
                               type=int,
                               default=128)
    script_parser.add_argument('--batch-size-single-prediction',
                               type=int,
                               default=1024)

    script_parser.add_argument(
        '--completion-length',
        type=int,
        default=500,
        help=
        'The length of each generated sequence, not counting the prefix length'
    )
    script_parser.add_argument('--model-path',
                               type=str,
                               required=True,
                               help='The path to the folder with checkpoints')
    script_parser.add_argument('--save-path', type=str, required=True)
    script_parser.add_argument(
        '--ckpt',
        choices=['best', 'last', 'all', 'step', 'epoch'],
        default='best')
    script_parser.add_argument('--ckpt-step', type=str, default=None)
    script_parser.add_argument('--ckpt-epoch', type=str, default=None)
    script_parser.add_argument('--data-split',
                               choices=['train', 'valid', 'test'],
                               default='valid')
    script_parser.add_argument('--num-samples', type=int, default=-1)
    script_parser.add_argument('--beam-size', type=int, default=1)
    script_parser.add_argument('--beam-ngram-block', type=int, default=0)
    script_parser.add_argument('--topp', type=float, default=0.0)
    script_parser.add_argument('--topk', type=int, default=1)
    script_parser.add_argument('--singletoken-topk', type=int, default=1)
    script_parser.add_argument('--singletoken-topp', type=float, default=0.0)

    high_level_args = script_parser.parse_args()

    if high_level_args.ckpt == 'last':
        checkpoints = glob(
            os.path.join(high_level_args.model_path, 'checkpoint_last.pt'))
    elif high_level_args.ckpt == 'best':
        checkpoints = glob(
            os.path.join(high_level_args.model_path, 'checkpoint_best.pt'))
    elif high_level_args.ckpt == 'step':
        checkpoints = glob(
            os.path.join(
                high_level_args.model_path,
                'checkpoint_*_{}.pt'.format(high_level_args.ckpt_step)))
    elif high_level_args.ckpt == 'epoch':
        checkpoints = glob(
            os.path.join(high_level_args.model_path,
                         'checkpoint{}.pt'.format(high_level_args.ckpt_epoch)))
    elif high_level_args.ckpt == 'all':
        checkpoints = glob(
            os.path.join(high_level_args.model_path, 'checkpoint*'))

    print("Evaluating {} checkpoints.".format(len(checkpoints)))
    for i, checkpoint in enumerate(checkpoints):

        if high_level_args.eval_mode in ['all', 'completion']:
            num_tokens = high_level_args.data_prefix_length * high_level_args.batch_size_completions
            FAIRSEQ_OPTS = "--data {} \
                            --task language_modeling_with_generation \
                            --path {} \
                            --tokens-per-sample {} \
                            --max-tokens {} \
                            --sample-break-mode none \
                            --gen-subset {} \
                            --user-dir {}".format(
                high_level_args.data_dir, checkpoint, num_tokens, num_tokens,
                high_level_args.data_split,
                os.path.join(high_level_args.base_dir, 'fairseq/custom'))
            sys.argv = shlex.split(FAIRSEQ_OPTS)
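            # parse_args_and_arch reads sys.argv[1:], so the leading '--data' token is consumed
            # as the program name and the directory path is parsed as the first positional argument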
            parser = options.get_generation_parser()
            args = options.parse_args_and_arch(parser)
            args.add_bos_token = False
            args.skip_invalid_size_inputs_valid_test = False

            task, model, generator, itr, step = evaluate_utils.load(args)

            task.dictionary.eos_index = len(task.dictionary) - 1
            task.dictionary.eos_word = task.dictionary.symbols[-1]

            fairseq_generator = sequence_generator.SequenceGenerator(
                tgt_dict=task.dictionary,
                beam_size=high_level_args.beam_size,
                no_repeat_ngram_size=high_level_args.beam_ngram_block,
                max_len_b=high_level_args.completion_length +
                high_level_args.data_prefix_length,
            )

            filename_suffix = '_{}__st_{}__spl_{}__pfx_{}__cmpl_{}__bs_cmpl_{}__bs_sprd_{}__bms_{}__bnb_{}__tpk_{}__tpp_{}__sttpk_{}__sttpp_{}__ckst_{}__ckep_{}__ckpt_{}'.format(
                os.path.basename(os.path.normpath(high_level_args.model_path)),
                step, high_level_args.data_split,
                high_level_args.data_prefix_length,
                high_level_args.completion_length,
                high_level_args.batch_size_completions,
                high_level_args.batch_size_single_prediction,
                high_level_args.beam_size, high_level_args.beam_ngram_block,
                high_level_args.topk, high_level_args.topp,
                high_level_args.singletoken_topk,
                high_level_args.singletoken_topp, high_level_args.ckpt_step,
                high_level_args.ckpt_epoch, high_level_args.ckpt)

            completions, gen_metrics, actual_metrics = evaluate_utils.generate_completions(
                model,
                generator,
                fairseq_generator,
                itr,
                high_level_args.data_prefix_length,
                high_level_args.completion_length,
                topk=high_level_args.topk,
                beam_size=high_level_args.beam_size,
                num_samples=high_level_args.num_samples,
                topp=high_level_args.topp)

            completion_tokens = [[task.dictionary[i] for i in sample]
                                 for sample in completions]
            completion_text = [' '.join(ts) for ts in completion_tokens]

            # dump generation to text file
            completion_output_filename = os.path.join(
                high_level_args.save_path,
                'completions_{}.txt'.format(filename_suffix))
            with open(completion_output_filename, 'w') as f:
                for line in completion_text:
                    f.write(line)
                    f.write('\n')
                print("\tcompletions output file: %s" %
                      completion_output_filename)

        if high_level_args.eval_mode in ['all', 'singletoken']:
            num_tokens = high_level_args.batch_size_single_prediction
            FAIRSEQ_OPTS = "--data {} \
                                        --task language_modeling_with_generation \
                                        --path {} \
                                        --tokens-per-sample {} \
                                        --max-tokens {} \
                                        --sample-break-mode none \
                                        --gen-subset {} \
                                        --user-dir {}".format(
                high_level_args.data_dir, checkpoint, num_tokens, num_tokens,
                high_level_args.data_split,
                os.path.join(high_level_args.base_dir, 'fairseq/custom'))
            sys.argv = shlex.split(FAIRSEQ_OPTS)
            parser = options.get_generation_parser()
            args = options.parse_args_and_arch(parser)
            args.add_bos_token = False
            args.skip_invalid_size_inputs_valid_test = False

            task, model, generator, itr, step = evaluate_utils.load(args)

            single_predicted_tokens, target_tokens, metrics = evaluate_utils.eval_single_token_prediction(
                model,
                itr,
                task.target_dictionary,
                singletoken_topk=high_level_args.singletoken_topk,
                singletoken_topp=high_level_args.singletoken_topp)

            subset_metrics = {}
            subset_data = high_level_args.data_split

            for metric_name, value in metrics.items():
                subset_metrics[f'{subset_data}/{metric_name}'] = value
            subset_metrics['checkpoint_step'] = step

            filename_suffix = '_{}__st_{}__spl_{}__pfx_{}__cmpl_{}__bs_cmpl_{}__bs_sprd_{}__bms_{}__bnb_{}__tpk_{}__tpp_{}__sttpk_{}__sttpp_{}__ckst_{}__ckep_{}__ckpt_{}'.format(
                os.path.basename(os.path.normpath(high_level_args.model_path)),
                step, high_level_args.data_split,
                high_level_args.data_prefix_length,
                high_level_args.completion_length,
                high_level_args.batch_size_completions,
                high_level_args.batch_size_single_prediction,
                high_level_args.beam_size, high_level_args.beam_ngram_block,
                high_level_args.topk, high_level_args.topp,
                high_level_args.singletoken_topk,
                high_level_args.singletoken_topp, high_level_args.ckpt_step,
                high_level_args.ckpt_epoch, high_level_args.ckpt)

            single_token_predictions_filename = os.path.join(
                high_level_args.save_path,
                "single_token_predictions_{}.txt".format(filename_suffix))

            pkl_filename = os.path.join(
                high_level_args.save_path,
                "metrics_{}.pkl".format(filename_suffix))
            pickle.dump(subset_metrics, open(pkl_filename, 'wb'))

            with open(single_token_predictions_filename, 'w') as f:
                for single_predicted_tokens_sublist in single_predicted_tokens:
                    _single_token_text = [
                        task.dictionary[i]
                        for i in single_predicted_tokens_sublist
                    ]
                    f.write(' '.join(_single_token_text))
                    f.write('\n')

            target_filename = os.path.join(
                high_level_args.save_path,
                "targets_{}.txt".format(filename_suffix))

            with open(target_filename, 'w') as f:
                for target_tokens_sublist in target_tokens:
                    _target_text = [
                        task.dictionary[i] for i in target_tokens_sublist
                    ]
                    f.write(' '.join(_target_text))
                    f.write('\n')
Example #14
    def __init__(self, data_path="./data/processed",
                 checkpoint_path="./checkpoints/zhen_mass_pre-training.pt",
                 task='xmasked_seq2seq',
                 user_dir='mass',
                 s='zh', t='en',
                 langs='en,zh',
                 mt_steps='zh-en',
                 source_langs='zh',
                 target_langs='en',
                 beam=5,
                 use_cuda=1):
        self.parser = options.get_generation_parser(interactive=True)
        self.parser.set_defaults(path=checkpoint_path, task=task, user_dir=user_dir, s=s, t=t,
                                 source_langs=source_langs, target_langs=target_langs,
                                 langs=langs, mt_steps=mt_steps, beam=beam)
        self.use_cuda = use_cuda
        self.args = options.parse_args_and_arch(self.parser,
                                               input_args=[data_path])
        self.args.user_dir = user_dir
        self.args.s = s
        self.args.t = t
        self.args.langs = langs
        self.args.mt_steps = mt_steps
        self.args.source_langs = source_langs
        self.args.target_langs = target_langs
        self.args.remove_bpe = '@@ '
        #self.args, _ = self.parser.parse_known_args([data_path])

        utils.import_user_module(self.args)

        if self.args.buffer_size < 1:
            self.args.buffer_size = 1
        if self.args.max_tokens is None and self.args.max_sentences is None:
            self.args.max_sentences = 1

        assert not self.args.sampling or self.args.nbest == self.args.beam, \
            '--sampling requires --nbest to be equal to --beam'
        assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \
            '--max-sentences/--batch-size cannot be larger than --buffer-size'

        print(self.args)

        #self.use_cuda = torch.cuda.is_available() and not self.args.cpu

        # Setup task, e.g., translation
        self.task = tasks.setup_task(self.args)

        # Load ensemble
        print('| loading model(s) from {}'.format(self.args.path))
        self.models, self._model_args = checkpoint_utils.load_model_ensemble(
            self.args.path.split(':'),
            arg_overrides=eval(self.args.model_overrides),
            task=self.task,
        )

        # Set dictionaries
        self.src_dict = self.task.source_dictionary
        self.tgt_dict = self.task.target_dictionary

        # Optimize ensemble for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None
                if self.args.no_beamable_mm else self.args.beam,
                need_attn=self.args.print_alignment,
            )
            if self.args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        # Initialize generator
        self.generator = self.task.build_generator(self.args)

        # Hack to support GPT-2 BPE
        if self.args.remove_bpe == 'gpt2':
            from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
            self.decoder = get_encoder(
                'fairseq/gpt2_bpe/encoder.json',
                'fairseq/gpt2_bpe/vocab.bpe',
            )
            self.encode_fn = lambda x: ' '.join(
                map(str, self.decoder.encode(x)))
        else:
            self.decoder = None
            self.encode_fn = lambda x: x

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(self.args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in self.models])

        if self.args.buffer_size > 1:
            print('| Sentence buffer size:', self.args.buffer_size)
Example #15
def cli_main():
    parser = options.get_generation_parser()
    parser.add_argument("--print-attention", action="store_true", help="print attention matrix as jsonline")
    args = options.parse_args_and_arch(parser)
    main(args)
Example #16
def sari_validate(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask,
                  epoch_itr, subsets: List[str]) -> List[Optional[float]]:
    from pathlib import Path
    from access.resources.paths import get_data_filepath
    from access.utils.helpers import read_lines
    from access.preprocessors import load_preprocessors, ComposedPreprocessor
    from easse.report import get_all_scores
    from fairseq.data import encoders
    from fairseq_cli.interactive import buffered_read, make_batches
    from fairseq_cli.generate import get_symbols_to_strip_from_output
    from fairseq.token_generation_constraints import pack_constraints, unpack_constraints
    import tempfile

    use_cuda = torch.cuda.is_available() and not cfg.common.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(cfg.task)

    # TODO: Choose parameters for the preprocessors ?
    # Read the preprocessors from the pickle file
    # preprocessors = load_preprocessors(Path(cfg.task.data).parent)
    # composed_preprocessor = ComposedPreprocessor(preprocessors)
    # Get the path to turkcorpus.valid.complex
    complex_filepath = get_data_filepath('turkcorpus', 'valid', 'complex')
    # make temp dir
    # encoded_complex_filepath = tempfile.mkstemp()[1]
    # encoded_pred_filepath = tempfile.mkstemp()[1]
    pred_filepath = tempfile.mkstemp()[1]
    # use preprocessors to encode complex file
    # composed_preprocessor.encode_file(complex_filepath, encoded_complex_filepath)
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        trainer.get_model().max_positions(),
    )
    parser = options.get_generation_parser(interactive=True)
    # TODO: Take args from fairseq_generate
    gen_args = options.parse_args_and_arch(
        parser, input_args=['/dummy_data', '--beam', '2'])
    # Initialize generator
    generator = task.build_generator([trainer.model], gen_args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(cfg.tokenizer)
    bpe = encoders.build_bpe(cfg.bpe)

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x
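    # illustrative assumption: decode_fn(encode_fn(s)) should round-trip a sentence s
    # when both a tokenizer and a BPE model are configured in cfg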

    align_dict = utils.load_align_dict(cfg.generation.replace_unk)

    with open(pred_filepath, 'w') as f:
        start_id = 0
        for inputs in buffered_read(complex_filepath, buffer_size=9999):
            results = []
            for batch in make_batches(inputs, cfg, task, max_positions,
                                      encode_fn):
                bsz = batch.src_tokens.size(0)
                src_tokens = batch.src_tokens
                src_lengths = batch.src_lengths
                constraints = batch.constraints
                if use_cuda:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()
                    if constraints is not None:
                        constraints = constraints.cuda()
                sample = {
                    "net_input": {
                        "src_tokens": src_tokens,
                        "src_lengths": src_lengths,
                    },
                }
                translations = task.inference_step(generator, [trainer.model],
                                                   sample,
                                                   constraints=constraints)
                list_constraints = [[] for _ in range(bsz)]
                if cfg.generation.constraints:
                    list_constraints = [
                        unpack_constraints(c) for c in constraints
                    ]
                for i, (id, hypos) in enumerate(
                        zip(batch.ids.tolist(), translations)):
                    src_tokens_i = utils.strip_pad(src_tokens[i],
                                                   tgt_dict.pad())
                    constraints = list_constraints[i]
                    results.append((
                        start_id + id,
                        src_tokens_i,
                        hypos,
                        {
                            "constraints": constraints,
                        },
                    ))

            # sort output to match input order
            for id_, src_tokens, hypos, info in sorted(results,
                                                       key=lambda x: x[0]):
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens,
                                              cfg.common_eval.post_process)
                    for constraint in info["constraints"]:
                        pass

                # Process top predictions
                for hypo in hypos[:min(len(hypos), cfg.generation.nbest)]:
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo["tokens"].int().cpu(),
                        src_str=src_str,
                        alignment=hypo["alignment"],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=cfg.common_eval.post_process,
                        extra_symbols_to_ignore=
                        get_symbols_to_strip_from_output(generator),
                    )
                    detok_hypo_str = decode_fn(hypo_str)
                    # detokenized hypothesis
                    f.write(f'{detok_hypo_str}\n')
                    if cfg.generation.print_alignment:
                        alignment_str = " ".join([
                            "{}-{}".format(src, tgt) for src, tgt in alignment
                        ])

            # update running id_ counter
            start_id += len(inputs)

        # composed_preprocessor.decode_file(encoded_pred_filepath, pred_filepath)
        ref_filepaths = [
            get_data_filepath('turkcorpus', 'valid', 'simple.turk', i)
            for i in range(8)
        ]
        scores = get_all_scores(
            read_lines(complex_filepath), read_lines(pred_filepath),
            [read_lines(ref_filepath) for ref_filepath in ref_filepaths])
        print(f'num_updates={trainer.get_num_updates()}')
        print(f'ts_scores={scores}')
        sari = scores['SARI']
        if not hasattr(trainer, 'best_sari'):
            trainer.best_sari = 0
        if not hasattr(trainer, 'n_validations_since_best'):
            trainer.n_validations_since_best = 0
        if sari > trainer.best_sari:
            trainer.best_sari = sari
            trainer.n_validations_since_best = 0
        else:
            trainer.n_validations_since_best += 1
            print(
                f'SARI did not improve for {trainer.n_validations_since_best} validations'
            )
            # Does not work because the scheduler will set it back to the previous value every time
            # trainer.optimizer.set_lr(0.75 * trainer.optimizer.get_lr())
            if trainer.n_validations_since_best >= cfg.validations_before_sari_early_stopping:
                print(
                    f'Early stopping because SARI did not improve for {trainer.n_validations_since_best} validations'
                )
                trainer.early_stopping = True

            def is_abort(epoch_itr, best_sari):
                if (epoch_itr.epoch >= 2 and best_sari < 19):
                    return True
                if (epoch_itr.epoch >= 5 and best_sari < 22):
                    return True
                if (epoch_itr.epoch >= 10 and best_sari < 25):
                    return True
                return False

            # if is_abort(epoch_itr, best_sari):
            #     print(f'Early stopping because best SARI is too low ({best_sari:.2f}) after {epoch_itr.epoch} epochs.')
            #     # Remove the checkpoint directory as we got nothing interesting
            #     shutil.rmtree(args.save_dir)
            #     # TODO: Abort
    return [-sari]
Example #17
def cli_main():
    parser = options.get_generation_parser(interactive=True)
    parser.add_argument("--output-file", type=str, required=True)
    args = options.parse_args_and_arch(parser)
    main(args)
Example #18
    def __init__(self, data_path, checkpoint_path, beam, nbest):
        self.parser = options.get_generation_parser(interactive=True)
        self.parser.set_defaults(path=checkpoint_path,
                                 remove_bpe=None,
                                 dataset_impl="lazy",
                                 num_workers=5)
        self.args = options.parse_args_and_arch(self.parser,
                                                input_args=[data_path])
        self.args.beam = beam
        self.args.nbest = nbest

        utils.import_user_module(self.args)

        if self.args.buffer_size < 1:
            self.args.buffer_size = 1
        if self.args.max_tokens is None and self.args.max_sentences is None:
            self.args.max_sentences = 1

        assert not self.args.sampling or self.args.nbest == self.args.beam, \
            '--sampling requires --nbest to be equal to --beam'
        assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \
            '--max-sentences/--batch-size cannot be larger than --buffer-size'

        self.use_cuda = torch.cuda.is_available() and not self.args.cpu

        self.task = tasks.setup_task(self.args)

        self.models, self._model_args = checkpoint_utils.load_model_ensemble(
            self.args.path.split(':'),
            arg_overrides=eval(self.args.model_overrides),
            task=self.task,
        )

        self.src_dict = self.task.source_dictionary
        self.tgt_dict = self.task.target_dictionary

        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None
                if self.args.no_beamable_mm else self.args.beam,
                need_attn=self.args.print_alignment,
            )
            if self.args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(self.args)

        if self.args.remove_bpe == 'gpt2':
            from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
            self.decoder = get_encoder(
                'fairseq/gpt2_bpe/encoder.json',
                'fairseq/gpt2_bpe/vocab.bpe',
            )
            self.encode_fn = lambda x: ' '.join(
                map(str, self.decoder.encode(x)))
        else:
            self.decoder = None
            self.encode_fn = lambda x: x

        self.align_dict = utils.load_align_dict(self.args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in self.models])
Example #19
def gen_and_reprocess_nbest(args):
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert args.right_to_left1 is False, "prefix length not compatible with right to left models"
        assert args.right_to_left2 is False, "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = os.path.join(os.path.dirname(__file__))+"/rerank_data/"+args.data_dir_name
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
        backwards_preprocessed_dir, lm_preprocessed_dir = \
        rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                     args.gen_model_name, args.shard_id, args.num_shards,
                                     args.sampling, args.prefix_len, args.target_prefix_frac,
                                     args.source_prefix_frac)
    assert not (args.right_to_left1 and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2 and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None and args.target_prefix_frac is not None), \
        "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None
    rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store the preprocessed n-best lists for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model1_name,
                                                 target_prefix_frac=args.target_prefix_frac,
                                                 source_prefix_frac=args.source_prefix_frac,
                                                 backwards=args.backwards1)
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model2_name,
                                                     target_prefix_frac=args.target_prefix_frac,
                                                     source_prefix_frac=args.source_prefix_frac,
                                                     backwards=args.backwards2)

    predictions_bpe_file = pre_gen+"/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    else:
        if not os.path.isfile(predictions_bpe_file):
            print("STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            param1 = [args.data,
                      "--path", args.gen_model,
                      "--shard-id", str(args.shard_id),
                      "--num-shards", str(args.num_shards),
                      "--nbest", str(args.num_rescore),
                      "--batch-size", str(args.batch_size),
                      "--beam", str(args.num_rescore),
                      "--max-sentences", str(args.num_rescore),
                      "--gen-subset", args.gen_subset,
                      "--source-lang", args.source_lang,
                      "--target-lang", args.target_lang]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            with open(predictions_bpe_file, 'w') as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.remove_bpe,
                                                  nbest=using_nbest, prefix_len=args.prefix_len,
                                                  target_prefix_frac=args.target_prefix_frac)

    if args.diff_bpe:
        rerank_utils.write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                                       gen_output.no_bpe_target, pre_gen+"/source_gen_bpe."+args.source_lang,
                                       pre_gen+"/target_gen_bpe."+args.target_lang,
                                       pre_gen+"/reference_gen_bpe."+args.target_lang)
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = ["-c", bitext_bpe,
                         "--input", pre_gen+"/source_gen_bpe."+args.source_lang,
                         "--output", pre_gen+"/rescore_data."+args.source_lang]
        bpe_tgt_param = ["-c", bitext_bpe,
                         "--input", pre_gen+"/target_gen_bpe."+args.target_lang,
                         "--output", pre_gen+"/rescore_data."+args.target_lang]

        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param,
                        shell=False)

        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_tgt_param,
                        shell=False)

    if (not os.path.isfile(score1_file) and not rerank1_is_gen) or \
            (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen):
        print("STEP 2: process the output of generate.py so we have clean text files with the translations")

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix"+str(args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = rescore_file + "target_prefix_frac"+str(args.target_prefix_frac)
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = rescore_file + "source_prefix_frac"+str(args.source_prefix_frac)

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                               pre_gen+rescore_file+"."+args.source_lang,
                                               pre_gen+rescore_file+"."+args.target_lang,
                                               pre_gen+"/reference_file", bpe_symbol=args.remove_bpe)
                if args.prefix_len is not None:
                    bw_rescore_file = prefix_len_rescore_file
                    rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                                   pre_gen+prefix_len_rescore_file+"."+args.source_lang,
                                                   pre_gen+prefix_len_rescore_file+"."+args.target_lang,
                                                   pre_gen+"/reference_file", prefix_len=args.prefix_len,
                                                   bpe_symbol=args.remove_bpe)
                elif args.target_prefix_frac is not None:
                    bw_rescore_file = target_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                                   pre_gen+target_prefix_frac_rescore_file+"."+args.source_lang,
                                                   pre_gen+target_prefix_frac_rescore_file+"."+args.target_lang,
                                                   pre_gen+"/reference_file", bpe_symbol=args.remove_bpe,
                                                   target_prefix_frac=args.target_prefix_frac)
                else:
                    bw_rescore_file = rescore_file

                if args.source_prefix_frac is not None:
                    fw_rescore_file = source_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                                   pre_gen+source_prefix_frac_rescore_file+"."+args.source_lang,
                                                   pre_gen+source_prefix_frac_rescore_file+"."+args.target_lang,
                                                   pre_gen+"/reference_file", bpe_symbol=args.remove_bpe,
                                                   source_prefix_frac=args.source_prefix_frac)
                else:
                    fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target,
                                           pre_gen+"/right_to_left_rescore_data."+args.source_lang,
                                           pre_gen+"/right_to_left_rescore_data."+args.target_lang,
                                           pre_gen+"/right_to_left_reference_file",
                                           right_to_left=True, bpe_symbol=args.remove_bpe)

        print("STEP 3: binarize the translations")
        if not args.right_to_left1 or args.score_model2 is not None and not args.right_to_left2 or not rerank1_is_gen:

            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = ["--source-lang", scorer1_src,
                                       "--target-lang", scorer1_tgt,
                                       "--trainpref", pre_gen+bw_rescore_file,
                                       "--srcdict", bw_dict + "/dict." + scorer1_src + ".txt",
                                       "--tgtdict", bw_dict + "/dict." + scorer1_tgt + ".txt",
                                       "--destdir", backwards_preprocessed_dir]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = ["--source-lang", scorer1_src,
                                "--target-lang", scorer1_tgt,
                                "--trainpref", pre_gen+fw_rescore_file,
                                "--srcdict", args.score_dict_dir+"/dict."+scorer1_src+".txt",
                                "--tgtdict", args.score_dict_dir+"/dict."+scorer1_tgt+".txt",
                                "--destdir", left_to_right_preprocessed_dir]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = ["--source-lang", scorer1_src,
                                "--target-lang", scorer1_tgt,
                                "--trainpref", pre_gen+"/right_to_left_rescore_data",
                                "--srcdict", args.score_dict_dir+"/dict."+scorer1_src+".txt",
                                "--tgtdict", args.score_dict_dir+"/dict."+scorer1_tgt+".txt",
                                "--destdir", right_to_left_preprocessed_dir]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
Example #20
def load_model():
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    return load_model1(args)
Example #21
        scores_per_target_length[len(target)].append(local_score)

    print(f"Localism {np.mean(predictions_equal)}")

    with open("trace_localism.txt", 'w') as f:
        for pairs, score in all_pairs:
            f.write("------------------------------\n")
            for s, t in pairs:
                f.write("{} -> {}\n".format(s, t))
            f.write("{}".format(score))

    # # UNCOMMENT TO COMPUTE LOCALISM SCORE PER INPUT / TARGET LENGTH
    # for key in sorted(list(scores_per_input_length.keys())):
    #     scores = scores_per_input_length[key]
    #     print(f"Input length {key}, {len(scores)}, localism score {sum(scores)/len(scores)}.")

    # for key in sorted(list(scores_per_target_length.keys())):
    #     scores = scores_per_target_length[key]
    #     print(f"Target length {key}, localism score {sum(scores) / len(scores)}.")


if __name__ == '__main__':
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)

    Batch = namedtuple('Batch', 'srcs tokens lengths')
    Translation = namedtuple('Translation',
                             'src_str hypos pos_scores alignments')

    main(args)
Example #22
0
from fairseq import checkpoint_utils, data, options, tasks

# Parse command-line arguments for generation
parser = options.get_generation_parser(default_task='simple_classification')
args = options.parse_args_and_arch(parser)

# Setup task
task = tasks.setup_task(args)
# Load model
print('| loading model from {}'.format(
    '/Users/ashisharora/espresso/fairseq_cli/checkpoints/checkpoint_best.pt'))
models, _model_args = checkpoint_utils.load_model_ensemble(
    ['/Users/ashisharora/espresso/fairseq_cli/checkpoints/checkpoint_best.pt'],
    task=task)
model = models[0]

#while True:
#sentence = input('\nInput: ')
#chars = ' '.join(list(sentence.strip()))
#tokens = task.source_dictionary.encode_line(
#    chars, add_if_not_exist=False,
#)
chars = 'W i l l i a m'
tokens = task.source_dictionary.encode_line(
    chars,
    add_if_not_exist=False,
)
chars = 'W i l l i a m'
tokens2 = task.source_dictionary.encode_line(
    chars,
    add_if_not_exist=False,
)
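The snippet stops before the loaded classifier is actually applied. A minimal continuation sketch, following the pattern from the fairseq simple-classification tutorial (the collate signature and label handling are assumptions and may differ across fairseq versions):

# Hedged continuation (not in the original snippet): build a one-sentence
# batch and print the top predicted labels, assuming the target dictionary
# holds the class labels as in the fairseq classification tutorial.
batch = data.language_pair_dataset.collate(
    samples=[{'id': -1, 'source': tokens}],  # batch size 1
    pad_idx=task.source_dictionary.pad(),
    eos_idx=task.source_dictionary.eos(),
    left_pad_source=False,
    input_feeding=False,
)

preds = model(**batch['net_input'])

# Top 3 labels and their scores (assumes at least three labels exist)
top_scores, top_labels = preds[0].topk(k=3)
for score, label_idx in zip(top_scores, top_labels):
    label_name = task.target_dictionary.string([label_idx])
    print('({:.2f})\t{}'.format(score, label_name))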
Example #23
0
            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
    # output the result for downstream WER comparison
    return result


if __name__ == '__main__':
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    print(args.path)
    result = main(args)

    ref_path = "result/lts.s2s.test.s"

    reference = read_sequences(ref_path)
    inference = [r.strip().split() for r in result]
    compare(reference, inference)

    print('After removing stress:')
    remove_stress(reference)
    remove_stress(inference)
    wer = compare(reference, inference)
    model_name = args.path.split('/')[-2]
Example #24
0
from fairseq import checkpoint_utils, data, options, tasks
import torch

# Parse command-line arguments for generation
parser = options.get_generation_parser(default_task='multilingual_translation')
args = options.parse_args_and_arch(parser)

# Setup task
task = tasks.setup_task(args)
for valid_sub_split in args.gen_subset.split(','):
    task.load_dataset(valid_sub_split, combine=True, epoch=0)
# Load model
print('| loading model from {}'.format(args.path))
models, _model_args = checkpoint_utils.load_model_ensemble([args.path],
                                                           task=task)
model = models[0]

M = model.models['de-en'].encoder.M
N = model.models['de-en'].encoder.N

no_langs = N.size(0)

lang2idx = task.lang2idx
idx2lang = {idx: lang for lang, idx in lang2idx.items()}

lang2idx2idx = model.models['de-en'].encoder.lang2idx2idx
idx2idx2lang = {i.item(): idx for idx, i in enumerate(lang2idx2idx)}
# import pdb; pdb.set_trace()
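As a hedged follow-up sketch (not in the original), the mappings built above can be used to list which language each row of N corresponds to, assuming lang2idx2idx[row] holds the dictionary index of the language stored in that row, which is how idx2idx2lang is constructed:

for row in range(no_langs):
    dict_idx = lang2idx2idx[row].item()
    print('N row {:2d} -> {}'.format(row, idx2lang.get(dict_idx, '<unknown>')))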
Example #25
0
def cli_main():
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
Example #26
0
                        print('A-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(lambda x: str(utils.item(x)), alignment))))

                # Score only the top hypothesis
                if has_target and i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, tgt_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))


if __name__ == '__main__':
    parser = options.get_generation_parser(style_transfer=True)
    args = options.parse_args_and_arch(parser)
    main(args)
Example #27
0
def cli_main():
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    main(args)
Example #28
0
        initial_model = os.path.join(args.ckpt_dir, args.initial_model)
        for (idx, file) in enumerate(files):
            if file == initial_model:
                start_idx = idx

    bleu_ptn = r'BLEU4\s=\s([\d\.]+?),'  # raw string so the regex escapes stay literal
    for x in range(start_idx, len(files)):
        ckpt_file = files[x]
        args.path = ckpt_file
        # Note: calling single_model_main(args) directly here triggers a mysterious
        # memory error, so generate.py is invoked in a subprocess instead.
        print('python {}/generate.py {} --path {}'.format(
            args.fairseq_dir, obtain_sys_argv(), ckpt_file))
        pl_process = subprocess.Popen(
            'python {}/generate.py {} --path {}'.format(
                args.fairseq_dir, obtain_sys_argv(), ckpt_file),
            shell=True,
            stdout=subprocess.PIPE)
        pl_output = pl_process.stdout.read()
        bleu_match = re.search(bleu_ptn, str(pl_output))
        if bleu_match:
            bleu_score = bleu_match.group(1)
            print(ckpt_file, bleu_score)
            sys.stdout.flush()
        time.sleep(15)


if __name__ == '__main__':
    parser = options.get_generation_parser(seq=True)
    args = options.parse_args_and_arch(parser)
    main(args)
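The sweep above prints each checkpoint's BLEU as it is parsed. As a purely illustrative extension (not in the original; it assumes the loop also records bleu_by_ckpt[ckpt_file] = float(bleu_score) after each successful regex match), the scores could be aggregated to report the best checkpoint:

def report_best(bleu_by_ckpt):
    # bleu_by_ckpt: dict mapping checkpoint path -> parsed BLEU4 score
    if not bleu_by_ckpt:
        return
    best_ckpt = max(bleu_by_ckpt, key=bleu_by_ckpt.get)
    print('Best checkpoint: {} (BLEU4 = {:.2f})'.format(
        best_ckpt, bleu_by_ckpt[best_ckpt]))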
Example #29
0
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size):
        indices = []
        results = []
        for batch, batch_indices in make_batches(inputs, args, task, max_positions):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments):
                print(hypo)
                print(pos_scores)
                if align is not None:
                    print(align)


if __name__ == '__main__':
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    main(args)
Example #30
0
def cli_main():
    parser = options.get_generation_parser(default_task="speech_recognition_hybrid")
    args = options.parse_args_and_arch(parser)
    main(args)
Example #31
0
def model_fn(model_dir):
    
    model_name = 'checkpoint_best.pt'
    model_path = os.path.join(model_dir, model_name)

    logger.info('Loading the model')
    with open(model_path, 'rb') as f:
        model_info = torch.load(f, map_location=torch.device('cpu'))

    # Will be overridden by model_info['args']; still needed for pre-trained models
    parser = options.get_generation_parser(interactive=True)
    # Build fairseq args by pretending the hyperparameters were passed on the command line
    argv_copy = copy.deepcopy(sys.argv)
    # Temporarily swap in the arguments the generation parser expects
    sys.argv[1:] = ['--path', model_path, model_dir]
    args = options.parse_args_and_arch(parser)
    # restore previous command-line args
    sys.argv = argv_copy
    
    saved_args = model_info['args']
    for key, value in vars(saved_args).items():
        setattr(args, key, value)

    args.data = [model_dir]
    print(args)

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info('Current device: {}'.format(device))

    model_paths = [os.path.join(model_dir, model_name)]
    models, model_args = utils.load_ensemble_for_inference(model_paths, task, model_arg_overrides={})

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models, tgt_dict, beam_size=args.beam, minlen=args.min_len,
        stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen, unk_penalty=args.unkpen,
        sampling=args.sampling, sampling_topk=args.sampling_topk, sampling_temperature=args.sampling_temperature,
        diverse_beam_groups=args.diverse_beam_groups, diverse_beam_strength=args.diverse_beam_strength,
    )

    if device.type == 'cuda':
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    # align_dict = utils.load_align_dict(args.replace_unk)
    align_dict = utils.load_align_dict(None)


    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    return dict(
        translator=translator,
        task=task,
        max_positions=max_positions,
        align_dict=align_dict,
        tgt_dict=tgt_dict,
        args=args,
        device=device,
    )
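model_fn above follows the SageMaker PyTorch serving convention, so a companion predict_fn would consume the dict it returns. The sketch below is an assumption rather than the author's code: it targets the same 0.6.x-era fairseq API used above (tokenizer.Tokenizer, SequenceGenerator), and the exact generate() signature varies between fairseq versions:

def predict_fn(input_data, model_bundle):
    # Hypothetical sketch only. Assumes input_data is a single, already
    # tokenized/BPE'd source line and that SequenceGenerator.generate accepts
    # an encoder_input dict as in fairseq 0.6.x.
    import torch
    from fairseq import tokenizer

    task = model_bundle['task']
    translator = model_bundle['translator']
    tgt_dict = model_bundle['tgt_dict']
    args = model_bundle['args']
    device = model_bundle['device']

    tokens = tokenizer.Tokenizer.tokenize(
        input_data, task.source_dictionary, add_if_not_exist=False
    ).long().unsqueeze(0)
    lengths = torch.LongTensor([tokens.size(1)])
    if device.type == 'cuda':
        tokens, lengths = tokens.cuda(), lengths.cuda()

    translations = translator.generate(
        {'src_tokens': tokens, 'src_lengths': lengths},
        maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
    )
    best_hypo = translations[0][0]  # top hypothesis for the single sentence
    return tgt_dict.string(best_hypo['tokens'].int().cpu(), args.remove_bpe)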
Example #32
0
    def initialize(self,
                   data_dir=_data_dir,
                   model_path=_model,
                   user_dir=_user_dir,
                   task='xmasked_seq2seq',
                   s_lang='en',
                   t_lang='zh',
                   beam=5,
                   cpu=False,
                   align_dict=None,
                   bpe_codes=_bpe_codes_en,
                   tokenizer=True):
        self.parser = options.get_generation_parser(interactive=True)
        self.src, self.tgt = s_lang, t_lang

        # generate args
        input_args = [data_dir, '--path', model_path]
        if cpu:
            input_args.append('--cpu')
        if user_dir:
            input_args.append('--user-dir')
            input_args.append(user_dir)
        if task:
            input_args.append('--task')
            input_args.append(task)
        if align_dict:
            input_args.append('--replace-unk')
            input_args.append(align_dict)
        input_args.append('--langs')
        input_args.append('{},{}'.format(s_lang, t_lang))
        input_args.append('--source-langs')
        input_args.append(s_lang)
        input_args.append('--target-langs')
        input_args.append(t_lang)
        input_args.append('-s')
        input_args.append(s_lang)
        input_args.append('-t')
        input_args.append(t_lang)
        input_args.append('--beam')
        input_args.append(str(beam))
        input_args.append('--remove-bpe')

        self.bpe = BPE(open(bpe_codes, 'r'))
        self.tokenizer = tokenizer

        self.args = options.parse_args_and_arch(self.parser,
                                                input_args=input_args)

        # initialize model
        utils.import_user_module(self.args)

        if self.args.buffer_size < 1:
            self.args.buffer_size = 1
        if self.args.max_tokens is None and self.args.max_sentences is None:
            self.args.max_sentences = 1

        assert not self.args.sampling or self.args.nbest == self.args.beam, \
            '--sampling requires --nbest to be equal to --beam'
        assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \
            '--max-sentences/--batch-size cannot be larger than --buffer-size'

        self.use_cuda = torch.cuda.is_available() and not self.args.cpu

        # Setup task, e.g., translation
        self.task = tasks.setup_task(self.args)

        # Load ensemble
        self.models, _model_args = checkpoint_utils.load_model_ensemble(
            self.args.path.split(':'),
            arg_overrides=eval(self.args.model_overrides),
            task=self.task,
        )

        # Set dictionaries
        self.src_dict = self.task.source_dictionary
        self.tgt_dict = self.task.target_dictionary

        # Optimize ensemble for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None
                if self.args.no_beamable_mm else self.args.beam,
                need_attn=self.args.print_alignment,
            )
            if self.args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        # Initialize generator
        self.generator = self.task.build_generator(self.args)

        def encode_fn(x):
            if tokenizer:
                x = tokenize(x, is_zh=(s_lang == 'zh'))
            if bpe_codes:
                x = self.bpe.process_line(x)
            return x

        # Hack to support GPT-2 BPE (the GPT-2 branch is left as a no-op
        # placeholder here; only the plain encode_fn path is wired up)
        if self.args.remove_bpe == 'gpt2':
            pass
        else:
            self.decoder = None
            # self.encode_fn = lambda x: x
            self.encode_fn = encode_fn

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(self.args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in self.models])
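The initialize method above sets up everything needed for translation, but the excerpt stops before any inference code. A hedged companion-method sketch (not in the original excerpt; it uses the standard task.inference_step API of checkpoint_utils-era fairseq, and the details may differ from the class's real translation logic):

    def translate(self, line):
        # Sketch only: encode one sentence, run the generator built in
        # initialize(), and decode the best hypothesis.
        tokens = self.task.source_dictionary.encode_line(
            self.encode_fn(line), add_if_not_exist=False
        ).long().unsqueeze(0)
        lengths = torch.LongTensor([tokens.size(1)])
        if self.use_cuda:
            tokens, lengths = tokens.cuda(), lengths.cuda()
        sample = {'net_input': {'src_tokens': tokens, 'src_lengths': lengths}}
        translations = self.task.inference_step(self.generator, self.models, sample)
        hypo = translations[0][0]  # best hypothesis for the single input
        return self.tgt_dict.string(hypo['tokens'].int().cpu(), self.args.remove_bpe)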