Example #1
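The snippet below assumes parse_args and get_cfg come from the surrounding homework code; the fairseq import it needs would be roughly:

from fairseq.tasks.translation import TranslationConfig, TranslationTask
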
def load_data_iterator(task, split, epoch, max_tokens, num_workers, cached):
    batch_iterator = task.get_batch_iterator(
        dataset=task.dataset(split),
        max_tokens=max_tokens,
        num_workers=num_workers,
        epoch=epoch,
        disable_iterator_cache=not cached,
        # Set this to False to speed up. However, if set to False, changing max_tokens after
        # the first call of this method has no effect.
    )
    return batch_iterator


if __name__ == '__main__':
    args = parse_args()
    # TODO: the return value is a tuple, not a cfg dict
    hw5_config = get_cfg(args)
    task_cfg = TranslationConfig(
        data=hw5_config.get("data_path"),
        source_lang=hw5_config.get("source_lang"),
        target_lang=hw5_config.get("target_lang"),
        train_subset="train",
        required_seq_len_multiple=8,
        dataset_impl="mmap",
        upsample_primary=1,
    )
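    # Note: data_path is expected to point at fairseq-preprocess output for the
    # configured language pair (binarized shards plus dict.<lang>.txt files);
    # this is an assumption about the surrounding setup, not shown in this example.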
    task = TranslationTask.setup_task(task_cfg)
    # The split has to be loaded before an iterator over it can be built.
    task.load_dataset(split="valid", epoch=1)
    demo_epoch_obj = load_data_iterator(task,
                                        "valid",
                                        epoch=1,
                                        max_tokens=20,
                                        num_workers=1,
                                        cached=False)
    demo_iter = demo_epoch_obj.next_epoch_itr(shuffle=True)
    sample = next(demo_iter)
    print(sample)
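    # A fairseq batch is a nested dict of tensors; the loop below is illustrative
    # (key names such as "id", "net_input" and "target" follow fairseq's usual
    # batch layout) and prints the shape of each entry.
    for key, value in sample.items():
        if isinstance(value, dict):
            for inner_key, inner_value in value.items():
                print(f"{key}.{inner_key}:", getattr(inner_value, "shape", inner_value))
        else:
            print(f"{key}:", getattr(value, "shape", value))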
Example #2
(A task method; it assumes module-level imports of torch, argparse, and fairseq's checkpoint_utils, TranslationTask and LanguageModelingTask.)
    def build_generator(self,
                        models,
                        args,
                        seq_gen_cls=None,
                        extra_gen_cls_kwargs=None):
        if getattr(args, "score_reference", False):
            raise NotImplementedError()
        else:
            from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator
            use_cuda = torch.cuda.is_available() and not self.args.cpu
            assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!'
            assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs'
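            # The channel model scores the reverse direction (target -> source),
            # so set up a translation task with the language pair swapped.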
            if self.args.channel_model is not None:
                import copy
                ch_args_task = copy.deepcopy(self.args)
                tmp = ch_args_task.source_lang
                ch_args_task.source_lang = ch_args_task.target_lang
                ch_args_task.target_lang = tmp
                ch_args_task._name = 'translation'
                channel_task = TranslationTask.setup_task(ch_args_task)

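            # Build a bare language-modeling task over --lm-data just to obtain the
            # LM's output dictionary, which maps between the LM and bitext vocabularies.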
            arg_dict = {}
            arg_dict['task'] = 'language_modeling'
            arg_dict['sample_break_mode'] = 'eos'
            arg_dict['data'] = self.args.lm_data
            arg_dict['output_dictionary_size'] = -1
            lm_args = argparse.Namespace(**arg_dict)
            lm_task = LanguageModelingTask.setup_task(lm_args)
            lm_dict = lm_task.output_dictionary

            if self.args.channel_model is not None:
                channel_models, _ = checkpoint_utils.load_model_ensemble(
                    self.args.channel_model.split(':'), task=channel_task)

                for model in channel_models:
                    model.make_generation_fast_(
                        beamable_mm_beam_size=None
                        if args.no_beamable_mm else args.beam,
                        need_attn=args.print_alignment,
                    )
                    if self.args.fp16:
                        model.half()
                    if use_cuda:
                        model.cuda()
            else:
                channel_models = None

            lm_models, _ = checkpoint_utils.load_model_ensemble(
                self.args.lm_model.split(':'), task=lm_task)

            for model in lm_models:
                model.make_generation_fast_(
                    beamable_mm_beam_size=None
                    if args.no_beamable_mm else args.beam,
                    need_attn=args.print_alignment,
                )
                if self.args.fp16:
                    model.half()
                if use_cuda:
                    model.cuda()
            return NoisyChannelSequenceGenerator(
                combine_method=self.args.combine_method,
                tgt_dict=self.target_dictionary,
                src_dict=self.source_dictionary,
                beam_size=getattr(args, 'beam', 5),
                max_len_a=getattr(args, 'max_len_a', 0),
                max_len_b=getattr(args, 'max_len_b', 200),
                min_len=getattr(args, 'min_len', 1),
                len_penalty=getattr(args, 'lenpen', 1),
                unk_penalty=getattr(args, 'unkpen', 0),
                temperature=getattr(args, 'temperature', 1.),
                match_source_len=getattr(args, 'match_source_len', False),
                no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
                normalize_scores=(not getattr(args, 'unnormalized', False)),
                channel_models=channel_models,
                k2=getattr(self.args, 'k2', 50),
                ch_weight=getattr(self.args, 'ch_wt', 1),
                channel_scoring_type=self.args.channel_scoring_type,
                top_k_vocab=self.args.top_k_vocab,
                lm_models=lm_models,
                lm_dict=lm_dict,
                lm_weight=getattr(self.args, 'lm_wt', 1),
                normalize_lm_scores_by_tgt_len=getattr(
                    self.args, 'normalize_lm_scores_by_tgt_len', False),
            )
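The generator returned above combines the direct model, the channel model, and the language model when scoring hypotheses. As a rough illustration of what the ch_wt and lm_wt arguments control (a sketch of the noisy-channel scoring idea, not the generator's actual internals):

def noisy_channel_score(direct_lprob, channel_lprob, lm_lprob,
                        ch_weight=1.0, lm_weight=1.0):
    # Log-linear combination: log p(y|x) + ch_wt * log p(x|y) + lm_wt * log p(y).
    # Illustrative only; the real generator applies the weights inside beam search
    # and supports several combine_method options.
    return direct_lprob + ch_weight * channel_lprob + lm_weight * lm_lprob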
Example #3
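The conversion function below relies on the following imports; the exact module paths (notably for TransformerModel and WEIGHTS_NAME) are assumptions based on current fairseq and transformers layouts:

import json
import os
from collections import OrderedDict
from os.path import basename, dirname
from shutil import copyfile

import torch

from fairseq.models.transformer import TransformerModel
from fairseq.tasks.translation import TranslationTask
from transformers import FSMTConfig, FSMTForConditionalGeneration
from transformers.utils import WEIGHTS_NAME
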
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path,
                                       pytorch_dump_folder_path,
                                       data_path,
                                       spm_model_path=None):
    # assumes a joint (shared source/target) dictionary

    json_indent = 2

    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # Point the checkpoint's task/model config at the local data path so the
    # dictionaries can be resolved by from_pretrained below. Note that this
    # rewrites the original checkpoint file in place.
    chkpt = torch.load(fsmt_checkpoint_path)
    chkpt['cfg']['task'].data = data_path
    chkpt['cfg']['model'].data = data_path
    torch.save(chkpt, fsmt_checkpoint_path)

    task_args, model_args = chkpt['cfg']['task'], chkpt['cfg']['model']

    task = TranslationTask.setup_task(task_args)
    model = task.build_model(model_args)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path,
                                          "config.json")

    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": model_args.activation_dropout,
        "activation_function": "relu",
        "attention_dropout": model_args.attention_dropout,
        "d_model": model_args.decoder_embed_dim,
        "dropout": model_args.dropout,
        "init_std": 0.02,
        "max_position_embeddings": model_args.max_source_positions,
        "num_hidden_layers": model_args.encoder_layers,
        "src_vocab_size": len(task.source_dictionary),
        "tgt_vocab_size": len(task.target_dictionary),
        "langs": [task_args.source_lang, task_args.target_lang],
        "encoder_attention_heads": model_args.encoder_attention_heads,
        "encoder_ffn_dim": model_args.encoder_ffn_embed_dim,
        "encoder_layerdrop": model_args.encoder_layerdrop,
        "encoder_layers": model_args.encoder_layers,
        "decoder_attention_heads": model_args.decoder_attention_heads,
        "decoder_ffn_dim": model_args.decoder_ffn_embed_dim,
        "decoder_layerdrop": model_args.decoder_layerdrop,
        "decoder_layers": model_args.decoder_layers,
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not model_args.no_scale_embedding,
        "tie_word_embeddings": model_args.share_all_embeddings,
        "share_decoder_input_output_embed": model_args.share_decoder_input_output_embed,
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # model
    hub_gen = TransformerModel.from_pretrained(
        dirname(fsmt_checkpoint_path),
        checkpoint_file=basename(fsmt_checkpoint_path),
        data_name_or_path=task_args.data)

    model_state_dict = hub_gen.models[0].state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(
        ("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        #"model.encoder_embed_tokens.weight",
        #"model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)

    #print(model_state_dict.keys())

    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    model_new.load_state_dict(model_state_dict, strict=False)
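    # strict=False tolerates the keys deliberately dropped above (they show up as
    # missing keys) as well as any buffers the new model registers on its own.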

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path,
                                             WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    pytorch_vocab_dump_path = os.path.join(pytorch_dump_folder_path,
                                           "vocab.txt")
    print(f"Generating {pytorch_vocab_dump_path}")
    assert hub_gen.src_dict.indices == hub_gen.tgt_dict.indices
    with open(pytorch_vocab_dump_path, 'w') as f:
        for item in hub_gen.src_dict.indices:
            f.write("%s\n" % item)

    if spm_model_path is not None:
        copyfile(spm_model_path, f"{pytorch_dump_folder_path}/spm_model.spm")

    print("Conversion is done!")
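
A minimal command-line wrapper for the function above might look like the following; the flag names are illustrative and not taken from the original script:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert a fairseq FSMT checkpoint to a transformers FSMT model.")
    parser.add_argument("--fsmt_checkpoint_path", required=True, help="path to checkpoint.pt")
    parser.add_argument("--pytorch_dump_folder_path", required=True, help="output directory")
    parser.add_argument("--data_path", required=True, help="fairseq data directory with the dictionaries")
    parser.add_argument("--spm_model_path", default=None, help="optional sentencepiece model to copy")
    cli_args = parser.parse_args()

    convert_fsmt_checkpoint_to_pytorch(
        cli_args.fsmt_checkpoint_path,
        cli_args.pytorch_dump_folder_path,
        cli_args.data_path,
        cli_args.spm_model_path,
    )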