        epoch=epoch,
        disable_iterator_cache=not cached,
        # Set this to False to speed up. However, if set to False, changing
        # max_tokens beyond the first call of this method has no effect.
    )
    return batch_iterator


if __name__ == '__main__':
    args = parse_args()
    # TODO: the return value is a tuple rather than a cfg dict
    hw5_config = get_cfg(args)
    task_cfg = TranslationConfig(
        data=hw5_config.get("data_path"),
        source_lang=hw5_config.get("source_lang"),
        target_lang=hw5_config.get("target_lang"),
        train_subset="train",
        required_seq_len_multiple=8,
        dataset_impl="mmap",
        upsample_primary=1,
    )
    task = TranslationTask.setup_task(task_cfg)
    # the split must be loaded before a batch iterator can be built over it
    task.load_dataset("valid")
    demo_epoch_obj = load_data_iterator(
        task, "valid", epoch=1, max_tokens=20, num_workers=1, cached=False
    )
    demo_iter = demo_epoch_obj.next_epoch_itr(shuffle=True)
    sample = next(demo_iter)
    print(sample)
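    # Sketch (an assumption, not part of the original): a training loop would
    # consume the returned EpochBatchIterator across epochs like this;
    # next_epoch_idx and next_epoch_itr() are fairseq's API, while the
    # two-epoch bound and the reuse of the "valid" split are illustrative.
    epoch_obj = load_data_iterator(
        task, "valid", epoch=1, max_tokens=20, num_workers=1, cached=False
    )
    while epoch_obj.next_epoch_idx <= 2:
        epoch_iter = epoch_obj.next_epoch_itr(shuffle=True)
        for batch in epoch_iter:
            # each batch is a dict with "id", "net_input", "target", "ntokens", ...
            pass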
def build_generator(self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None):
    if getattr(args, "score_reference", False):
        raise NotImplementedError()
    else:
        from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator

        use_cuda = torch.cuda.is_available() and not self.args.cpu
        assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!'
        assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs'

        if self.args.channel_model is not None:
            import copy
            # the channel model translates target -> source, so swap the language pair
            ch_args_task = copy.deepcopy(self.args)
            tmp = ch_args_task.source_lang
            ch_args_task.source_lang = ch_args_task.target_lang
            ch_args_task.target_lang = tmp
            ch_args_task._name = 'translation'
            channel_task = TranslationTask.setup_task(ch_args_task)

        # set up a language modeling task for the LM used to rescore hypotheses
        arg_dict = {
            'task': 'language_modeling',
            'sample_break_mode': 'eos',
            'data': self.args.lm_data,
            'output_dictionary_size': -1,
        }
        lm_args = argparse.Namespace(**arg_dict)
        lm_task = LanguageModelingTask.setup_task(lm_args)
        lm_dict = lm_task.output_dictionary

        if self.args.channel_model is not None:
            channel_models, _ = checkpoint_utils.load_model_ensemble(
                self.args.channel_model.split(':'), task=channel_task)

            for model in channel_models:
                model.make_generation_fast_(
                    beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                    need_attn=args.print_alignment,
                )
                if self.args.fp16:
                    model.half()
                if use_cuda:
                    model.cuda()
        else:
            channel_models = None

        lm_models, _ = checkpoint_utils.load_model_ensemble(
            self.args.lm_model.split(':'), task=lm_task)

        for model in lm_models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                need_attn=args.print_alignment,
            )
            if self.args.fp16:
                model.half()
            if use_cuda:
                model.cuda()

        return NoisyChannelSequenceGenerator(
            combine_method=self.args.combine_method,
            tgt_dict=self.target_dictionary,
            src_dict=self.source_dictionary,
            beam_size=getattr(args, 'beam', 5),
            max_len_a=getattr(args, 'max_len_a', 0),
            max_len_b=getattr(args, 'max_len_b', 200),
            min_len=getattr(args, 'min_len', 1),
            len_penalty=getattr(args, 'lenpen', 1),
            unk_penalty=getattr(args, 'unkpen', 0),
            temperature=getattr(args, 'temperature', 1.),
            match_source_len=getattr(args, 'match_source_len', False),
            no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
            normalize_scores=(not getattr(args, 'unnormalized', False)),
            channel_models=channel_models,
            k2=getattr(self.args, 'k2', 50),
            ch_weight=getattr(self.args, 'ch_wt', 1),
            channel_scoring_type=self.args.channel_scoring_type,
            top_k_vocab=self.args.top_k_vocab,
            lm_models=lm_models,
            lm_dict=lm_dict,
            lm_weight=getattr(self.args, 'lm_wt', 1),
            normalize_lm_scores_by_tgt_len=getattr(
                self.args, 'normalize_lm_scores_by_tgt_len', False),
        )
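# Usage sketch (an assumption, not in the original file): the noisy-channel
# generator is driven like any fairseq generator, via task.inference_step().
# `args.path` holding the direct-model checkpoint(s) mirrors fairseq-generate;
# the helper name and `batches` argument are illustrative.
def generate_with_noisy_channel(task, args, batches):
    models, _ = checkpoint_utils.load_model_ensemble(args.path.split(':'), task=task)
    generator = task.build_generator(models, args)
    hypos = []
    for sample in batches:
        # beam search with channel-model and LM rescoring happens inside the generator
        hypos.extend(task.inference_step(generator, models, sample))
    return hypos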
import json
import os
from collections import OrderedDict
from os.path import basename, dirname
from shutil import copyfile

import torch
from fairseq.models.transformer import TransformerModel
from fairseq.tasks.translation import TranslationTask

from transformers import WEIGHTS_NAME, FSMTConfig, FSMTForConditionalGeneration


def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path, data_path, spm_model_path=None):
    # assumes a joint source/target dictionary
    json_indent = 2

    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # point the checkpoint's task/model config at the local data dir and persist the patch
    chkpt = torch.load(fsmt_checkpoint_path)
    chkpt['cfg']['task'].data = data_path
    chkpt['cfg']['model'].data = data_path
    torch.save(chkpt, fsmt_checkpoint_path)

    task_args, model_args = chkpt['cfg']['task'], chkpt['cfg']['model']
    task = TranslationTask.setup_task(task_args)
    model = task.build_model(model_args)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": model_args.activation_dropout,
        "activation_function": "relu",
        "attention_dropout": model_args.attention_dropout,
        "d_model": model_args.decoder_embed_dim,
        "dropout": model_args.dropout,
        "init_std": 0.02,
        "max_position_embeddings": model_args.max_source_positions,
        "num_hidden_layers": model_args.encoder_layers,
        "src_vocab_size": len(task.source_dictionary),
        "tgt_vocab_size": len(task.target_dictionary),
        "langs": [task_args.source_lang, task_args.target_lang],
        "encoder_attention_heads": model_args.encoder_attention_heads,
        "encoder_ffn_dim": model_args.encoder_ffn_embed_dim,
        "encoder_layerdrop": model_args.encoder_layerdrop,
        "encoder_layers": model_args.encoder_layers,
        "decoder_attention_heads": model_args.decoder_attention_heads,
        "decoder_ffn_dim": model_args.decoder_ffn_embed_dim,
        "decoder_layerdrop": model_args.decoder_layerdrop,
        "decoder_layers": model_args.decoder_layers,
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not model_args.no_scale_embedding,
        "tie_word_embeddings": model_args.share_all_embeddings,
        "share_decoder_input_output_embed": model_args.share_decoder_input_output_embed,
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # model
    hub_gen = TransformerModel.from_pretrained(
        dirname(fsmt_checkpoint_path),
        checkpoint_file=basename(fsmt_checkpoint_path),
        data_name_or_path=task_args.data,
    )

    model_state_dict = hub_gen.models[0].state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        # "model.encoder_embed_tokens.weight",
        # "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)
    # print(model_state_dict.keys())

    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    model_new.load_state_dict(model_state_dict, strict=False)

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    pytorch_vocab_dump_path = os.path.join(pytorch_dump_folder_path, "vocab.txt")
    print(f"Generating {pytorch_vocab_dump_path}")
    # the joint-dictionary assumption: source and target vocabs must match
    assert hub_gen.src_dict.indices == hub_gen.tgt_dict.indices
    with open(pytorch_vocab_dump_path, 'w') as f:
        for item in hub_gen.src_dict.indices:
            f.write("%s\n" % item)

    if spm_model_path is not None:
        copyfile(spm_model_path, f"{pytorch_dump_folder_path}/spm_model.spm")

    print("Conversion is done!")
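# A minimal CLI wrapper for the converter — a sketch only; the flag names below
# mirror the function's parameters and are assumptions, not the original script's.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--fsmt_checkpoint_path", required=True, type=str,
                        help="path to the fairseq checkpoint, e.g. checkpoint_best.pt")
    parser.add_argument("--pytorch_dump_folder_path", required=True, type=str,
                        help="output directory for the converted PyTorch model")
    parser.add_argument("--data_path", required=True, type=str,
                        help="fairseq data directory containing the dictionaries")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="optional sentencepiece model to copy next to the weights")
    cli_args = parser.parse_args()
    convert_fsmt_checkpoint_to_pytorch(
        cli_args.fsmt_checkpoint_path,
        cli_args.pytorch_dump_folder_path,
        cli_args.data_path,
        cli_args.spm_model_path,
    )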