def build(self, filepath=None, vocab_path=None, threshold=-1, max_vocab=-1):
    if vocab_path and os.path.exists(vocab_path):
        print("loading vocab from {}".format(vocab_path))
        d = Dictionary.load(vocab_path)
        print('vocab size {}'.format(len(d)))
    else:
        print("building vocab...")
        d = Dictionary()
        for step, line in enumerate(sentence_iterator(filepath)):
            if not step % 1000:
                print("working on {}kth line".format(step // 1000), end='\r')
            tokens = [self.get_lemma(w) for w in line]
            for tok in tokens:
                d.add_symbol(tok)
        d.finalize(threshold=threshold, nwords=max_vocab)
        print('build done. vocab size {}'.format(len(d)))
        d.save('{}/dict.txt'.format(self.data_dir))
    self.vocab = d
    self.unk = self.vocab.unk()
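# A minimal round-trip sketch (hypothetical path under data_dir): the dict.txt
# written by Dictionary.save() is plain text with one "<symbol> <count>" pair per
# line, so the vocabulary built above can be reloaded directly with Dictionary.load.
d = Dictionary.load('data/dict.txt')
print(len(d), d.unk_word, d.unk())  # size, unk symbol, unk index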
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, Token] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
) -> Vocabulary:
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class and is
    used to load the vocab file.
    """
    dictionary = dictionary_class.load(vocab_file)
    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    if min_count > 0 or max_vocab > 0:
        dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
    if tokens_to_add:
        for token in tokens_to_add:
            dictionary.add_symbol(token)
    return Vocabulary(
        dictionary.symbols, dictionary.count, replacements=special_token_replacements
    )
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, SpecialToken] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
):
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class and is
    used to load the vocab file.
    """
    if not special_token_replacements:
        special_token_replacements = {
            "<pad>": SpecialTokens.PAD,
            "<s>": SpecialTokens.BOS,
            "</s>": SpecialTokens.EOS,
            "<unk>": SpecialTokens.UNK,
            "<mask>": SpecialTokens.MASK,
        }
    with PathManager.open(vocab_file) as f:
        dictionary = dictionary_class.load(f)
        # finalize will sort the dict based on frequency so only do this if
        # a min_count or max_vocab size is specified
        if min_count > 0 or max_vocab > 0:
            dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
        if tokens_to_add:
            for token in tokens_to_add:
                dictionary.add_symbol(token)
        return Vocabulary(
            dictionary.symbols,
            dictionary.count,
            replacements=special_token_replacements,
        )
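# A usage sketch for the PyText helper above, assuming a fairseq-format vocab
# file at "model/dict.txt" (hypothetical path) and a couple of extra symbols
# to append after finalization:
pytext_vocab = build_fairseq_vocab(
    "model/dict.txt",
    max_vocab=50000,
    tokens_to_add=["<extra_0>", "<extra_1>"],
)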
@classmethod
def from_config(cls, config: Config):
    dictionary = Dictionary.load(config.token_dictionary_path)
    bpe = create_gpt2_bpe(config.bpe_encoder_path, config.bpe_vocab_path)
    # This hacks the bpe instance to be picklable
    bpe = copy.copy(bpe)
    bpe.__class__ = PickleableGPT2BPEEncoder
    return cls(bpe, dictionary)
@classmethod
def load_model(cls, vocab_path, model_path, embedding_size=300, cpu=False):
    d = Dictionary.load(vocab_path)
    vocab_size = len(d)
    model = Word2Vec(vocab_size=vocab_size, embedding_size=embedding_size)
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=1, weights=None)
    sgns.load_state_dict(torch.load(model_path))
    sgns.eval()
    use_cuda = torch.cuda.is_available() and not cpu
    return cls(sgns, d, use_cuda)
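# A hedged usage sketch for the classmethod above; EmbeddingModel is a
# hypothetical name for the owning class, and both paths are placeholders.
model = EmbeddingModel.load_model(
    vocab_path="data/dict.txt",
    model_path="checkpoints/sgns.pt",
    cpu=True,
)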
def output_trained_embeddings_to_file(emb, dict_path, tgt_path):
    emb_dict = Dictionary.load(dict_path)
    emb = emb.data
    with open(tgt_path, 'w') as f:
        # temporarily redirect stdout so print() writes the embeddings to the
        # target file in word2vec text format (header line, then one word per line)
        sys.stdout = f
        print(emb.shape[0], emb.shape[1])
        for i in range(emb.shape[0]):
            print(emb_dict.symbols[i], ' '.join(['%f' % x for x in emb[i]]))
    sys.stdout = sys.__stdout__
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, str] = None,
    unk_token: str = "<unk>",
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
):
    """Function builds a torchtext Vocab for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class and is
    used to load the vocab file.
    """
    if not special_token_replacements:
        special_token_replacements = {
            "<pad>": "__PAD__",
            "<s>": "__BEGIN_OF_SENTENCE__",
            "</s>": "__END_OF_SENTENCE__",
            "<unk>": "__UNKNOWN__",
            "<mask>": "__MASK__",
        }
    unk_replacement = (
        special_token_replacements[unk_token]
        if unk_token in special_token_replacements
        else unk_token
    )
    # iterate over the (original, replacement) pairs, not over the bare keys
    special_tokens_to_remove = [
        special_pair[0] for special_pair in special_token_replacements.items()
    ]
    special_tokens_to_add = tuple(
        special_pair[1]
        for special_pair in special_token_replacements.items()
        if special_pair[0] != unk_token
    )

    with open(vocab_file) as f:
        dictionary = dictionary_class.load(f)
        # finalize will sort the dict based on frequency so only do this if
        # a min_count or max_vocab size is specified
        if min_count > 0 or max_vocab > 0:
            dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
        if tokens_to_add:
            for token in tokens_to_add:
                dictionary.add_symbol(token)

        dictionary_items = list(zip(dictionary.symbols, dictionary.count))

        ordered_dict = OrderedDict()
        # add special tokens to beginning of ordered_dict
        for s in special_tokens_to_add:
            ordered_dict[s] = 1

        # add all other tokens from dictionary_items
        for token, freq in dictionary_items:
            ordered_dict[token] = freq

        # remove special_tokens_to_remove from dict
        for s in special_tokens_to_remove:
            if s in ordered_dict:
                del ordered_dict[s]

        return vocab(ordered_dict, unk_token=unk_replacement)
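# A quick sketch of calling the torchtext variant above (path is a placeholder);
# membership and index lookups go through the returned torchtext Vocab object.
v = build_fairseq_vocab("model/dict.txt", min_count=5)
print(len(v))
if "hello" in v:
    print(v["hello"])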
def initalize_kaldi(cfg: KaldiInitializerConfig) -> Path:
    if cfg.fst_dir is None:
        cfg.fst_dir = osp.join(cfg.data_dir, "kaldi")
    if cfg.out_labels is None:
        cfg.out_labels = cfg.in_labels

    kaldi_root = Path(cfg.kaldi_root)
    data_dir = Path(cfg.data_dir)
    fst_dir = Path(cfg.fst_dir)
    fst_dir.mkdir(parents=True, exist_ok=True)

    arpa_base = osp.splitext(osp.basename(cfg.lm_arpa))[0]
    unique_label = f"{cfg.in_labels}.{arpa_base}"

    with open(data_dir / f"dict.{cfg.in_labels}.txt", "r") as f:
        vocab = Dictionary.load(f)

    in_units_file = create_units(fst_dir, cfg.in_labels, vocab)

    grammar_graph, out_words_file = create_G(
        kaldi_root, fst_dir, Path(cfg.lm_arpa), arpa_base
    )

    disambig_lexicon_file, disambig_L_in_units_file = create_lexicon(
        cfg, fst_dir, unique_label, in_units_file, out_words_file
    )

    h_graph, h_out_units_file, disambig_in_units_file_int = create_H(
        kaldi_root,
        fst_dir,
        disambig_L_in_units_file,
        cfg.in_labels,
        vocab,
        cfg.blank_symbol,
        cfg.silence_symbol,
    )
    lexicon_graph = create_L(
        kaldi_root,
        fst_dir,
        unique_label,
        disambig_lexicon_file,
        disambig_L_in_units_file,
        out_words_file,
    )
    lg_graph = create_LG(
        kaldi_root, fst_dir, unique_label, lexicon_graph, grammar_graph
    )
    hlga_graph = create_HLGa(
        kaldi_root, fst_dir, unique_label, h_graph, lg_graph, disambig_in_units_file_int
    )
    hlg_graph = create_HLG(kaldi_root, fst_dir, unique_label, hlga_graph)

    # for debugging
    # hla_graph = create_HLa(kaldi_root, fst_dir, unique_label, h_graph, lexicon_graph, disambig_in_units_file_int)
    # hl_graph = create_HLG(kaldi_root, fst_dir, unique_label, hla_graph, prefix="HL_looped")
    # create_HLG(kaldi_root, fst_dir, "phnc", h_graph, prefix="H_looped")

    return hlg_graph
def train(args):
    d = Dictionary.load(args.vocab)
    wf = np.array(d.count)
    wf[wf == 0] = 1
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(d)
    weights = wf if args.weights else None
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    model = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    modelpath = os.path.join(args.save_dir, '{}.pt'.format(args.name))
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=args.n_negs,
                weights=weights, pad=d.unk())
    if os.path.isfile(modelpath) and args.conti:
        sgns.load_state_dict(t.load(modelpath))
    if args.cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    optimpath = os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))
    if os.path.isfile(optimpath) and args.conti:
        optim.load_state_dict(t.load(optimpath))
    dataset = PermutedSubsampledCorpus(args.data, ws=ws)
    dataloader = DataLoader(dataset, batch_size=args.mb, shuffle=True, num_workers=0)
    for epoch in range(1, args.epoch + 1):
        total_batches = int(np.ceil(len(dataset) / args.mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
        t.save(
            sgns.state_dict(),
            os.path.join(args.save_dir, '{}-e{}.pt'.format(args.name, epoch)))
        t.save(
            optim.state_dict(),
            os.path.join(args.save_dir, '{}-e{}.optim.pt'.format(args.name, epoch)))
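# The argparse fields consumed by train() above, gathered in one place as a
# sketch; the paths and values are illustrative placeholders, not the project's defaults.
import argparse

example_args = argparse.Namespace(
    vocab='data/dict.txt', data='data/train.dat', save_dir='ckpt', name='sgns',
    e_dim=300, n_negs=5, mb=512, epoch=5, ss_t=1e-5,
    weights=True, conti=False, cuda=False,
)
# train(example_args)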
def __init__(self, state_dict, vocab_path):
    self.dictionary = Dictionary.load(vocab_path)
    if any(
        k in state_dict["model"]
        for k in ["encoder.layer_norm.weight", "layer_norm.weight"]
    ):
        self.dictionary.add_symbol("<mask>")
    cfg = state_dict["cfg"]["model"]
    self.sentemb_criterion = cfg.sentemb_criterion
    self.pad_idx = self.dictionary.pad_index
    self.bos_idx = self.dictionary.bos_index
    embed_tokens = Embedding(
        len(self.dictionary), cfg.encoder_embed_dim, self.pad_idx,
    )
    super().__init__(cfg, self.dictionary, embed_tokens)
    if "decoder.version" in state_dict["model"]:
        self._remove_decoder_layers(state_dict)
    if "layer_norm.weight" in state_dict["model"]:
        self.layer_norm = LayerNorm(cfg.encoder_embed_dim)
    self.load_state_dict(state_dict["model"])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append(".")
import os

from fairseq.data.indexed_dataset import IndexedDataset
from fairseq.data.dictionary import Dictionary
from argparse import ArgumentParser

ap = ArgumentParser()
ap.add_argument("prefix")
ap.add_argument("dict")
ap.add_argument("--save", type=str)
args = ap.parse_args()

index = IndexedDataset(args.prefix)
dict = Dictionary.load(args.dict)
print("len", len(index))

with open(args.save, "w") as outf:
    for i in range(len(index)):
        outf.write(dict.string(index[i] - 1) + "\n")
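# Example invocation of the dump script above; the script name and paths are
# placeholders for a binarized fairseq dataset prefix and its dictionary file:
#   python dump_text.py data-bin/train.en-de.en data-bin/dict.en.txt --save train.en.txt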
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path):
    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # handle various types of models
    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)

    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kwargs = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    # note: since the model dump is old, fairseq has upgraded its model some
    # time later, and it does a whole lot of rewrites and splits on the saved
    # weights, therefore we can't use torch.load() directly on the model file.
    # see: upgrade_state_dict(state_dict) in fairseq_model.py
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(
        fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs
    )

    args = vars(chkpt["args"]["model"])

    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]

    data_root = dirname(pytorch_dump_folder_path)
    model_dir = basename(pytorch_dump_folder_path)

    # dicts
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")

    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    src_vocab_size = len(src_vocab)
    src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json")
    print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

    # detect whether this is a do_lower_case situation, which can be derived by checking whether we
    # have at least one upcase letter in the source vocab
    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break

    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    tgt_vocab_size = len(tgt_vocab)
    tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json")
    print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))

    # merges_file (bpecodes)
    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
    for fn in ["bpecodes", "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")

    # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe -
    # may have to modify the tokenizer if a different type is used by a future model
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}"

    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": args["activation_dropout"],
        "activation_function": "relu",
        "attention_dropout": args["attention_dropout"],
        "d_model": args["decoder_embed_dim"],
        "dropout": args["dropout"],
        "init_std": 0.02,
        "max_position_embeddings": args["max_source_positions"],
        "num_hidden_layers": args["encoder_layers"],
        "src_vocab_size": src_vocab_size,
        "tgt_vocab_size": tgt_vocab_size,
        "langs": [src_lang, tgt_lang],
        "encoder_attention_heads": args["encoder_attention_heads"],
        "encoder_ffn_dim": args["encoder_ffn_embed_dim"],
        "encoder_layerdrop": args["encoder_layerdrop"],
        "encoder_layers": args["encoder_layers"],
        "decoder_attention_heads": args["decoder_attention_heads"],
        "decoder_ffn_dim": args["decoder_ffn_embed_dim"],
        "decoder_layerdrop": args["decoder_layerdrop"],
        "decoder_layers": args["decoder_layers"],
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not args["no_scale_embedding"],
        "tie_word_embeddings": args["share_all_embeddings"],
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]:
        model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"]
    else:
        model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # tokenizer config
    fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)

    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }

    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))

    # model
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)

    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    model_new.load_state_dict(model_state_dict, strict=False)

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")
def build_vocab(self):
    print('vocab path:', self.vocab_path)
    if os.path.exists(self.vocab_path):
        self._vocab = Dictionary.load(self.vocab_path)
    else:
        self.rebuild_vocab()
@classmethod
def from_pickle(cls, pkl_file, vocab_file):
    d = Dictionary.load(vocab_file)
    with open(pkl_file, 'rb') as fin:
        obj = pickle.load(fin)
    return cls(obj['vectors'], d)
@classmethod
def setup_task(cls, args, **kwargs):
    dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
    print('| Dictionary: {} types'.format(len(dictionary)), flush=True)
    return cls(args, dictionary)
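# Sketch of the fairseq Dictionary pieces a task set up as above relies on
# (path is a placeholder): the size and special-symbol indices come straight
# from the loaded dict.txt.
d = Dictionary.load('data-bin/dict.txt')
print(len(d), d.pad(), d.eos(), d.unk())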
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_embed_dim = decoder_ffn_embed_dim
        self.adaptive_softmax_cutoff = adaptive_softmax_cutoff
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_fn = activation_fn
        self.fp16 = False
        self.adaptive_input = adaptive_input
        self.quant_noise_pq = quant_noise_pq
        assert self.decoder_embed_dim == self.decoder_output_dim


args = Args(0, True, 1024, 1024, 1024, False, False, False, True, 1, 16, 4096, None, 0.1, 0.1)
embedding = nn.Embedding(13672, 1024, padding_idx=1)
dictionary = Dictionary.load('resource/dict.txt')


def main():
    torch.set_grad_enabled(False)

    input = np.random.randint(3, 9000, seq_len * batch, dtype="int64")
    inputs = np.random.randint(3, 9000, 1 * batch, dtype="int64")
    input_full = torch.from_numpy(input).long().reshape(batch, seq_len).cuda()
    input_inc = torch.from_numpy(inputs).long().reshape(batch, 1).cuda()

    data_type = torch.float32
    if using_half:
        data_type = torch.float16

    torch_decoder = TransformerDecoder(args, dictionary, embedding, True)
    if using_half:
def build_vocab(self):
    if os.path.exists(self.vocab_path):
        self._vocab = Dictionary.load(self.vocab_path)
    else:
        self.rebuild_vocab()
def to_pytorch(fsmt_checkpoint_path, save_path):
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(save_path, exist_ok=True)
    print(f"Writing results to {save_path}")
    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)
    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kw = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(
        fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kw
    )
    args = vars(chkpt["args"]["model"])
    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]
    data_root = dirname(save_path)
    model_dir = basename(save_path)
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")
    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    s_src_vocab = len(src_vocab)
    src_vocab_file = os.path.join(save_path, "vocab-src.json")
    print(f"Generating {src_vocab_file} of {s_src_vocab} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break
    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    s_tgt_vocab = len(tgt_vocab)
    tgt_vocab_file = os.path.join(save_path, "vocab-tgt.json")
    print(f"Generating {tgt_vocab_file} of {s_tgt_vocab} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))
    merges_file = os.path.join(save_path, VOCAB_FS["merges_file"])
    for fn in ["bpecodes", "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)
    fsmt_model_config_file = os.path.join(save_path, "config.json")
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}"
    # the fairseq checkpoint keeps the original fairseq arg names; only the
    # emitted config uses the renamed keys
    model_conf = {
        "archs": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "drop_act": args["activation_dropout"],
        "act_fun": "relu",
        "drop_attn": args["attention_dropout"],
        "d_hidden": args["decoder_embed_dim"],
        "drop": args["dropout"],
        "init_std": 0.02,
        "n_pos": args["max_source_positions"],
        "n_lays": args["encoder_layers"],
        "s_src_vocab": s_src_vocab,
        "s_tgt_vocab": s_tgt_vocab,
        "langs": [src_lang, tgt_lang],
        "n_enc_heads": args["encoder_attention_heads"],
        "d_enc_ffn": args["encoder_ffn_embed_dim"],
        "drop_enc": args["encoder_layerdrop"],
        "n_enc_lays": args["encoder_layers"],
        "n_dec_heads": args["decoder_attention_heads"],
        "d_dec_ffn": args["decoder_ffn_embed_dim"],
        "drop_dec": args["decoder_layerdrop"],
        "n_dec_lays": args["decoder_layers"],
        "BOS": 0,
        "PAD": 1,
        "EOS": 2,
        "is_enc_dec": True,
        "scale": not args["no_scale_embedding"],
        "tie_word_embeds": args["share_all_embeddings"],
    }
    model_conf["n_beams"] = 5
    model_conf["early_stop"] = False
    if model_dir in best_score_hparams and "len_penalty" in best_score_hparams[model_dir]:
        model_conf["len_penalty"] = best_score_hparams[model_dir]["len_penalty"]
    else:
        model_conf["len_penalty"] = 1.0
    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
    fsmt_tokenizer_config_file = os.path.join(save_path, TOKENIZER_CONFIG_FILE)
    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }
    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)
    config = PreTrained.from_pretrained(save_path)
    model_new = ForConditionalGen(config)
    model_new.load_state_dict(model_state_dict, strict=False)
    pytorch_weights_dump_path = os.path.join(save_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)
    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")