Example #1
    def build(self,
              filepath=None,
              vocab_path=None,
              threshold=-1,
              max_vocab=-1):
        if vocab_path and os.path.exists(vocab_path):
            print("loading vocab from {}".format(vocab_path))
            d = Dictionary.load(vocab_path)
            print('vocab size {}'.format(len(d)))
        else:
            print("building vocab...")
            d = Dictionary()
            for step, line in enumerate(sentence_iterator(filepath)):
                if not step % 1000:
                    print("working on {}kth line".format(step // 1000),
                          end='\r')
                tokens = [self.get_lemma(w) for w in line]
                for tok in tokens:
                    d.add_symbol(tok)
            d.finalize(threshold=threshold, nwords=max_vocab)
            print('build done. vocab size {}'.format(len(d)))
            d.save('{}/dict.txt'.format(self.data_dir))

        self.vocab = d
        self.unk = self.vocab.unk()
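
A hedged usage sketch: the enclosing class is not shown above, so CorpusVocabBuilder, its constructor, and the file paths are hypothetical; only build()'s signature and the self.vocab / self.unk attributes come from the snippet.

builder = CorpusVocabBuilder(data_dir="data")              # hypothetical class and argument
builder.build(filepath="data/corpus.txt", threshold=5, max_vocab=50000)
print(len(builder.vocab), builder.unk)                     # vocab size and unk index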
Example #2
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, Token] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
) -> Vocabulary:
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    dictionary = dictionary_class.load(vocab_file)
    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    if min_count > 0 or max_vocab > 0:
        dictionary.finalize(threshold=min_count,
                            nwords=max_vocab,
                            padding_factor=1)
    if tokens_to_add:
        for token in tokens_to_add:
            dictionary.add_symbol(token)
    return Vocabulary(dictionary.symbols,
                      dictionary.count,
                      replacements=special_token_replacements)
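
A minimal call sketch, assuming a fairseq-format dict file (one "token count" pair per line) at a hypothetical path:

vocab = build_fairseq_vocab(
    vocab_file="data/dict.txt",   # hypothetical path
    min_count=5,                  # prune tokens seen fewer than 5 times
    max_vocab=50000,              # cap the vocabulary after frequency sorting
    tokens_to_add=["<mask>"],
)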
Example #3
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, SpecialToken] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
):
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    if not special_token_replacements:
        special_token_replacements = {
            "<pad>": SpecialTokens.PAD,
            "<s>": SpecialTokens.BOS,
            "</s>": SpecialTokens.EOS,
            "<unk>": SpecialTokens.UNK,
            "<mask>": SpecialTokens.MASK,
        }
    with PathManager.open(vocab_file) as f:
        dictionary = dictionary_class.load(f)
        # finalize will sort the dict based on frequency so only do this if
        # a min_count or max_vocab size is specified
        if min_count > 0 or max_vocab > 0:
            dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
        if tokens_to_add:
            for token in tokens_to_add:
                dictionary.add_symbol(token)
        return Vocabulary(
            dictionary.symbols,
            dictionary.count,
            replacements=special_token_replacements,
        )
Example #4
    @classmethod
    def from_config(cls, config: Config):
        dictionary = Dictionary.load(config.token_dictionary_path)
        bpe = create_gpt2_bpe(config.bpe_encoder_path, config.bpe_vocab_path)
        # Make the bpe instance picklable by shallow-copying it and swapping in
        # a picklable subclass.
        bpe = copy.copy(bpe)
        bpe.__class__ = PickleableGPT2BPEEncoder

        return cls(bpe, dictionary)
Example #5
    @classmethod
    def load_model(cls, vocab_path, model_path, embedding_size=300, cpu=False):
        d = Dictionary.load(vocab_path)
        vocab_size = len(d)
        model = Word2Vec(vocab_size=vocab_size, embedding_size=embedding_size)
        sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=1, weights=None)
        sgns.load_state_dict(torch.load(model_path))
        sgns.eval()
        use_cuda = torch.cuda.is_available() and not cpu
        return cls(sgns, d, use_cuda)
Example #6
def output_trained_embeddings_to_file(emb, dict_path, tgt_path):
    emb_dict = Dictionary.load(dict_path)
    emb = emb.data
    with open(tgt_path, 'w') as f:
        # word2vec text format: a "rows dims" header line, then one
        # "token v1 v2 ..." line per embedding row
        print(emb.shape[0], emb.shape[1], file=f)
        for i in range(emb.shape[0]):
            print(emb_dict.symbols[i], ' '.join(['%f' % x for x in emb[i]]), file=f)
Example #7
def build_fairseq_vocab(
        vocab_file: str,
        dictionary_class: Dictionary = Dictionary,
        special_token_replacements: Dict[str, str] = None,
        unk_token: str = "<unk>",
        max_vocab: int = -1,
        min_count: int = -1,
        tokens_to_add: Optional[List[str]] = None,
):
    """Function builds a torchtext Vocab for models pre-trained using Fairseq
    modules.
    The dictionary class can take any Fairseq Dictionary class and is
    used to load the vocab file.
    """
    if not special_token_replacements:
        special_token_replacements = {
            "<pad>": "__PAD__",
            "<s>": "__BEGIN_OF_SENTENCE__",
            "</s>": "__END_OF_SENTENCE__",
            "<unk>": "__UNKNOWN__",
            "<mask>": "__MASK__",
        }
    # These must be defined whether or not the caller supplied replacements;
    # iterate over items() so each fairseq token is paired with its replacement.
    unk_replacement = special_token_replacements.get(unk_token, unk_token)
    special_tokens_to_remove = list(special_token_replacements)
    special_tokens_to_add = tuple(
        replacement
        for token, replacement in special_token_replacements.items()
        if token != unk_token
    )

    with open(vocab_file) as f:
        dictionary = dictionary_class.load(f)
        # finalize will sort the dict based on frequency so only do this if
        # a min_count or max_vocab size is specified
        if min_count > 0 or max_vocab > 0:
            dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
        if tokens_to_add:
            for token in tokens_to_add:
                dictionary.add_symbol(token)

        dictionary_items = list(zip(dictionary.symbols, dictionary.count))

        ordered_dict = OrderedDict()
        # add special tokens to beginning of ordered_dict
        for s in special_tokens_to_add:
            ordered_dict[s] = 1

        # add all other tokens from dictionary_items
        for token, freq in dictionary_items:
            ordered_dict[token] = freq

        # remove special_tokens_to_remove from dict
        for s in special_tokens_to_remove:
            if s in ordered_dict:
                del ordered_dict[s]

        return vocab(ordered_dict, unk_token=unk_replacement)
Example #8
def initalize_kaldi(cfg: KaldiInitializerConfig) -> Path:
    if cfg.fst_dir is None:
        cfg.fst_dir = osp.join(cfg.data_dir, "kaldi")
    if cfg.out_labels is None:
        cfg.out_labels = cfg.in_labels

    kaldi_root = Path(cfg.kaldi_root)
    data_dir = Path(cfg.data_dir)
    fst_dir = Path(cfg.fst_dir)
    fst_dir.mkdir(parents=True, exist_ok=True)

    arpa_base = osp.splitext(osp.basename(cfg.lm_arpa))[0]
    unique_label = f"{cfg.in_labels}.{arpa_base}"

    with open(data_dir / f"dict.{cfg.in_labels}.txt", "r") as f:
        vocab = Dictionary.load(f)

    in_units_file = create_units(fst_dir, cfg.in_labels, vocab)

    grammar_graph, out_words_file = create_G(kaldi_root, fst_dir,
                                             Path(cfg.lm_arpa), arpa_base)

    disambig_lexicon_file, disambig_L_in_units_file = create_lexicon(
        cfg, fst_dir, unique_label, in_units_file, out_words_file)

    h_graph, h_out_units_file, disambig_in_units_file_int = create_H(
        kaldi_root,
        fst_dir,
        disambig_L_in_units_file,
        cfg.in_labels,
        vocab,
        cfg.blank_symbol,
        cfg.silence_symbol,
    )
    lexicon_graph = create_L(
        kaldi_root,
        fst_dir,
        unique_label,
        disambig_lexicon_file,
        disambig_L_in_units_file,
        out_words_file,
    )
    lg_graph = create_LG(kaldi_root, fst_dir, unique_label, lexicon_graph,
                         grammar_graph)
    hlga_graph = create_HLGa(kaldi_root, fst_dir, unique_label, h_graph,
                             lg_graph, disambig_in_units_file_int)
    hlg_graph = create_HLG(kaldi_root, fst_dir, unique_label, hlga_graph)

    # for debugging
    # hla_graph = create_HLa(kaldi_root, fst_dir, unique_label, h_graph, lexicon_graph, disambig_in_units_file_int)
    # hl_graph = create_HLG(kaldi_root, fst_dir, unique_label, hla_graph, prefix="HL_looped")
    # create_HLG(kaldi_root, fst_dir, "phnc", h_graph, prefix="H_looped")

    return hlg_graph
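
A hedged invocation sketch: KaldiInitializerConfig's full field list and defaults are not shown above, so the construction below only sets attributes the function reads, and every value is illustrative.

cfg = KaldiInitializerConfig(
    kaldi_root="/opt/kaldi",          # hypothetical Kaldi install location
    data_dir="data/asr",              # must contain dict.<in_labels>.txt
    lm_arpa="data/asr/lm.arpa",       # ARPA language model used to build G
    in_labels="ltr",
    blank_symbol="<blank>",           # illustrative; depends on the acoustic model's unit set
    silence_symbol=None,
)
hlg_path = initalize_kaldi(cfg)       # returns the path of the final HLG graph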
Example #9
def train(args):
    d = Dictionary.load(args.vocab)
    wf = np.array(d.count)
    wf[wf == 0] = 1
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(d)
    weights = wf if args.weights else None
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    model = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    modelpath = os.path.join(args.save_dir, '{}.pt'.format(args.name))
    sgns = SGNS(embedding=model,
                vocab_size=vocab_size,
                n_negs=args.n_negs,
                weights=weights,
                pad=d.unk())
    if os.path.isfile(modelpath) and args.conti:
        sgns.load_state_dict(t.load(modelpath))
    if args.cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    optimpath = os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))
    if os.path.isfile(optimpath) and args.conti:
        optim.load_state_dict(t.load(optimpath))
    dataset = PermutedSubsampledCorpus(args.data, ws=ws)
    dataloader = DataLoader(dataset,
                            batch_size=args.mb,
                            shuffle=True,
                            num_workers=0)
    for epoch in range(1, args.epoch + 1):
        total_batches = int(np.ceil(len(dataset) / args.mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())

        t.save(
            sgns.state_dict(),
            os.path.join(args.save_dir, '{}-e{}.pt'.format(args.name, epoch)))
        t.save(
            optim.state_dict(),
            os.path.join(args.save_dir,
                         '{}-e{}.optim.pt'.format(args.name, epoch)))
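
The command-line arguments train() expects can be read off the attributes it accesses; a hedged argparse sketch (flag names taken from the snippet, defaults are guesses):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--vocab", required=True)             # fairseq dict file
parser.add_argument("--data", required=True)              # corpus path for PermutedSubsampledCorpus
parser.add_argument("--save_dir", default="output")
parser.add_argument("--name", default="sgns")
parser.add_argument("--e_dim", type=int, default=300)     # embedding size
parser.add_argument("--n_negs", type=int, default=5)      # negative samples per target
parser.add_argument("--mb", type=int, default=4096)       # minibatch size
parser.add_argument("--epoch", type=int, default=5)
parser.add_argument("--ss_t", type=float, default=1e-5)   # subsampling threshold
parser.add_argument("--weights", action="store_true")     # frequency-weighted negative sampling
parser.add_argument("--conti", action="store_true")       # resume from existing checkpoints
parser.add_argument("--cuda", action="store_true")

train(parser.parse_args())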
Example #10
    def __init__(self, state_dict, vocab_path):
        self.dictionary = Dictionary.load(vocab_path)
        if any(k in state_dict["model"]
               for k in ["encoder.layer_norm.weight", "layer_norm.weight"]):
            self.dictionary.add_symbol("<mask>")
        cfg = state_dict["cfg"]["model"]
        self.sentemb_criterion = cfg.sentemb_criterion
        self.pad_idx = self.dictionary.pad_index
        self.bos_idx = self.dictionary.bos_index
        embed_tokens = Embedding(
            len(self.dictionary),
            cfg.encoder_embed_dim,
            self.pad_idx,
        )
        super().__init__(cfg, self.dictionary, embed_tokens)
        if "decoder.version" in state_dict["model"]:
            self._remove_decoder_layers(state_dict)
        if "layer_norm.weight" in state_dict["model"]:
            self.layer_norm = LayerNorm(cfg.encoder_embed_dim)
        self.load_state_dict(state_dict["model"])
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append(".")
import os

from fairseq.data.indexed_dataset import IndexedDataset
from fairseq.data.dictionary import Dictionary
from argparse import ArgumentParser

ap = ArgumentParser()
ap.add_argument("prefix")
ap.add_argument("dict")
ap.add_argument("--save", type=str)
args = ap.parse_args()

index = IndexedDataset(args.prefix)
dictionary = Dictionary.load(args.dict)
print("len", len(index))
with open(args.save, "w") as outf:
    for i in range(len(index)):
        outf.write(dictionary.string(index[i] - 1) + "\n")
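
A hedged invocation; the script's filename and data paths below are hypothetical:

python dump_dataset.py data-bin/train.en data-bin/dict.en.txt --save train.en.txt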
Example #12
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path):

    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # handle various types of models

    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)

    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kwargs = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    # note: since the model dump is old, fairseq has upgraded its model some
    # time later, and it does a whole lot of rewrites and splits on the saved
    # weights, therefore we can't use torch.load() directly on the model file.
    # see: upgrade_state_dict(state_dict) in fairseq_model.py
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(
        fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs
    )

    args = vars(chkpt["args"]["model"])

    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]

    data_root = dirname(pytorch_dump_folder_path)
    model_dir = basename(pytorch_dump_folder_path)

    # dicts
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")

    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    src_vocab_size = len(src_vocab)
    src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json")
    print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

    # detect whether this is a do_lower_case model, which can be derived by checking
    # whether the source vocab contains any uppercase letters
    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break

    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    tgt_vocab_size = len(tgt_vocab)
    tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json")
    print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))

    # merges_file (bpecodes)
    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
    for fn in ["bpecodes", "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")

    # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe -
    # may have to modify the tokenizer if a different type is used by a future model
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}"

    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": args["activation_dropout"],
        "activation_function": "relu",
        "attention_dropout": args["attention_dropout"],
        "d_model": args["decoder_embed_dim"],
        "dropout": args["dropout"],
        "init_std": 0.02,
        "max_position_embeddings": args["max_source_positions"],
        "num_hidden_layers": args["encoder_layers"],
        "src_vocab_size": src_vocab_size,
        "tgt_vocab_size": tgt_vocab_size,
        "langs": [src_lang, tgt_lang],
        "encoder_attention_heads": args["encoder_attention_heads"],
        "encoder_ffn_dim": args["encoder_ffn_embed_dim"],
        "encoder_layerdrop": args["encoder_layerdrop"],
        "encoder_layers": args["encoder_layers"],
        "decoder_attention_heads": args["decoder_attention_heads"],
        "decoder_ffn_dim": args["decoder_ffn_embed_dim"],
        "decoder_layerdrop": args["decoder_layerdrop"],
        "decoder_layers": args["decoder_layers"],
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not args["no_scale_embedding"],
        "tie_word_embeddings": args["share_all_embeddings"],
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]:
        model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"]
    else:
        model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # tokenizer config
    fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)

    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }

    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))

    # model
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)

    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    model_new.load_state_dict(model_state_dict, strict=False)

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")
Example #13
    def build_vocab(self):
        print('vocab path:', self.vocab_path)
        if os.path.exists(self.vocab_path):
            self._vocab = Dictionary.load(self.vocab_path)
        else:
            self.rebuild_vocab()
Example #14
    @classmethod
    def from_pickle(cls, pkl_file, vocab_file):
        d = Dictionary.load(vocab_file)
        with open(pkl_file, 'rb') as fin:
            obj = pickle.load(fin)
            return cls(obj['vectors'], d)
Example #15
    @classmethod
    def setup_task(cls, args, **kwargs):
        dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
        print('| Dictionary: {} types'.format(len(dictionary)), flush=True)
        return cls(args, dictionary)
Example #16
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_embed_dim = decoder_ffn_embed_dim
        self.adaptive_softmax_cutoff = adaptive_softmax_cutoff
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_fn = activation_fn
        self.fp16 = False
        self.adaptive_input = adaptive_input
        self.quant_noise_pq = quant_noise_pq

        assert self.decoder_embed_dim == self.decoder_output_dim

args = Args(0, True, 1024, 1024, 1024, False, False, False, True, 1, 16, 4096, None, 0.1, 0.1)
embedding = nn.Embedding(13672, 1024, padding_idx=1)

dictionary = Dictionary.load('resource/dict.txt')

def main():
    torch.set_grad_enabled(False)

    input = np.random.randint(3, 9000, seq_len * batch, dtype="int64")
    inputs = np.random.randint(3, 9000, 1 * batch, dtype="int64")

    input_full = torch.from_numpy(input).long().reshape(batch, seq_len).cuda()
    input_inc = torch.from_numpy(inputs).long().reshape(batch, 1).cuda()

    data_type = torch.float32
    if using_half:
        data_type = torch.float16
    torch_decoder = TransformerDecoder(args, dictionary, embedding, True)
    if using_half:
Example #17
    def build_vocab(self):
        if os.path.exists(self.vocab_path):
            self._vocab = Dictionary.load(self.vocab_path)
        else:
            self.rebuild_vocab()
Example #18
def to_pytorch(fsmt_checkpoint_path, save_path):
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(save_path, exist_ok=True)
    print(f"Writing results to {save_path}")
    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)
    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kw = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(fsmt_folder_path,
                                      checkpoint_file,
                                      data_name_or_path,
                                      archive_map=models,
                                      **kw)
    args = vars(chkpt["args"]["model"])
    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]
    data_root = dirname(save_path)
    model_dir = basename(save_path)
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")
    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    s_src_vocab = len(src_vocab)
    src_vocab_file = os.path.join(save_path, "vocab-src.json")
    print(
        f"Generating {src_vocab_file} of {s_src_vocab} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break
    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    s_tgt_vocab = len(tgt_vocab)
    tgt_vocab_file = os.path.join(save_path, "vocab-tgt.json")
    print(
        f"Generating {tgt_vocab_file} of {s_tgt_vocab} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))
    merges_file = os.path.join(save_path, VOCAB_FS["merges_file"])
    for fn in ["bpecodes",
               "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)
    fsmt_model_config_file = os.path.join(save_path, "config.json")
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support tokenizer={args['tokenizer']}"

    model_conf = {
        "archs": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "drop_act": args["drop_act"],
        "act_fun": "relu",
        "drop_attn": args["drop_attn"],
        "d_hidden": args["decoder_embed_dim"],
        "drop": args["drop"],
        "init_std": 0.02,
        "n_pos": args["max_source_positions"],
        "n_lays": args["n_enc_lays"],
        "s_src_vocab": s_src_vocab,
        "s_tgt_vocab": s_tgt_vocab,
        "langs": [src_lang, tgt_lang],
        "n_enc_heads": args["n_enc_heads"],
        "d_enc_ffn": args["encoder_ffn_embed_dim"],
        "drop_enc": args["drop_enc"],
        "n_enc_lays": args["n_enc_lays"],
        "n_dec_heads": args["n_dec_heads"],
        "d_dec_ffn": args["decoder_ffn_embed_dim"],
        "drop_dec": args["drop_dec"],
        "n_dec_lays": args["n_dec_lays"],
        "BOS": 0,
        "PAD": 1,
        "EOS": 2,
        "is_enc_dec": True,
        "scale": not args["no_scale_embedding"],
        "tie_word_embeds": args["share_all_embeddings"],
    }
    model_conf["n_beams"] = 5
    model_conf["early_stop"] = False
    if model_dir in best_score_hparams and "len_penalty" in best_score_hparams[model_dir]:
        model_conf["len_penalty"] = best_score_hparams[model_dir]["len_penalty"]
    else:
        model_conf["len_penalty"] = 1.0
    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
    fsmt_tokenizer_config_file = os.path.join(save_path, TOKENIZER_CONFIG_FILE)
    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }
    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(
            json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()
    model_state_dict = OrderedDict(
        ("model." + k, v) for k, v in model_state_dict.items())
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)
    config = PreTrained.from_pretrained(save_path)
    model_new = ForConditionalGen(config)
    model_new.load_state_dict(model_state_dict, strict=False)
    pytorch_weights_dump_path = os.path.join(save_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)
    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")