Example #1
def create_big_model(ds_small, ds_big):
    max_length = 100
    src_vocab_small = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.src_lang)
    trg_vocab_small = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.trg_lang)

    big_src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.src_lang)
    big_trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.trg_lang)

    # Match vocabularies
    words_small_src_vocab = src_vocab_small.get_tokens()
    words_small_trg_vocab = trg_vocab_small.get_tokens()
    words_big_src_vocab = big_src_vocab.get_tokens()
    words_big_trg_vocab = big_trg_vocab.get_tokens()
    words_missing_src_vocab = list(set(words_big_src_vocab).difference(set(words_small_src_vocab)))
    words_missing_trg_vocab = list(set(words_big_trg_vocab).difference(set(words_small_trg_vocab)))
    final_big_src_vocab = words_small_src_vocab + words_missing_src_vocab
    final_big_trg_vocab = words_small_trg_vocab + words_missing_trg_vocab

    # Create new vocabs from tokens
    final_big_src_vocab[0] = '⁇'
    final_big_trg_vocab[0] = '⁇'
    big_src_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_src_vocab])
    big_trg_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_trg_vocab])

    model = Transformer(src_vocab_size=len(big_src_vocab), trg_vocab_size=len(big_trg_vocab), padding_idx=big_src_vocab.pad_id)
    return model, big_src_vocab, big_trg_vocab
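
The vocabulary-matching step above is a set difference that preserves the small vocabulary's index order and appends the new tokens at the end. A toy, self-contained sketch of the same pattern (hypothetical tokens, not part of the original code):

small_tokens = ['<unk>', '▁the', '▁cat']
big_tokens = ['<unk>', '▁the', '▁dog', '▁cat', '▁bird']
missing_tokens = list(set(big_tokens).difference(set(small_tokens)))  # tokens only in the big vocab
merged_tokens = small_tokens + missing_tokens  # small-vocab indices preserved; new tokens appended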
Example #2
def load_model(ds, run_prefix):
    max_length = 100
    src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(
        ds=ds, lang=ds.src_lang)
    trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(
        ds=ds, lang=ds.trg_lang)
    model = Transformer(src_vocab_size=len(src_vocab),
                        trg_vocab_size=len(trg_vocab),
                        padding_idx=src_vocab.pad_id,
                        encoder_embed_dim=256,
                        decoder_embed_dim=256)

    # Load checkpoint
    if run_prefix:
        checkpoint_path = ds.get_model_checkpoints_path(
            toolkit="autonmt",
            run_name=ds.get_run_name(run_prefix),
            fname="checkpoint_best.pt")
        model_state_dict = torch.load(checkpoint_path)['state_dict']
        model.load_state_dict(model_state_dict)

        # Change vocabularies
        src_tokens = [tok.replace('▁', '') for tok in src_vocab.get_tokens()]
        trg_tokens = [tok.replace('▁', '') for tok in trg_vocab.get_tokens()]
        src_tokens[0] = '⁇'
        trg_tokens[0] = '⁇'
        src_vocab = Vocabulary(max_tokens=max_length,
                               unk_piece='⁇').build_from_tokens([
                                   (tok, 0) for tok in src_tokens
                               ])
        trg_vocab = Vocabulary(max_tokens=max_length,
                               unk_piece='⁇').build_from_tokens([
                                   (tok, 0) for tok in trg_tokens
                               ])
    return model, src_vocab, trg_vocab
Example #3
def get_ref_vocabs2(base_path, limit, max_length=100):
    src_tokens = utils.load_json(os.path.join(base_path, "src.json"))
    trg_tokens = utils.load_json(os.path.join(base_path, "trg.json"))
    src_vocab = Vocabulary(max_tokens=max_length).build_from_tokens([
        (tok, 0) for tok in src_tokens
    ])
    trg_vocab = Vocabulary(max_tokens=max_length).build_from_tokens([
        (tok, 0) for tok in trg_tokens
    ])

    # Create new vocabs from tokens
    src_tokens = [
        tok.replace('▁', '') for tok in src_vocab.get_tokens()[:limit]
    ]
    trg_tokens = [
        tok.replace('▁', '') for tok in trg_vocab.get_tokens()[:limit]
    ]
    src_tokens[0] = '⁇'
    trg_tokens[0] = '⁇'
    src_vocab = Vocabulary(max_tokens=max_length,
                           unk_piece='⁇').build_from_tokens([
                               (tok, 0) for tok in src_tokens
                           ])
    trg_vocab = Vocabulary(max_tokens=max_length,
                           unk_piece='⁇').build_from_tokens([
                               (tok, 0) for tok in trg_tokens
                           ])

    return src_vocab, trg_vocab
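
For reference, src.json and trg.json are assumed here to be plain JSON lists of tokens, matching what the export script in Example #6 writes with json.dump. A minimal sketch of producing a compatible file (hypothetical tokens):

import json

tokens = ['<unk>', 'the', 'cat']  # hypothetical tokens; index 0 is replaced with '⁇' above
with open('src.json', 'w') as f:
    json.dump(tokens, f)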
Example #4
def get_ref_vocabs(ds_ref, limit, max_length=100):
    src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(
        ds_ref, lang=ds_ref.src_lang)
    trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(
        ds_ref, lang=ds_ref.trg_lang)

    # Create new vocabs from tokens
    src_tokens = [
        tok.replace('▁', '') for tok in src_vocab.get_tokens()[:limit]
    ]
    trg_tokens = [
        tok.replace('▁', '') for tok in trg_vocab.get_tokens()[:limit]
    ]
    src_tokens[0] = '⁇'
    trg_tokens[0] = '⁇'
    src_vocab = Vocabulary(max_tokens=max_length,
                           unk_piece='⁇').build_from_tokens([
                               (tok, 0) for tok in src_tokens
                           ])
    trg_vocab = Vocabulary(max_tokens=max_length,
                           unk_piece='⁇').build_from_tokens([
                               (tok, 0) for tok in trg_tokens
                           ])

    return src_vocab, trg_vocab
Example #5
def expand_model(ds_small, ds_big, comp, run_prefix, src_emb, trg_emb):
    max_length = 100
    small_src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.src_lang)
    small_trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.trg_lang)
    model = Transformer(src_vocab_size=len(small_src_vocab), trg_vocab_size=len(small_trg_vocab), padding_idx=small_src_vocab.pad_id)
    checkpoint_path = ds_small.get_model_checkpoints_path(toolkit="autonmt", run_name=ds_small.get_run_name(run_prefix),
                                                    fname="checkpoint_best.pt")
    model_state_dict = torch.load(checkpoint_path)['state_dict']
    model.load_state_dict(model_state_dict)

    big_src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.src_lang)
    big_trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.trg_lang)

    # Get old embedding matrix (small)
    device = model.device
    dtype = model.src_embeddings.weight.dtype
    small_src_emb = model.src_embeddings.weight.detach()
    small_trg_emb = model.trg_embeddings.weight.detach()

    # Compute mean and std (kept for the optional rescaling of the new rows below, currently disabled)
    src_small_scaler = StandardScaler().fit(small_src_emb.numpy())
    trg_small_scaler = StandardScaler().fit(small_trg_emb.numpy())

    # Get sizes
    src_big_voc_size, src_small_voc_size, src_voc_dim = len(big_src_vocab), small_src_emb.shape[0], small_src_emb.shape[1]
    trg_big_voc_size, trg_small_voc_size, trg_voc_dim = len(big_trg_vocab), small_trg_emb.shape[0], small_trg_emb.shape[1]

    # Match vocabularies
    words_small_src_vocab = small_src_vocab.get_tokens()
    words_small_trg_vocab = small_trg_vocab.get_tokens()
    words_big_src_vocab = big_src_vocab.get_tokens()
    words_big_trg_vocab = big_trg_vocab.get_tokens()
    words_missing_src_vocab = list(set(words_big_src_vocab).difference(set(words_small_src_vocab)))
    words_missing_trg_vocab = list(set(words_big_trg_vocab).difference(set(words_small_trg_vocab)))
    final_big_src_vocab = words_small_src_vocab + words_missing_src_vocab
    final_big_trg_vocab = words_small_trg_vocab + words_missing_trg_vocab
    src_big_sorted_missing_idxs = [big_src_vocab.voc2idx[tok] for tok in words_missing_src_vocab]
    trg_big_sorted_missing_idxs = [big_trg_vocab.voc2idx[tok] for tok in words_missing_trg_vocab]

    # Reserve space for new embeddings
    new_src_emb = torch.zeros((src_big_voc_size, src_voc_dim), device=device, dtype=dtype)
    new_trg_emb = torch.zeros((trg_big_voc_size, trg_voc_dim), device=device, dtype=dtype)

    # Copy old embeddings
    new_src_emb[:src_small_voc_size, :] = small_src_emb
    new_trg_emb[:trg_small_voc_size, :] = small_trg_emb

    # Gather the pretrained rows for the tokens that are new to the big vocabulary
    src_new_rows = src_emb[torch.tensor(src_big_sorted_missing_idxs).long()]
    trg_new_rows = trg_emb[torch.tensor(trg_big_sorted_missing_idxs).long()]
    src_big_tmp = torch.as_tensor(src_new_rows, device=device, dtype=dtype)
    trg_big_tmp = torch.as_tensor(trg_new_rows, device=device, dtype=dtype)

    # Re-scale new tensors (if needed)
    if comp in {"random"}:  # Do not scale for random
        src_big_rescaled, trg_big_rescaled = src_big_tmp, trg_big_tmp
    else:
        # Standardize the new rows (they are already standardized, but since only a subset of rows
        # was selected, their statistics are shifted)
        # src_big_tmp = StandardScaler().fit_transform(src_big_tmp.numpy())
        # trg_big_tmp = StandardScaler().fit_transform(trg_big_tmp.numpy())

        src_big_rescaled, trg_big_rescaled = src_big_tmp, trg_big_tmp
        # Rescale new tensors
        # src_big_rescaled = src_small_scaler.inverse_transform(src_big_tmp)
        # trg_big_rescaled = trg_small_scaler.inverse_transform(trg_big_tmp)

    # Place the (optionally rescaled) new rows after the copied small-vocab rows
    new_src_emb[src_small_voc_size:, :] = torch.as_tensor(src_big_rescaled, device=device, dtype=dtype)
    new_trg_emb[trg_small_voc_size:, :] = torch.as_tensor(trg_big_rescaled, device=device, dtype=dtype)

    # Convert embedding to parameter
    model.src_embeddings.weight = torch.nn.parameter.Parameter(new_src_emb)
    model.trg_embeddings.weight = torch.nn.parameter.Parameter(new_trg_emb)

    # Modify output layer (projects decoder states onto the target vocabulary)
    new_output = torch.nn.Linear(model.output_layer.in_features, trg_big_voc_size, device=device, dtype=dtype)
    new_output_weights = new_output.weight.detach()
    new_output_bias = new_output.bias.detach()
    new_output_weights[:trg_small_voc_size, :] = model.output_layer.weight.detach()
    new_output_bias[:trg_small_voc_size] = model.output_layer.bias.detach()
    new_output.weight = torch.nn.parameter.Parameter(new_output_weights)
    new_output.bias = torch.nn.parameter.Parameter(new_output_bias)
    model.output_layer = new_output

    # if comp != "random":
    #     print("******************* Freezing embedding layers *********************")
    #     if comp != "glove":
    #         for p in model.src_embeddings.parameters():
    #             p.requires_grad = False
    #     for p in model.trg_embeddings.parameters():
    #         p.requires_grad = False

    # Create new vocabs from tokens
    final_big_src_vocab[0] = '⁇'
    final_big_trg_vocab[0] = '⁇'
    big_src_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_src_vocab])
    big_trg_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_trg_vocab])

    # Reset model
    # model.apply(weight_reset)

    return model, big_src_vocab, big_trg_vocab
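
The core of expand_model is growing the embedding matrices while keeping the already-trained rows in place. A minimal, self-contained sketch of that pattern in plain PyTorch (hypothetical sizes and random data, not the original model or vocabularies):

import torch

small_vocab_size, big_vocab_size, emb_dim = 1000, 1500, 256
small_emb = torch.randn(small_vocab_size, emb_dim)     # embeddings trained on the small vocabulary
pretrained_emb = torch.randn(big_vocab_size, emb_dim)  # stand-in for pretrained (e.g. fastText) vectors

new_emb = torch.zeros(big_vocab_size, emb_dim)
new_emb[:small_vocab_size] = small_emb                          # keep the trained rows
new_emb[small_vocab_size:] = pretrained_emb[small_vocab_size:]  # initialize the new rows from the pretrained vectors
embedding_layer = torch.nn.Embedding(big_vocab_size, emb_dim, _weight=new_emb)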
Example #6
    merge_vocabs=False,
    force_overwrite=False,
    use_cmd=False,
    eval_mode="same",
    letter_case="lower",
).build(make_plots=False, safe=True)
big_datasets = builder_big.get_ds()
ds_ref = big_datasets[0]

base_path = "."

# Load vocabs
for lang, lang_id in [(ds_ref.src_lang, "src")]:  # add (ds_ref.trg_lang, "trg") to also export the target side
    # Load vocab
    vocab = Vocabulary().build_from_ds(ds_ref, lang=lang)
    tokens = [tok.replace('▁', '') for tok in vocab.get_tokens()]

    # Save tokens
    with open(f"{base_path}/{lang_id}.json", 'w') as f:
        json.dump(tokens, f)

    # Load model, reduce it and get embeddings
    ft = fasttext.load_model(f"/home/scarrion/Downloads/cc.{lang}.300.bin")
    fasttext.util.reduce_model(ft, 256)
    arr = [ft.get_word_vector(tok) for tok in tokens]
    arr = np.stack(arr, axis=0)

    # Save tensor
    np.save(f"{base_path}/{lang_id}.npy", arr)
    print(f"Saved {lang_id}!")
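
The artifacts saved above can be read back later, e.g. by get_ref_vocabs2 (Example #3) or as the src_emb/trg_emb arguments of expand_model (Example #5). A hedged sketch of loading them (file names taken from this script; the torch conversion is an assumption):

import json
import numpy as np
import torch

with open(f"{base_path}/src.json") as f:
    src_tokens = json.load(f)                                 # token list written above
src_emb = torch.from_numpy(np.load(f"{base_path}/src.npy"))   # (num_tokens, 256) reduced fastText vectors
assert src_emb.shape[0] == len(src_tokens)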