import datetime
import json
import os

import fasttext
import fasttext.util
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

# AutoNMT imports (module paths follow the AutoNMT examples; adjust to your version)
from autonmt.bundle import utils
from autonmt.bundle.report import generate_report
from autonmt.modules.models import Transformer
from autonmt.preprocessing import DatasetBuilder
from autonmt.toolkits import AutonmtTranslator
from autonmt.vocabularies import Vocabulary


def load_model(ds, run_prefix):
    max_length = 100
    src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds, lang=ds.src_lang)
    trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds, lang=ds.trg_lang)
    model = Transformer(src_vocab_size=len(src_vocab), trg_vocab_size=len(trg_vocab),
                        padding_idx=src_vocab.pad_id,
                        encoder_embed_dim=256, decoder_embed_dim=256)

    # Load checkpoint
    if run_prefix:
        checkpoint_path = ds.get_model_checkpoints_path(toolkit="autonmt",
                                                        run_name=ds.get_run_name(run_prefix),
                                                        fname="checkpoint_best.pt")
        model_state_dict = torch.load(checkpoint_path)['state_dict']
        model.load_state_dict(model_state_dict)

    # Change vocabularies: strip the subword marker and use '⁇' as the unknown token
    src_tokens = [tok.replace('▁', '') for tok in src_vocab.get_tokens()]
    trg_tokens = [tok.replace('▁', '') for tok in trg_vocab.get_tokens()]
    src_tokens[0] = '⁇'
    trg_tokens[0] = '⁇'
    src_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok, 0) for tok in src_tokens])
    trg_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok, 0) for tok in trg_tokens])
    return model, src_vocab, trg_vocab
def get_ref_vocabs2(base_path, limit, max_length=100):
    src_tokens = utils.load_json(os.path.join(base_path, "src.json"))
    trg_tokens = utils.load_json(os.path.join(base_path, "trg.json"))
    src_vocab = Vocabulary(max_tokens=max_length).build_from_tokens([(tok, 0) for tok in src_tokens])
    trg_vocab = Vocabulary(max_tokens=max_length).build_from_tokens([(tok, 0) for tok in trg_tokens])

    # Create new vocabs from tokens
    src_tokens = [tok.replace('▁', '') for tok in src_vocab.get_tokens()[:limit]]
    trg_tokens = [tok.replace('▁', '') for tok in trg_vocab.get_tokens()[:limit]]
    src_tokens[0] = '⁇'
    trg_tokens[0] = '⁇'
    src_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok, 0) for tok in src_tokens])
    trg_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok, 0) for tok in trg_tokens])
    return src_vocab, trg_vocab
def get_ref_vocabs(ds_ref, limit, max_length=100):
    src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds_ref, lang=ds_ref.src_lang)
    trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds_ref, lang=ds_ref.trg_lang)

    # Create new vocabs from tokens
    src_tokens = [tok.replace('▁', '') for tok in src_vocab.get_tokens()[:limit]]
    trg_tokens = [tok.replace('▁', '') for tok in trg_vocab.get_tokens()[:limit]]
    src_tokens[0] = '⁇'
    trg_tokens[0] = '⁇'
    src_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok, 0) for tok in src_tokens])
    trg_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok, 0) for tok in trg_tokens])
    return src_vocab, trg_vocab
def load_model(ds, run_prefix):
    max_length = 100
    src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds, lang=ds.src_lang)
    trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds, lang=ds.trg_lang)
    model = Transformer(src_vocab_size=len(src_vocab), trg_vocab_size=len(trg_vocab),
                        padding_idx=src_vocab.pad_id)

    # Load checkpoint
    checkpoint_path = ds.get_model_checkpoints_path(toolkit="autonmt",
                                                    run_name=ds.get_run_name(run_prefix),
                                                    fname="checkpoint_best.pt")
    model_state_dict = torch.load(checkpoint_path)['state_dict']
    model.load_state_dict(model_state_dict)
    return model, src_vocab, trg_vocab
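# Hypothetical usage sketch for the loader above: restore a trained checkpoint for the
# first training dataset. The run prefix matches the one used in main() below, and
# `tr_datasets` would come from a DatasetBuilder, as in main().
def _demo_load_model(tr_datasets):
    model, src_vocab, trg_vocab = load_model(tr_datasets[0], run_prefix="model_mt8kemb")
    model.eval()  # inference mode: disables dropout, etc.
    return model, src_vocab, trg_vocab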
def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("original", None)]},
        ],
        subword_models=["word"],
        vocab_sizes=[8000],
        merge_vocabs=False,
        force_overwrite=True,
        use_cmd=True,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & score a model for each dataset
    scores = []
    errors = []
    run_prefix = "model_mt8kemb"
    for ds in tr_datasets:
        # try:
        # Instantiate vocabs and model
        src_vocab = Vocabulary(max_tokens=120).build_from_ds(ds=ds, lang=ds.src_lang)
        trg_vocab = Vocabulary(max_tokens=120).build_from_ds(ds=ds, lang=ds.trg_lang)
        model = Transformer(src_vocab_size=len(src_vocab), trg_vocab_size=len(trg_vocab),
                            padding_idx=src_vocab.pad_id,
                            encoder_embed_dim=256, decoder_embed_dim=256)

        # Train model
        wandb_params = dict(project="autonmt-tests", entity="salvacarrion")
        model = AutonmtTranslator(model=model, src_vocab=src_vocab, trg_vocab=trg_vocab,
                                  model_ds=ds, wandb_params=wandb_params,
                                  force_overwrite=True, run_prefix=run_prefix)
        model.fit(max_epochs=100, batch_size=128, seed=1234, num_workers=16, patience=10)
        m_scores = model.predict(ts_datasets, metrics={"bleu"}, beams=[1],
                                 max_gen_length=120, load_best_checkpoint=True)
        scores.append(m_scores)
        # except Exception as e:
        #     print(str(e))
        #     errors += [str(e)]

    # Make report and print it
    output_path = f".outputs/autonmt/{str(datetime.datetime.now())}/{run_prefix}"
    df_report, df_summary = generate_report(scores=scores, output_path=output_path,
                                            plot_metric="beam1__sacrebleu_bleu_score")
    print("Summary:")
    print(df_summary.to_string(index=False))
    print(f"Errors: {len(errors)}")
    print(errors)
def expand_model(ds_small, ds_big, comp, run_prefix, src_emb, trg_emb):
    # Load the small model and its vocabularies
    max_length = 100
    small_src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.src_lang)
    small_trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.trg_lang)
    model = Transformer(src_vocab_size=len(small_src_vocab), trg_vocab_size=len(small_trg_vocab),
                        padding_idx=small_src_vocab.pad_id)
    checkpoint_path = ds_small.get_model_checkpoints_path(toolkit="autonmt",
                                                          run_name=ds_small.get_run_name(run_prefix),
                                                          fname="checkpoint_best.pt")
    model_state_dict = torch.load(checkpoint_path)['state_dict']
    model.load_state_dict(model_state_dict)

    # Build the big vocabularies
    big_src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.src_lang)
    big_trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.trg_lang)

    # Get old embedding matrices (small)
    device = model.device
    dtype = model.src_embeddings.weight.dtype
    small_src_emb = model.src_embeddings.weight.detach()
    small_trg_emb = model.trg_embeddings.weight.detach()

    # Compute mean and std (kept for the commented-out rescaling below)
    src_small_scaler = StandardScaler().fit(small_src_emb.numpy())
    trg_small_scaler = StandardScaler().fit(small_trg_emb.numpy())

    # Get sizes
    src_big_voc_size, src_small_voc_size, src_voc_dim = len(big_src_vocab), small_src_emb.shape[0], small_src_emb.shape[1]
    trg_big_voc_size, trg_small_voc_size, trg_voc_dim = len(big_trg_vocab), small_trg_emb.shape[0], small_trg_emb.shape[1]

    # Match vocabularies: shared tokens keep their small-vocab indices; missing tokens are appended
    words_small_src_vocab = small_src_vocab.get_tokens()
    words_small_trg_vocab = small_trg_vocab.get_tokens()
    words_big_src_vocab = big_src_vocab.get_tokens()
    words_big_trg_vocab = big_trg_vocab.get_tokens()
    words_missing_src_vocab = list(set(words_big_src_vocab).difference(set(words_small_src_vocab)))
    words_missing_trg_vocab = list(set(words_big_trg_vocab).difference(set(words_small_trg_vocab)))
    final_big_src_vocab = words_small_src_vocab + words_missing_src_vocab
    final_big_trg_vocab = words_small_trg_vocab + words_missing_trg_vocab
    src_big_sorted_missing_idxs = [big_src_vocab.voc2idx[tok] for tok in words_missing_src_vocab]
    trg_big_sorted_missing_idxs = [big_trg_vocab.voc2idx[tok] for tok in words_missing_trg_vocab]

    # Reserve space for the new embeddings
    new_src_emb = torch.zeros((src_big_voc_size, src_voc_dim), device=device, dtype=dtype)
    new_trg_emb = torch.zeros((trg_big_voc_size, trg_voc_dim), device=device, dtype=dtype)

    # Copy the old embeddings into the first rows
    new_src_emb[:src_small_voc_size, :] = small_src_emb
    new_trg_emb[:trg_small_voc_size, :] = small_trg_emb

    # Look up the pretrained vectors for the missing tokens
    src_missing_emb = src_emb[torch.tensor(src_big_sorted_missing_idxs).long()]
    trg_missing_emb = trg_emb[torch.tensor(trg_big_sorted_missing_idxs).long()]
    src_big_tmp = torch.as_tensor(src_missing_emb, device=device, dtype=dtype)
    trg_big_tmp = torch.as_tensor(trg_missing_emb, device=device, dtype=dtype)

    # Re-scale the new vectors (if needed)
    if comp in {"random"}:  # Do not scale random vectors
        src_big_rescaled, trg_big_rescaled = src_big_tmp, trg_big_tmp
    else:
        # Standardize the new vectors (they are already standardized, although since only
        # some rows were selected, their statistics are shifted)
        # src_big_tmp = StandardScaler().fit_transform(src_big_tmp.numpy())
        # trg_big_tmp = StandardScaler().fit_transform(trg_big_tmp.numpy())
        src_big_rescaled, trg_big_rescaled = src_big_tmp, trg_big_tmp

        # Rescale the new vectors: inverse transform, but with the previous model's stats
        # src_big_rescaled = src_small_scaler.inverse_transform(src_big_tmp)
        # trg_big_rescaled = trg_small_scaler.inverse_transform(trg_big_tmp)

    # Insert the (rescaled) vectors for the missing tokens
    new_src_emb[src_small_voc_size:, :] = torch.as_tensor(src_big_rescaled, device=device, dtype=dtype)
    new_trg_emb[trg_small_voc_size:, :] = torch.as_tensor(trg_big_rescaled, device=device, dtype=dtype)

    # Convert embeddings to parameters
    model.src_embeddings.weight = torch.nn.parameter.Parameter(new_src_emb)
    model.trg_embeddings.weight = torch.nn.parameter.Parameter(new_trg_emb)

    # Modify the output layer (it projects to the target vocabulary, hence trg sizes)
    new_output = torch.nn.Linear(model.output_layer.in_features, trg_big_voc_size, device=device, dtype=dtype)
    new_output_weights = new_output.weight.detach()
    new_output_bias = new_output.bias.detach()
    new_output_weights[:trg_small_voc_size, :] = model.output_layer.weight.detach()
    new_output_bias[:trg_small_voc_size] = model.output_layer.bias.detach()
    new_output.weight = torch.nn.parameter.Parameter(new_output_weights)
    new_output.bias = torch.nn.parameter.Parameter(new_output_bias)
    model.output_layer = new_output

    # if comp != "random":
    #     print("******************* Freezing embedding layers *********************")
    #     # if comp != "glove":
    #     for p in model.src_embeddings.parameters():
    #         p.requires_grad = False
    #     for p in model.trg_embeddings.parameters():
    #         p.requires_grad = False

    # Create new vocabs from tokens
    final_big_src_vocab[0] = '⁇'
    final_big_trg_vocab[0] = '⁇'
    big_src_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_src_vocab])
    big_trg_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_trg_vocab])

    # Reset model
    # model.apply(weight_reset)
    return model, big_src_vocab, big_trg_vocab
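# Hypothetical wiring of expand_model with the fastText matrices exported by the script
# at the bottom of this file. The paths, the `comp` value ("fasttext") and the run
# prefix are illustrative assumptions, not fixed by the code above.
def _demo_expand_model(ds_small, ds_big, base_path="."):
    src_emb = torch.from_numpy(np.load(f"{base_path}/src.npy"))
    trg_emb = torch.from_numpy(np.load(f"{base_path}/trg.npy"))
    return expand_model(ds_small, ds_big, comp="fasttext", run_prefix="model_mt8kemb",
                        src_emb=src_emb, trg_emb=trg_emb)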
def create_big_model(ds_small, ds_big):
    max_length = 100
    src_vocab_small = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.src_lang)
    trg_vocab_small = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_small, lang=ds_small.trg_lang)
    big_src_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.src_lang)
    big_trg_vocab = Vocabulary(max_tokens=max_length).build_from_ds(ds=ds_big, lang=ds_big.trg_lang)

    # Match vocabularies
    words_small_src_vocab = src_vocab_small.get_tokens()
    words_small_trg_vocab = trg_vocab_small.get_tokens()
    words_big_src_vocab = big_src_vocab.get_tokens()
    words_big_trg_vocab = big_trg_vocab.get_tokens()
    words_missing_src_vocab = list(set(words_big_src_vocab).difference(set(words_small_src_vocab)))
    words_missing_trg_vocab = list(set(words_big_trg_vocab).difference(set(words_small_trg_vocab)))
    final_big_src_vocab = words_small_src_vocab + words_missing_src_vocab
    final_big_trg_vocab = words_small_trg_vocab + words_missing_trg_vocab

    # Create new vocabs from tokens
    final_big_src_vocab[0] = '⁇'
    final_big_trg_vocab[0] = '⁇'
    big_src_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_src_vocab])
    big_trg_vocab = Vocabulary(max_tokens=max_length, unk_piece='⁇').build_from_tokens([(tok.replace('▁', ''), 0) for tok in final_big_trg_vocab])

    model = Transformer(src_vocab_size=len(big_src_vocab), trg_vocab_size=len(big_trg_vocab),
                        padding_idx=big_src_vocab.pad_id)
    return model, big_src_vocab, big_trg_vocab
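# Tiny worked example (hypothetical tokens) of the vocabulary-matching scheme used in
# create_big_model and expand_model: tokens shared with the small vocab keep their
# positions, and missing tokens are appended, so embedding rows can be copied block-wise.
def _demo_vocab_matching():
    small = ['⁇', 'the', 'cat']
    big = ['⁇', 'the', 'cat', 'dog', 'sat']
    missing = list(set(big).difference(set(small)))
    final_big = small + missing
    assert final_big[:len(small)] == small  # old indices preserved
    assert set(final_big) == set(big)       # nothing lost; order of the tail is arbitrary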
builder_big = DatasetBuilder(
    # ...
    vocab_sizes=[16000],
    merge_vocabs=False,
    force_overwrite=False,
    use_cmd=False,
    eval_mode="same",
    letter_case="lower",
).build(make_plots=False, safe=True)
big_datasets = builder_big.get_ds()
ds_ref = big_datasets[0]
base_path = "."

# Load vocabs
for lang, lang_id in [(ds_ref.src_lang, "src")]:  # extend with e.g. (ds_ref.trg_lang, "trg") for the target side
    # Load vocab
    vocab = Vocabulary().build_from_ds(ds_ref, lang=lang)
    tokens = [tok.replace('▁', '') for tok in vocab.get_tokens()]

    # Save tokens
    with open(f"{base_path}/{lang_id}.json", 'w') as f:
        json.dump(tokens, f)

    # Load the fastText model, reduce its dimension and get the embeddings
    ft = fasttext.load_model(f"/home/scarrion/Downloads/cc.{lang}.300.bin")
    fasttext.util.reduce_model(ft, 256)
    arr = [ft.get_word_vector(tok) for tok in tokens]
    arr = np.stack(arr, axis=0)

    # Save tensor
    np.save(f"{base_path}/{lang_id}.npy", arr)
    print(f"Saved {lang_id}!")
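# Hypothetical round-trip check for the artifacts written above: the token list and the
# embedding matrix should line up row for row, with one reduced 256-d vector per token.
def _check_export(base_path=".", lang_id="src"):
    with open(f"{base_path}/{lang_id}.json") as f:
        tokens = json.load(f)
    arr = np.load(f"{base_path}/{lang_id}.npy")
    assert arr.shape == (len(tokens), 256)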