encoder = nemo_nlp.huggingface.BERT(
    pretrained_model_name=args.pretrained_model,
    local_rank=args.local_rank)

# Grow the pretrained word-embedding matrix with zero rows so the newly
# added tokens have embeddings to look up.
device = encoder.bert.embeddings.word_embeddings.weight.get_device()
zeros = torch.zeros((tokens_to_add, args.d_model)).to(device=device)
encoder.bert.embeddings.word_embeddings.weight.data = torch.cat(
    (encoder.bert.embeddings.word_embeddings.weight.data, zeros))

decoder = nemo_nlp.TransformerDecoderNM(
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    num_attn_heads=args.num_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    max_seq_length=args.max_seq_length,
    embedding_dropout=args.embedding_dropout,
    learn_positional_encodings=True,
    hidden_act="gelu",
    **dec_first_sublayer_params)
decoder.restore_from(args.restore_from, local_rank=args.local_rank)

# Projection head: decoder hidden states -> per-token log-probabilities.
t_log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                         num_classes=vocab_size,
                                         num_layers=1,
                                         log_softmax=True)

# (The remaining arguments of this call are cut off in the excerpt.)
beam_translator = nemo_nlp.BeamSearchTranslatorNM(
    decoder=decoder,
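# --- Sketch (plain PyTorch, not the NeMo API): what the zero-row embedding
# --- resize above amounts to. Sizes here are illustrative placeholders.
import torch

old_num_tokens, tokens_to_add, d_model = 30522, 4, 768
emb = torch.nn.Embedding(old_num_tokens, d_model)

new_rows = torch.zeros(tokens_to_add, d_model)            # zero-init, as above
emb.weight.data = torch.cat((emb.weight.data, new_rows))  # append new rows
# Note: emb.num_embeddings is not updated by this in-place trick; lookups for
# the new token ids still work because the forward pass reads the weight.
assert emb.weight.shape == (old_num_tokens + tokens_to_add, d_model)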
# The opening of this call is cut off in the excerpt; the leading arguments
# below are reconstructed by symmetry with the decoder call that follows,
# and the trailing share_encoder_layers flag identifies it as the encoder.
encoder = nemo_nlp.TransformerEncoderNM(
    factory=neural_factory,
    d_embedding=args.d_embedding,
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_sequence_length,
    share_all_layers=args.share_encoder_layers)

decoder = nemo_nlp.TransformerDecoderNM(
    factory=neural_factory,
    d_embedding=args.d_embedding,
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_sequence_length,
    share_all_layers=args.share_decoder_layers)

# d_embedding is passed separately from d_model, so the log-softmax head
# receives both sizes (consistent with a factorized embedding/projection).
log_softmax = nemo_nlp.TransformerLogSoftmaxNM(factory=neural_factory,
                                               vocab_size=vocab_size,
                                               d_model=args.d_model,
                                               d_embedding=args.d_embedding)

# (This call continues beyond the excerpt.)
beam_search = nemo_nlp.BeamSearchTranslatorNM(
    factory=neural_factory,
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=args.max_sequence_length,
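# --- Sketch (illustrative, not the NeMo implementation): what a flag like
# --- share_all_layers=True implies. A single layer's parameters are reused
# --- at every depth (ALBERT-style cross-layer sharing), cutting the stack's
# --- parameter count by roughly a factor of num_layers. Sizes are placeholders.
import torch

shared_layer = torch.nn.TransformerEncoderLayer(
    d_model=512, nhead=8, dim_feedforward=2048, batch_first=True)

def shared_stack(x, num_layers=6):
    # The same module instance (hence the same weights) runs at every layer.
    for _ in range(num_layers):
        x = shared_layer(x)
    return x

out = shared_stack(torch.randn(2, 16, 512))  # [batch, seq, d_model]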
# The opening of this call is cut off in the excerpt; the reconstructed
# leading arguments mirror the decoder call below. The encoder uses the
# source vocabulary, the decoder the target vocabulary.
encoder = nemo_nlp.TransformerEncoderNM(
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=src_vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_seq_length)

decoder = nemo_nlp.TransformerDecoderNM(
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=tgt_vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_seq_length)

log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                       num_classes=tgt_tokenizer.vocab_size,
                                       num_layers=1,
                                       log_softmax=True)

# (This call continues beyond the excerpt.)
beam_search = nemo_nlp.BeamSearchTranslatorNM(
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=args.max_seq_length,
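# --- Sketch (illustrative, not NeMo internals): the head constructed by
# --- TokenClassifier(d_model, num_classes=vocab, num_layers=1,
# --- log_softmax=True) amounts to one linear projection followed by a
# --- log-softmax over the vocabulary. Sizes are placeholders.
import torch
import torch.nn.functional as F

d_model, vocab = 512, 32000
proj = torch.nn.Linear(d_model, vocab)
hidden = torch.randn(2, 16, d_model)             # [batch, seq, d_model]
log_probs = F.log_softmax(proj(hidden), dim=-1)  # per-token log-probabilities
# Log-probabilities plug directly into NLL loss during training and into
# beam-search scoring at inference time.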
# This excerpt opens mid-call; by parallel with the first snippet it is the
# pretrained BERT encoder branch. The `elif` further down continues an
# `if args.encoder == ...` test whose opening is also cut off.
encoder = nemo_nlp.huggingface.BERT(
    pretrained_model_name=args.pretrained_model,
    local_rank=args.local_rank)

# Grow the pretrained word-embedding matrix with zero rows for the newly
# added tokens.
device = encoder.bert.embeddings.word_embeddings.weight.get_device()
zeros = torch.zeros((tokens_to_add, args.d_model)).to(device=device)
encoder.bert.embeddings.word_embeddings.weight.data = torch.cat(
    (encoder.bert.embeddings.word_embeddings.weight.data, zeros))

decoder = nemo_nlp.TransformerDecoderNM(
    factory=neural_factory,
    d_embedding=args.d_embedding,
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    num_attn_heads=args.num_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=max_sequence_length,
    embedding_dropout=args.embedding_dropout,
    share_all_layers=args.share_decoder_layers,
    learn_positional_encodings=True,
    hidden_act="gelu")
if args.restore_decoder:
    decoder.restore_from(args.decoder_restore_from,
                         local_rank=args.local_rank)
elif args.encoder == "nemo":
    encoder = nemo_nlp.TransformerEncoderNM(
        factory=neural_factory,
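# --- Sketch (plain PyTorch, not NeMo's restore_from): the conditional
# --- restore above boils down to loading a state dict when the flag is set.
# --- The function name, path, and flag here are placeholders.
import torch

def maybe_restore(module: torch.nn.Module, ckpt_path: str, enabled: bool):
    # Load saved weights onto CPU first, then into the module.
    if enabled:
        state = torch.load(ckpt_path, map_location="cpu")
        module.load_state_dict(state)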