    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=tgt_vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_seq_length)

log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                       num_classes=tgt_tokenizer.vocab_size,
                                       num_layers=1,
                                       log_softmax=True)

beam_search = nemo_nlp.BeamSearchTranslatorNM(
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=args.max_seq_length,
    beam_size=args.beam_size,
    bos_token=tgt_tokenizer.bos_id(),
    pad_token=tgt_tokenizer.pad_id(),
    eos_token=tgt_tokenizer.eos_id())

loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    pad_id=tgt_tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)

# share one embedding matrix between the encoder/decoder embeddings
# and the output projection
if tie_weight:
    log_softmax.mlp.last_linear_layer.weight = \
        encoder.embedding_layer.token_embedding.weight
    decoder.embedding_layer.token_embedding.weight = \
        encoder.embedding_layer.token_embedding.weight


def create_pipeline(dataset_src,
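# --- Illustration: weight tying (not part of the original script) ---
# The tie_weight branch above makes the output projection, the decoder
# embedding, and the encoder embedding share a single parameter tensor, so
# the matrix is stored and updated once. A minimal standalone PyTorch
# sketch; the sizes below are made-up assumptions.
import torch
import torch.nn as nn

d_model, vocab = 512, 32000
embedding = nn.Embedding(vocab, d_model)
projection = nn.Linear(d_model, vocab, bias=False)
projection.weight = embedding.weight  # same nn.Parameter object

tokens = torch.randint(0, vocab, (2, 7))
logits = projection(embedding(tokens))  # shape (2, 7, vocab)
assert projection.weight.data_ptr() == embedding.weight.data_ptr()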
    learn_positional_encodings=True,
    hidden_act="gelu",
    **dec_first_sublayer_params)
decoder.restore_from(args.restore_from, local_rank=args.local_rank)

t_log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                         num_classes=vocab_size,
                                         num_layers=1,
                                         log_softmax=True)

beam_translator = nemo_nlp.BeamSearchTranslatorNM(
    decoder=decoder,
    log_softmax=t_log_softmax,
    max_seq_length=args.max_seq_length,
    beam_size=args.beam_size,
    length_penalty=args.len_pen,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())

# smoothed loss for training, unsmoothed loss for evaluation
loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=0, smoothing=0.1)
loss_eval = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=0, smoothing=0.0)

# tie all embedding weights: the output projection, the decoder token
# embedding, and the decoder position embedding reuse the BERT encoder's tables
t_log_softmax.mlp.last_linear_layer.weight = \
    encoder.bert.embeddings.word_embeddings.weight
decoder.embedding_layer.token_embedding.weight = \
    encoder.bert.embeddings.word_embeddings.weight
decoder.embedding_layer.position_embedding.weight = \
    encoder.bert.embeddings.position_embeddings.weight
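# --- Illustration: padded smoothed cross entropy (not part of the original
# script) --- A plain-PyTorch sketch of what a loss like
# PaddedSmoothedCrossEntropyLossNM computes, under the usual definition of
# label smoothing: the target distribution puts (1 - smoothing) on the gold
# token and spreads the remaining mass uniformly over the vocabulary, and
# pad positions are masked out. This is an assumption about the internals,
# not NeMo's actual implementation.
import torch


def padded_smoothed_xent(log_probs, targets, pad_id=0, smoothing=0.1):
    """log_probs: (batch, time, vocab) log-softmax outputs; targets: (batch, time)."""
    nll = -log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    uniform = -log_probs.mean(dim=-1)  # cross entropy vs. the uniform distribution
    loss = (1.0 - smoothing) * nll + smoothing * uniform
    mask = targets.ne(pad_id).float()  # zero out padded positions
    return (loss * mask).sum() / mask.sum()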
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=max_sequence_length,
    embedding_dropout=args.embedding_dropout,
    share_all_layers=args.share_encoder_layers,
    hidden_act="gelu")

log_softmax = nemo_nlp.TransformerLogSoftmaxNM(factory=neural_factory,
                                               vocab_size=vocab_size,
                                               d_model=args.d_model,
                                               d_embedding=args.d_embedding)

beam_translator = nemo_nlp.BeamSearchTranslatorNM(
    factory=neural_factory,
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=max_sequence_length,
    beam_size=args.beam_size,
    length_penalty=args.len_pen,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())

loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(factory=neural_factory,
                                                 pad_id=tokenizer.pad_id(),
                                                 smoothing=0.1)
loss_eval = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    factory=neural_factory,
    pad_id=tokenizer.pad_id(),
    smoothing=0.0)

# tie the weights of the embedding and log_softmax layers
if args.tie_enc_dec:
    decoder.embedding_layer.token_embedding.weight = \
        encoder.embedding_layer.token_embedding.weight
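# --- Illustration: beam search length penalty (not part of the original
# script) --- The length_penalty / len_pen argument above typically selects
# GNMT-style length normalization: hypotheses are ranked by
# log P(Y|X) / lp(Y) with lp(Y) = ((5 + |Y|) / 6) ** alpha, so longer
# translations are not unfairly penalized by summed negative log
# probabilities. Whether NeMo uses this exact formula is an assumption; the
# sketch shows the common convention.
def length_normalized_score(log_prob: float, length: int, alpha: float) -> float:
    lp = ((5.0 + length) / 6.0) ** alpha
    return log_prob / lp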