try:
    from apex.transformer.enums import ModelType
    from apex.transformer import parallel_state, tensor_parallel
    from apex.transformer.pipeline_parallel.schedules.common import build_model
    from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import forward_backward_no_pipelining
    from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import (
        forward_backward_pipelining_without_interleaving,
    )
    from apex.transformer.pipeline_parallel.utils import (
        get_num_microbatches,
        _reconfigure_microbatch_calculator,
        get_micro_batch_size,
    )

    HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
    # fake missing classes with None attributes
    ModelType = ApexGuardDefaults()
    HAVE_APEX = False

__all__ = ["MegatronLMEncoderDecoderModel"]


class MegatronLMEncoderDecoderModel(MegatronBaseModel):
    """
    Megatron encoder-decoder base class
    """

    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)

        # Make sure trainer.accumulate_grad_batches is 1.
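# The guard above relies on ``ApexGuardDefaults`` resolving every attribute
# lookup to ``None``, so module-level references such as
# ``ModelType.encoder_and_decoder`` do not raise at import time when apex is
# absent. A minimal self-contained sketch of that pattern (the class body
# mirrors ``nemo.collections.nlp.modules.common.megatron.utils``; treat the
# details as illustrative, not the canonical implementation):
#
#     class ApexGuardDefaults(object):
#         """Stand-in for classes that are missing when apex is not installed."""
#
#         def __getattr__(self, item):
#             # Every attribute resolves to None instead of raising AttributeError.
#             return None
#
#     ModelType = ApexGuardDefaults()
#     assert ModelType.encoder_and_decoder is None  # safe even without apex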
    MegatronRetrievalTransformerDecoderModule,
)
from nemo.collections.nlp.modules.common.megatron.utils import (
    ApexGuardDefaults,
    init_method_normal,
    scaled_init_method_normal,
)

try:
    from apex.transformer.enums import AttnMaskType, ModelType

    HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
    HAVE_APEX = False

    # fake missing classes with None attributes
    AttnMaskType = ApexGuardDefaults()
    ModelType = ApexGuardDefaults()

__all__ = []

AVAILABLE_DECODERS = ["transformer"]


def get_decoder_model(
    arch,
    hidden_size,
    ffn_hidden_size,
    num_layers,
    num_attention_heads,
    apply_query_key_layer_scaling=True,
    kv_channels=None,
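    # Hedged usage sketch for ``get_decoder_model`` (hypothetical values; the
    # signature above is truncated, so any further required arguments are
    # omitted, and apex must be installed for the real call to succeed):
    #
    #     decoder = get_decoder_model(
    #         arch="transformer",  # must be one of AVAILABLE_DECODERS
    #         hidden_size=768,
    #         ffn_hidden_size=3072,
    #         num_layers=12,
    #         num_attention_heads=12,
    #     )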
    ApexGuardDefaults,
    get_linear_layer,
    init_method_normal,
    scaled_init_method_normal,
)

try:
    from apex.transformer import tensor_parallel
    from apex.transformer.enums import AttnMaskType

    HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
    HAVE_APEX = False

    # fake missing classes with None attributes
    AttnMaskType = ApexGuardDefaults()
    LayerType = ApexGuardDefaults()


def get_language_model(
    hidden_size,
    ffn_hidden_size,
    num_layers,
    max_position_embeddings,
    num_tokentypes,
    add_pooler,
    vocab_size,
    num_attention_heads,
    encoder_attn_mask_type,
    apply_query_key_layer_scaling=True,
    kv_channels=None,
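    # Hedged usage sketch for ``get_language_model`` (hypothetical values; the
    # signature above is truncated, so further arguments are omitted, and the
    # ``(module, key)`` return pair of Megatron-style implementations is
    # assumed rather than confirmed by this excerpt):
    #
    #     language_model, lm_key = get_language_model(
    #         hidden_size=768,
    #         ffn_hidden_size=3072,
    #         num_layers=12,
    #         max_position_embeddings=512,
    #         num_tokentypes=0,
    #         add_pooler=False,
    #         vocab_size=30522,
    #         num_attention_heads=12,
    #         encoder_attn_mask_type=AttnMaskType.padding,
    #     )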