def __init__(self, num_tokentypes=2, add_binary_head=True,
             parallel_output=True):
    super(BertModelBase, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output
    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=self.add_binary_head,
        init_method=init_method,
        scaled_init_method=scaled_init_method)

    self.initialize_word_embeddings(init_method_normal)
    # The LM and binary heads exist only on the last pipeline stage; the
    # LM head ties its output projection to the shared word embeddings.
    if mpu.is_pipeline_last_stage():
        self.lm_head = BertLMHead(
            self.word_embeddings_weight().size(0),
            args.hidden_size, init_method, args.layernorm_epsilon,
            parallel_output)
        self._lm_head_key = 'lm_head'
        self.binary_head = None
        if self.add_binary_head:
            self.binary_head = get_linear_layer(args.hidden_size, 2,
                                                init_method)
            self._binary_head_key = 'binary_head'
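# A minimal torch-only sketch of the tied LM-head pattern used above: the
# output projection reuses the shared word-embedding weight, which is why
# BertLMHead is constructed with word_embeddings_weight().size(0) as its
# vocabulary size. TiedLMHead and all shapes below are illustrative, not
# Megatron's actual BertLMHead implementation.
import torch


class TiedLMHead(torch.nn.Module):
    def __init__(self, hidden_size, layernorm_epsilon=1e-5):
        super().__init__()
        self.dense = torch.nn.Linear(hidden_size, hidden_size)
        self.layernorm = torch.nn.LayerNorm(hidden_size,
                                            eps=layernorm_epsilon)

    def forward(self, hidden_states, word_embeddings_weight):
        # Transform the hidden states, then project onto the vocabulary
        # with the embedding matrix (weight tying).
        hidden = self.layernorm(
            torch.nn.functional.gelu(self.dense(hidden_states)))
        return torch.nn.functional.linear(hidden, word_embeddings_weight)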
def __init__(self, num_tokentypes=2, add_binary_head=True,
             parallel_output=True):
    super(BertModel, self).__init__()
    args = get_args()

    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output
    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=self.add_binary_head,
        init_method=init_method,
        scaled_init_method=scaled_init_method)

    self.lm_head = BertLMHead(
        self.language_model.embedding.word_embeddings.weight.size(0),
        args.hidden_size, init_method, args.layernorm_epsilon,
        parallel_output)
    self._lm_head_key = 'lm_head'

    if self.add_binary_head:
        self.binary_head = get_linear_layer(args.hidden_size, 2,
                                            init_method)
        self._binary_head_key = 'binary_head'
def __init__(self, num_tokentypes=2, parallel_output=True,
             pre_process=True, post_process=True):
    super(PretrainedBertModel, self).__init__()
    args = get_args()
    tokenizer = get_tokenizer()
    self.pad_id = tokenizer.pad
    self.biencoder_projection_dim = args.biencoder_projection_dim
    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process
    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=False,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=self.pre_process,
        post_process=self.post_process)

    # Optional linear projection that maps the encoder output down to the
    # biencoder embedding size when a projection dimension is configured.
    if args.biencoder_projection_dim > 0:
        self.projection_enc = get_linear_layer(
            args.hidden_size, args.biencoder_projection_dim, init_method)
        self._projection_enc_key = 'projection_enc'
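# Hedged sketch of how the optional biencoder projection is typically applied:
# pool the encoder output, then map it down to biencoder_projection_dim. The
# [CLS]-style pooling and the shapes are assumptions for illustration, not
# code taken from PretrainedBertModel.
import torch

hidden_size, projection_dim = 1024, 128
projection_enc = torch.nn.Linear(hidden_size, projection_dim)

sequence_output = torch.randn(8, 256, hidden_size)  # [batch, seq, hidden]
pooled = sequence_output[:, 0]                      # assumed [CLS] pooling
embedding = projection_enc(pooled)                  # [batch, projection_dim]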
def __init__(self, num_tokentypes=2):
    super(MultipleChoice, self).__init__()
    args = get_args()

    init_method = init_method_normal(args.init_method_std)

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        init_method=init_method,
        scaled_init_method=scaled_init_method_normal(
            args.init_method_std, args.num_layers))

    # Multi-choice head.
    self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
    self.multichoice_head = get_linear_layer(args.hidden_size, 1,
                                             init_method)
    self._multichoice_head_key = 'multichoice_head'
def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
    super(IREncoderBertModel, self).__init__()
    args = get_args()

    self.ict_head_size = ict_head_size
    self.parallel_output = parallel_output
    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        init_method=init_method,
        scaled_init_method=scaled_init_method)

    # Projection head that maps the pooled encoder output to the
    # retrieval-embedding size used for inverse cloze task (ICT) training.
    self.ict_head = get_linear_layer(args.hidden_size, ict_head_size,
                                     init_method)
    self._ict_head_key = 'ict_head'
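# Illustrative sketch of how an ICT head is commonly used: pooled query and
# context encodings are projected through ict_head and scored against each
# other, here with in-batch negatives. This training setup is an assumption,
# not code from IREncoderBertModel.
import torch

hidden_size, ict_head_size = 1024, 128
ict_head = torch.nn.Linear(hidden_size, ict_head_size)

query_pooled = torch.randn(16, hidden_size)    # pooled query encodings
context_pooled = torch.randn(16, hidden_size)  # pooled context encodings
scores = ict_head(query_pooled) @ ict_head(context_pooled).t()  # [16, 16]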
def __init__(
    self,
    model_name,
    vocab_file,
    hidden_size=1024,
    num_attention_heads=16,
    num_layers=24,
    max_seq_length=512,
    tokenizer_type='BertWordPieceLowerCase',
    init_method_std=0.02,
    num_tokentypes=2,
):
    super().__init__()

    if not os.path.exists(vocab_file):
        raise ValueError(f'Vocab file not found at {vocab_file}')

    # Megatron is initialized from explicit keyword arguments rather than
    # the command line; unknown arguments are ignored.
    megatron_args = {
        "num_layers": num_layers,
        "hidden_size": hidden_size,
        "num_attention_heads": num_attention_heads,
        "max_position_embeddings": max_seq_length,
        "tokenizer_type": tokenizer_type,
        "vocab_file": vocab_file,
    }
    initialize_megatron(None, megatron_args, ignore_unknown_args=True)

    init_method = init_method_normal(init_method_std)

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=False,
        init_method=init_method,
        scaled_init_method=scaled_init_method_normal(init_method_std,
                                                     num_layers),
    )

    # self._device is expected to be set by the parent class.
    self.language_model.to(self._device)
    self._hidden_size = self.language_model.hidden_size
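# A hypothetical instantiation of the standalone encoder above; the class
# name BertEncoder and the vocab path are placeholders for whatever the
# surrounding module actually exports. Note that self._device must already
# be provided by a parent class, since __init__ moves the model onto it.
encoder = BertEncoder(                # hypothetical class name
    model_name='bert-large-uncased',
    vocab_file='/path/to/vocab.txt',  # must exist, or __init__ raises
    hidden_size=1024,
    num_attention_heads=16,
    num_layers=24,
)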
def __init__(self, num_classes, num_tokentypes=2):
    super(ClassificationBase, self).__init__(share_word_embeddings=False)
    args = get_args()

    self.num_classes = num_classes
    init_method = init_method_normal(args.init_method_std)

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        init_method=init_method,
        scaled_init_method=scaled_init_method_normal(
            args.init_method_std, args.num_layers))

    # Classification head (built only on the last pipeline stage).
    if mpu.is_pipeline_last_stage():
        self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
        self.classification_head = get_linear_layer(
            args.hidden_size, self.num_classes, init_method)
        self._classification_head_key = 'classification_head'
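# Minimal torch-only sketch of the classification head's forward path,
# assuming the pooled [CLS] representation produced by the language model's
# pooler; shapes and dropout probability are illustrative assumptions.
import torch

hidden_size, num_classes = 1024, 3
classification_dropout = torch.nn.Dropout(0.1)
classification_head = torch.nn.Linear(hidden_size, num_classes)

pooled_output = torch.randn(8, hidden_size)  # pooled output, per example
logits = classification_head(classification_dropout(pooled_output))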
def __init__(self, num_tokentypes=0, parallel_output=True):
    super(T5Model, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.parallel_output = parallel_output
    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=False,
        add_decoder=True,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method)

    self.lm_head = T5LMHead(
        self.language_model.embedding.word_embeddings.weight.size(0),
        parallel_output)
    self._lm_head_key = 'lm_head'
def __init__(self, num_tokentypes=2, add_binary_head=True,
             parallel_output=True, pre_process=True, post_process=True):
    super(BertModel, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process

    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=self.add_binary_head,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=self.pre_process,
        post_process=self.post_process)

    self.initialize_word_embeddings(init_method_normal)

    # The LM and binary heads live only on the last pipeline stage.
    if self.post_process:
        self.lm_head = BertLMHead(
            self.word_embeddings_weight().size(0),
            args.hidden_size, init_method, args.layernorm_epsilon,
            parallel_output)
        self._lm_head_key = 'lm_head'
        self.binary_head = None
        if self.add_binary_head:
            self.binary_head = get_linear_layer(args.hidden_size, 2,
                                                init_method)
            self._binary_head_key = 'binary_head'
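# Hedged usage sketch: under pipeline parallelism each rank builds only its
# slice of the model, so pre_process/post_process are set per stage. This
# mirrors Megatron's model-provider pattern, but model_provider here is a
# sketch, not the exact function from the repository.
def model_provider(pre_process=True, post_process=True):
    return BertModel(
        num_tokentypes=2,
        add_binary_head=True,
        parallel_output=True,
        pre_process=pre_process,    # True only on the first pipeline stage
        post_process=post_process)  # True only on the last pipeline stage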
def __init__(self, num_tokentypes=0, parallel_output=True,
             pre_process=True, post_process=True,
             add_encoder=True, add_decoder=True):
    super(T5Model, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.parallel_output = parallel_output
    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)
    self.pre_process = pre_process
    self.post_process = post_process
    # With pipeline parallelism, a rank may build only the encoder or
    # only the decoder half of the model.
    self.add_encoder = add_encoder
    self.add_decoder = add_decoder

    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=False,
        add_encoder=add_encoder,
        add_decoder=add_decoder,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=self.pre_process,
        post_process=self.post_process)

    self.initialize_word_embeddings(init_method_normal)

    if self.post_process and self.add_decoder:
        self.lm_head = T5LMHead(self.word_embeddings_weight().size(0),
                                parallel_output)
        self._lm_head_key = 'lm_head'
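# Sketch of the encoder/decoder split the constructor above supports: with
# enough pipeline stages, some ranks can host only the encoder and others
# only the decoder. The flag combinations below are illustrative; real
# assignments come from the pipeline-parallel rank, and Megatron must be
# initialized before either call.
encoder_only = T5Model(add_encoder=True, add_decoder=False,
                       pre_process=True, post_process=True)
decoder_only = T5Model(add_encoder=False, add_decoder=True,
                       pre_process=True, post_process=True)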
def __init__(self, num_tokentypes=2, pre_process=True, post_process=True):
    super(MultipleChoice, self).__init__(share_word_embeddings=False)
    args = get_args()

    init_method = init_method_normal(args.init_method_std)
    self.pre_process = pre_process
    self.post_process = post_process

    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method_normal(
            args.init_method_std, args.num_layers),
        pre_process=self.pre_process,
        post_process=self.post_process)

    # Multi-choice head.
    if self.post_process:
        self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
        self.multichoice_head = get_linear_layer(args.hidden_size, 1,
                                                 init_method)
        self._multichoice_head_key = 'multichoice_head'
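# Illustrative torch-only sketch of the multiple-choice scoring pattern the
# head above supports: choices are folded into the batch dimension, each
# choice's pooled output is scored by the single-output head, and the scores
# are unfolded back to [batch, num_choices]. Shapes are assumptions.
import torch

batch, num_choices, hidden_size = 4, 4, 1024
multichoice_head = torch.nn.Linear(hidden_size, 1)

pooled = torch.randn(batch * num_choices, hidden_size)  # one row per choice
logits = multichoice_head(pooled).view(batch, num_choices)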