def __init__(self, config):
    super().__init__(config)
    self.config = config
    self.embeddings = NeZhaEmbeddings(config)
    self.encoder = NeZhaEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config):
    super(BertModel, self).__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
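Most of the constructors in this collection follow the same embeddings -> encoder -> pooler composition. For reference, a minimal sketch of the matching forward pass is below; the argument names and the attention-mask expansion are assumptions modeled on the Hugging Face BertModel, not taken from any single snippet here.

import torch

def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    # Expand the 2D padding mask into the additive 4D mask the encoder expects.
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    extended_mask = attention_mask[:, None, None, :].to(dtype=torch.float32)
    extended_mask = (1.0 - extended_mask) * -10000.0

    embedding_output = self.embeddings(input_ids, token_type_ids=token_type_ids)
    encoder_outputs = self.encoder(embedding_output, attention_mask=extended_mask)
    sequence_output = encoder_outputs[0]
    # BertPooler runs the hidden state of the first ([CLS]) token
    # through a dense layer followed by tanh.
    pooled_output = self.pooler(sequence_output)
    return sequence_output, pooled_output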
def __init__(self, bert, opt):
    super(BERT_BASE, self).__init__()
    self.bert = bert
    self.opt = opt
    self.dropout = nn.Dropout(opt.dropout)
    self.pooler = BertPooler(bert.config)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
def __init__(self, config):
    super(BertImgModel, self).__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)  # CaptionBertEncoder(config)
    self.pooler = BertPooler(config)

    self.img_dim = config.img_feature_dim
    logger.info('BertImgModel Image Dimension: {}'.format(self.img_dim))
    self.img_feature_type = config.img_feature_type
    if hasattr(config, 'use_img_layernorm'):
        self.use_img_layernorm = config.use_img_layernorm
    else:
        self.use_img_layernorm = None

    if config.img_feature_type == 'dis_code':
        self.code_embeddings = nn.Embedding(config.code_voc, config.code_dim, padding_idx=0)
        self.img_embedding = nn.Linear(config.code_dim, self.config.hidden_size, bias=True)
    elif config.img_feature_type == 'dis_code_t':  # transpose
        self.code_embeddings = nn.Embedding(config.code_voc, config.code_dim, padding_idx=0)
        self.img_embedding = nn.Linear(config.code_size, self.config.hidden_size, bias=True)
    elif config.img_feature_type == 'dis_code_scale':  # scaled
        self.input_embeddings = nn.Linear(config.code_dim, config.code_size, bias=True)
        self.code_embeddings = nn.Embedding(config.code_voc, config.code_dim, padding_idx=0)
        self.img_embedding = nn.Linear(config.code_dim, self.config.hidden_size, bias=True)
    else:
        self.img_embedding = nn.Linear(self.img_dim, self.config.hidden_size, bias=True)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    if self.use_img_layernorm:
        self.LayerNorm = LayerNorm(config.hidden_size, eps=config.img_layer_norm_eps)
def __init__(
    self,
    config,
    visual_embedding_dim=512,
    embedding_strategy="plain",
    bypass_transformer=False,
    output_attentions=False,
    output_hidden_states=False,
):
    super().__init__(config)
    self.config = config
    config.visual_embedding_dim = visual_embedding_dim
    config.embedding_strategy = embedding_strategy
    config.bypass_transformer = bypass_transformer
    config.output_attentions = output_attentions
    config.output_hidden_states = output_hidden_states

    self.embeddings = BertVisioLinguisticEmbeddings(config)
    self.encoder = BertEncoderJit(config)
    self.pooler = BertPooler(config)
    self.bypass_transformer = config.bypass_transformer
    if self.bypass_transformer:
        self.additional_layer = BertLayerJit(config)
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states
    self.init_weights()
def __init__(self, config, tokenizer, device):
    super().__init__()
    self.config = config
    self.tokenizer = tokenizer
    self.embeddings = BertEmbeddings(self.config)
    self.corrector = BertEncoder(self.config)
    self.mask_token_id = self.tokenizer.mask_token_id
    self.pooler = BertPooler(self.config)
    self.cls = BertOnlyMLMHead(self.config)
    self._device = device
def __init__(self, config, args):
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.MAG = MAG(config, args)
    self.init_weights()
def __init__(self, config, add_pooling_layer=True):
    super().__init__(config)
    self.config = config
    self.embeddings = KBertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config) if add_pooling_layer else None
    self.init_weights()
def __init__(self, config):
    super(BertImgModel, self).__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = CaptionBertEncoder(config)
    self.pooler = BertPooler(config)
    self.img_dim = config.img_feature_dim
    logger.info('BertImgModel Image Dimension: {}'.format(self.img_dim))
    # self.apply(self.init_weights)
    self.init_weights()
def __init__(self, count, config, num_labels):
    super(HSUM, self).__init__()
    self.count = count
    self.num_labels = num_labels
    self.pre_layers = torch.nn.ModuleList()
    self.loss_fct = torch.nn.ModuleList()
    self.pooler = BertPooler(config)
    self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
    for i in range(count):
        self.pre_layers.append(BertLayer(config))
        self.loss_fct.append(torch.nn.CrossEntropyLoss(ignore_index=-1))
def _build_word_embedding(self):
    self.bert_config = BertConfig.from_pretrained(self.config.bert_model_name)
    if self.config.pretrained_bert:
        bert_model = BertForPreTraining.from_pretrained(self.config.bert_model_name)
        self.word_embedding = bert_model.bert.embeddings
        self.pooler = bert_model.bert.pooler
        self.pooler.apply(self.init_weights)
    else:
        self.pooler = BertPooler(self.bert_config)
        self.word_embedding = BertEmbeddings(self.bert_config)
def __init__(self, config: Config, *args, **kwargs):
    super().__init__(config, *args, **kwargs)

    # Head modules
    self.pooler = BertPooler(self.config)
    self.classifier = nn.Sequential(
        nn.Dropout(self.config.hidden_dropout_prob),
        BertPredictionHeadTransform(self.config),
        nn.Linear(self.config.hidden_size, self.config.num_labels),
    )
    self.num_labels = self.config.num_labels
    self.hidden_size = self.config.hidden_size
def __init__(self, hparams):
    super().__init__()
    # NOTE: recent PyTorch Lightning versions disallow assigning self.hparams
    # directly; save_hyperparameters() alone is the preferred form there.
    self.hparams = hparams
    self.save_hyperparameters()
    config = BertConfig()
    # self.model = BertForSequenceClassification.from_pretrained(self.hparams.model_name, num_labels=self.hparams.n_class)
    self.model = AutoModel.from_pretrained(self.hparams.model_name)
    self.pooler = BertPooler(config)
    # self.attention = MultiheadedAttention(h_dim=self.hparams.h_dim, kqv_dim=self.hparams.kqv_dim, n_heads=self.hparams.n_heads)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, self.hparams.n_class)
    self.loss = nn.CrossEntropyLoss()
def __init__(self, bert, opt):
    super(LCF_BERT, self).__init__()
    self.bert4global = bert
    self.bert4local = copy.deepcopy(bert) if opt.use_dual_bert else self.bert4global
    self.opt = opt
    self.dropout = nn.Dropout(opt.dropout)
    self.bert_SA = SelfAttention(bert.config, opt)
    self.linear2 = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.linear3 = nn.Linear(opt.embed_dim * 3, opt.embed_dim)
    self.bert_pooler = BertPooler(bert.config)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
def __init__(self, bert, opt):
    super(LFC_BERT, self).__init__()
    self.bert_spc = bert
    self.opt = opt
    self.bert_local = bert
    self.dropout = nn.Dropout(opt['dropout'])
    self.bert_SA = SelfAttention(bert.config, opt)
    self.linear_double = nn.Linear(opt['bert_dim'] * 2, opt['bert_dim'])
    self.linear_single = nn.Linear(opt['bert_dim'], opt['bert_dim'])
    self.bert_pooler = BertPooler(bert.config)
    self.dense = nn.Linear(opt['bert_dim'], opt['polarities_dim'])
def __init__(self, bert, opt):
    super(LCF_BERT, self).__init__()
    self.bert_spc = bert
    self.opt = opt
    # self.bert_local = copy.deepcopy(bert)  # Uncomment this line to use dual BERT
    self.bert_local = bert  # Default to a single shared BERT to reduce memory requirements
    self.dropout = nn.Dropout(opt.dropout)
    self.bert_SA = SelfAttention(bert.config, opt)
    self.linear_double = nn.Linear(opt.bert_dim * 2, opt.bert_dim)
    self.linear_single = nn.Linear(opt.bert_dim, opt.bert_dim)
    self.bert_pooler = BertPooler(bert.config)
    self.dense = nn.Linear(opt.bert_dim, opt.polarities_dim)
def __init__(self, config, add_pooling_layer=True):
    # Call the __init__ one parent class up; otherwise the model would be defined twice.
    BertPreTrainedModel.__init__(self, config)
    self.config = config

    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config) if add_pooling_layer else None

    # Sparsify linear modules.
    self.sparsify_model()

    self.init_weights()
def __init__(self, config, num_choices=2):
    super(BertForMultipleChoiceWithMatch, self).__init__(config)
    self.num_choices = num_choices
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, 1)
    self.classifier2 = nn.Linear(2 * config.hidden_size, 1)
    self.classifier3 = nn.Linear(3 * config.hidden_size, 1)
    self.classifier4 = nn.Linear(4 * config.hidden_size, 1)
    self.classifier6 = nn.Linear(6 * config.hidden_size, 1)
    self.ssmatch = SSingleMatchNet(config)
    self.pooler = BertPooler(config)
    self.fuse = FuseNet(config)
    self.init_weights()
def __init__(self, config: LukeConfig):
    super(LukeModel, self).__init__()
    self.config = config

    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)

    if self.config.bert_model_name and "roberta" in self.config.bert_model_name:
        self.embeddings = RobertaEmbeddings(config)
        # Freeze the token-type embedding weights. (Setting requires_grad on the
        # Embedding module itself, as the original did, has no effect on its parameters.)
        self.embeddings.token_type_embeddings.weight.requires_grad = False
    else:
        self.embeddings = BertEmbeddings(config)
    self.entity_embeddings = EntityEmbeddings(config)
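Freezing via requires_grad is easy to get wrong: setting the flag on a Module instead of on its Parameters does nothing, which is why the line above targets .weight. A small self-contained check of that behavior:

import torch.nn as nn

emb = nn.Embedding(2, 8)
emb.requires_grad = False                               # plain attribute; parameters unaffected
print(all(p.requires_grad for p in emb.parameters()))   # True - still trainable
emb.weight.requires_grad = False                        # actually freezes the weights
print(emb.weight.requires_grad)                         # False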
def __init__(self, bert, opt):
    super(SLIDE_LCF_BERT, self).__init__()
    self.bert4global = bert
    self.bert4local = copy.deepcopy(bert) if opt.use_dual_bert else self.bert4global
    self.opt = opt
    self.dropout = nn.Dropout(opt.dropout)
    self.encoder = Encoder(bert.config, opt)
    self.encoder_left = Encoder(bert.config, opt)
    self.encoder_right = Encoder(bert.config, opt)
    self.linear2 = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.linear_window_3h = nn.Linear(opt.embed_dim * 3, opt.embed_dim)
    self.linear_window_2h = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.bert_pooler = BertPooler(bert.config)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
def __init__(self, embedding_matrix, opt):
    super(LCA_GLOVE, self).__init__()
    # Only a few of the parameters in bert_config.json are needed here,
    # e.g. hidden_size and num_attention_heads
    self.config = BertConfig.from_json_file("modules/utils/bert_config.json")
    self.opt = opt
    self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
    self.lc_embed = nn.Embedding(2, opt.embed_dim)
    self.global_encoder1 = SelfAttention(self.config, opt)
    self.local_encoder1 = SelfAttention(self.config, opt)
    self.local_encoder2 = SelfAttention(self.config, opt)
    self.mha = SelfAttention(self.config, opt)
    self.pool = BertPooler(self.config)
    self.dropout = nn.Dropout(opt.dropout)
    self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
    self.classifier = nn.Linear(opt.embed_dim, 2)
def __init__(
    self,
    pretrained_model: str = 'bert-large-uncased',
    num_choices: int = 4,
    learning_rate: float = 2e-5,
    gradient_accumulation_steps: int = 1,
    num_train_epochs: float = 3.0,
    train_batch_size: int = 32,
    warmup_proportion: float = 0.1,
    train_all: bool = False,
    use_bert_adam: bool = True,
):
    super().__init__()
    # Pass the num_choices argument through (the original hard-coded 4 here,
    # silently ignoring the parameter).
    self.config = BertConfig.from_pretrained(pretrained_model, num_choices=num_choices)
    self.bert = BertModel.from_pretrained(pretrained_model, config=self.config)
    self.num_choices = num_choices
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Linear(3 * self.config.hidden_size, 1)
    self.ssmatch = SSingleMatchNet(self.config)
    self.pooler = BertPooler(self.config)
    self.fuse = FuseNet(self.config)

    if not train_all:
        for param in self.bert.parameters():
            param.requires_grad = False
        for param in self.bert.pooler.parameters():
            param.requires_grad = True
        # for param in self.bert.encoder.layer[15:24].parameters():
        #     param.requires_grad = True
        # for param in self.bert.encoder.layer[15].output.parameters():
        #     param.requires_grad = True

    # Print model layers and config
    print(self.config)
    for name, params in self.named_parameters():
        print('-->name:', name, '-->requires_grad:', params.requires_grad)

    self.learning_rate = learning_rate
    self.gradient_accumulation_steps = gradient_accumulation_steps
    self.num_train_epochs = num_train_epochs
    self.train_batch_size = train_batch_size
    self.warmup_proportion = warmup_proportion
    self.use_bert_adam = use_bert_adam

    self.warmup_steps = 0
    self.total_steps = 0
def __init__(self, embedding_matrix, opt):
    super(LCF_GLOVE, self).__init__()
    self.config = BertConfig.from_json_file("modules/utils/bert_config.json")
    self.opt = opt
    self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
    self.mha_global = SelfAttention(self.config, opt)
    self.mha_local = SelfAttention(self.config, opt)
    self.ffn_global = PositionwiseFeedForward(self.opt.embed_dim, dropout=self.opt.dropout)
    self.ffn_local = PositionwiseFeedForward(self.opt.embed_dim, dropout=self.opt.dropout)
    self.mha_local_SA = SelfAttention(self.config, opt)
    self.mha_global_SA = SelfAttention(self.config, opt)
    self.pool = BertPooler(self.config)
    self.dropout = nn.Dropout(opt.dropout)
    self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
def __init__(self, config, visual_embedding_dim):
    super().__init__()

    # Attributes
    self.config = config
    self.config.visual_embedding_dim = visual_embedding_dim
    self.num_labels = config.num_labels

    # Build BERT
    self.embeddings = BertVisioLinguisticEmbeddings(self.config)
    self.encoder = BertEncoder(self.config)
    self.pooler = BertPooler(self.config)

    # Add classification head
    # (a sigmoid activation is added to smooth the output)
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Sequential(
        BertPredictionHeadTransform(self.config),
        nn.Linear(self.config.hidden_size, self.num_labels),
        nn.Sigmoid(),
    )

    self.init_weights()
def __init__(self, bert_base_model, args):
    super(LCF_ATEPC, self).__init__(config=bert_base_model.config)
    config = bert_base_model.config
    self.bert_for_global_context = bert_base_model
    self.args = args
    # Do not init the LCF layer if BERT-SPC or BERT-BASE is specified
    # if self.args.local_context_focus in {'cdw', 'cdm', 'fusion'}:
    if not self.args.use_unique_bert:
        self.bert_for_local_context = copy.deepcopy(self.bert_for_global_context)
    else:
        self.bert_for_local_context = self.bert_for_global_context
    self.pooler = BertPooler(config)
    # NOTE: the hard-coded 768 below assumes a bert-base hidden size;
    # config.hidden_size would be the safer choice.
    if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
        self.dense = torch.nn.Linear(768, 2)
    else:
        self.dense = torch.nn.Linear(768, 3)
    self.bert_global_focus = self.bert_for_global_context
    self.dropout = nn.Dropout(self.args.dropout)
    self.SA1 = SelfAttention(config, args)
    self.SA2 = SelfAttention(config, args)
    self.linear_double = nn.Linear(768 * 2, 768)
    self.linear_triple = nn.Linear(768 * 3, 768)
def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pytorch_dump_path: str):
    def get_masked_lm_array(name: str):
        full_name = f"masked_lm/{name}/.ATTRIBUTES/VARIABLE_VALUE"
        array = tf.train.load_variable(tf_checkpoint_path, full_name)
        if "kernel" in name:
            array = array.transpose()
        return torch.from_numpy(array)

    def get_encoder_array(name: str):
        full_name = f"encoder/{name}/.ATTRIBUTES/VARIABLE_VALUE"
        array = tf.train.load_variable(tf_checkpoint_path, full_name)
        if "kernel" in name:
            array = array.transpose()
        return torch.from_numpy(array)

    def get_encoder_layer_array(layer_index: int, name: str):
        full_name = f"encoder/_transformer_layers/{layer_index}/{name}/.ATTRIBUTES/VARIABLE_VALUE"
        array = tf.train.load_variable(tf_checkpoint_path, full_name)
        if "kernel" in name:
            array = array.transpose()
        return torch.from_numpy(array)

    def get_encoder_attention_layer_array(layer_index: int, name: str, original_shape):
        full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE"
        array = tf.train.load_variable(tf_checkpoint_path, full_name)
        array = array.reshape(original_shape)
        if "kernel" in name:
            array = array.transpose()
        return torch.from_numpy(array)

    print(f"Loading model based on config from {config_path}...")
    config = BertConfig.from_json_file(config_path)
    model = BertForMaskedLM(config)

    # Layers
    for layer_index in range(0, config.num_hidden_layers):
        layer: BertLayer = model.bert.encoder.layer[layer_index]

        # Self-attention
        self_attn: BertSelfAttention = layer.attention.self
        self_attn.query.weight.data = get_encoder_attention_layer_array(
            layer_index, "_query_dense/kernel", self_attn.query.weight.data.shape)
        self_attn.query.bias.data = get_encoder_attention_layer_array(
            layer_index, "_query_dense/bias", self_attn.query.bias.data.shape)
        self_attn.key.weight.data = get_encoder_attention_layer_array(
            layer_index, "_key_dense/kernel", self_attn.key.weight.data.shape)
        self_attn.key.bias.data = get_encoder_attention_layer_array(
            layer_index, "_key_dense/bias", self_attn.key.bias.data.shape)
        self_attn.value.weight.data = get_encoder_attention_layer_array(
            layer_index, "_value_dense/kernel", self_attn.value.weight.data.shape)
        self_attn.value.bias.data = get_encoder_attention_layer_array(
            layer_index, "_value_dense/bias", self_attn.value.bias.data.shape)

        # Self-attention output
        self_output: BertSelfOutput = layer.attention.output
        self_output.dense.weight.data = get_encoder_attention_layer_array(
            layer_index, "_output_dense/kernel", self_output.dense.weight.data.shape)
        self_output.dense.bias.data = get_encoder_attention_layer_array(
            layer_index, "_output_dense/bias", self_output.dense.bias.data.shape)
        self_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/gamma")
        self_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/beta")

        # Intermediate
        intermediate: BertIntermediate = layer.intermediate
        intermediate.dense.weight.data = get_encoder_layer_array(layer_index, "_intermediate_dense/kernel")
        intermediate.dense.bias.data = get_encoder_layer_array(layer_index, "_intermediate_dense/bias")

        # Output
        bert_output: BertOutput = layer.output
        bert_output.dense.weight.data = get_encoder_layer_array(layer_index, "_output_dense/kernel")
        bert_output.dense.bias.data = get_encoder_layer_array(layer_index, "_output_dense/bias")
        bert_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_output_layer_norm/gamma")
        bert_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_output_layer_norm/beta")

    # Embeddings
    model.bert.embeddings.position_embeddings.weight.data = get_encoder_array("_position_embedding_layer/embeddings")
    model.bert.embeddings.token_type_embeddings.weight.data = get_encoder_array("_type_embedding_layer/embeddings")
    model.bert.embeddings.LayerNorm.weight.data = get_encoder_array("_embedding_norm_layer/gamma")
    model.bert.embeddings.LayerNorm.bias.data = get_encoder_array("_embedding_norm_layer/beta")

    # LM head
    lm_head = model.cls.predictions.transform
    lm_head.dense.weight.data = get_masked_lm_array("dense/kernel")
    lm_head.dense.bias.data = get_masked_lm_array("dense/bias")
    lm_head.LayerNorm.weight.data = get_masked_lm_array("layer_norm/gamma")
    lm_head.LayerNorm.bias.data = get_masked_lm_array("layer_norm/beta")
    model.bert.embeddings.word_embeddings.weight.data = get_masked_lm_array("embedding_table")

    # Pooling
    model.bert.pooler = BertPooler(config=config)
    model.bert.pooler.dense.weight.data = get_encoder_array("_pooler_layer/kernel")
    model.bert.pooler.dense.bias.data = get_encoder_array("_pooler_layer/bias")

    # Export the final model
    model.save_pretrained(pytorch_dump_path)

    # Integration test - should load without any errors ;)
    new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path)
    print(new_model.eval())

    print("Model conversion was done successfully!")
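Conversion scripts like this are usually run from the command line. A minimal sketch of such a wrapper, assuming argparse; the flag names are illustrative, not taken from the snippet above:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True,
                        help="Path to the TensorFlow 2.x checkpoint.")
    parser.add_argument("--bert_config_file", type=str, required=True,
                        help="JSON config file matching the pre-trained model.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True,
                        help="Where to write the converted PyTorch model.")
    args = parser.parse_args()
    convert_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                  args.bert_config_file,
                                  args.pytorch_dump_path)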
class TestBertPooler(unittest.TestCase):
    def init_data(self, use_cuda: bool) -> None:
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(4)

        torch.set_grad_enabled(False)
        self.cfg = BertConfig()

        self.torch_pooler = BertPooler(self.cfg)
        self.torch_pooler.to(self.test_device)
        self.torch_pooler.eval()

        self.turbo_pooler = turbo_transformers.BertPooler.from_torch(
            self.torch_pooler)

    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda=use_cuda)
        device = "GPU" if use_cuda else "CPU"
        num_iter = 2
        hidden_size = self.cfg.hidden_size
        # NOTE: batch_size is assumed to be defined at module scope
        # (e.g. substituted in by a test-generation template).
        input_tensor = torch.rand(size=(batch_size, 1, hidden_size),
                                  dtype=torch.float32,
                                  device=self.test_device)

        torch_model = lambda: self.torch_pooler(input_tensor)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f"BertPooler \"({batch_size},{hidden_size:03})\" ",
              f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

        turbo_model = lambda: self.turbo_pooler(
            input_tensor.reshape((batch_size, hidden_size)))
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f"BertPooler \"({batch_size},{hidden_size:03})\" ",
              f"{device} TurboTransformers QPS, {turbo_qps}, time, {turbo_time}")

        torch_result = torch_result.cpu().numpy()
        turbo_result = turbo_result.cpu().numpy()

        self.assertTrue(
            numpy.allclose(torch_result, turbo_result, rtol=1e-4, atol=1e-3))

        with open("bert_pooler_res.txt", "a") as fh:
            # Record both frameworks' QPS (the original wrote torch_qps twice).
            fh.write(
                f"\"({batch_size},{hidden_size:03})\", {torch_qps}, {turbo_qps}\n")

    def test_pooler(self):
        self.check_torch_and_turbo(use_cuda=False)
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
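For reference, the operation being benchmarked is small: BertPooler is just a dense layer plus tanh over the first token. A self-contained sketch of verifying that by hand, assuming the Hugging Face transformers import path and needing no turbo_transformers:

import numpy
import torch
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertPooler

cfg = BertConfig()
pooler = BertPooler(cfg).eval()

with torch.no_grad():
    hidden_states = torch.rand(2, 8, cfg.hidden_size)  # (batch, seq_len, hidden)
    pooled = pooler(hidden_states)
    # Manual equivalent: dense + tanh on the first ([CLS]) token.
    manual = torch.tanh(pooler.dense(hidden_states[:, 0]))
    assert numpy.allclose(pooled.numpy(), manual.numpy(), rtol=1e-6, atol=1e-6)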
def __init__(self, config):
    super().__init__()
    self.pooler = BertPooler(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
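A head like this is typically applied to an encoder's sequence output. A minimal sketch of the matching forward method; the method name and argument are assumptions, since the snippet only shows the constructor:

def forward(self, sequence_output):
    # sequence_output: (batch, seq_len, hidden_size) from a BERT encoder
    pooled = self.pooler(sequence_output)          # tanh(dense(CLS token))
    logits = self.classifier(self.dropout(pooled))
    return logits                                  # (batch, num_labels)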