def __init__(self, config):
    super(BertConnectionLayer, self).__init__()
    self.biattention = BertBiAttention(config)
    self.biOutput = BertBiOutput(config)

    v_config = BertConfig.from_dict(config.v_config)
    self.v_intermediate = BertIntermediate(v_config)
    self.v_output = BertOutput(v_config)

    t_config = BertConfig.from_dict(config.t_config)
    self.t_intermediate = BertIntermediate(t_config)
    self.t_output = BertOutput(t_config)
def __init__(self, config):
    super().__init__()
    # The cross-attention layer
    self.visual_attention = BertCrossattLayer(config)

    # Self-attention layers
    self.lang_self_att = BertAttention(config)
    self.visn_self_att = BertAttention(config)

    # Intermediate and output layers (FFNs)
    self.lang_inter = BertIntermediate(config)
    self.lang_output = BertOutput(config)
    self.visn_inter = BertIntermediate(config)
    self.visn_output = BertOutput(config)
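# A sketch of how such a cross-modality layer is wired in the LXMERT-style
# pattern the names above suggest. The call signatures are assumptions
# (query stream first, context stream second), not copied from this
# repository's forward():
def forward(self, lang_feats, lang_attention_mask, visn_feats, visn_attention_mask):
    # Cross attention: each stream queries the other (shared module).
    lang_att_output = self.visual_attention(
        lang_feats, visn_feats, ctx_att_mask=visn_attention_mask)
    visn_att_output = self.visual_attention(
        visn_feats, lang_feats, ctx_att_mask=lang_attention_mask)

    # Self attention within each stream.
    lang_att_output = self.lang_self_att(lang_att_output, lang_attention_mask)
    visn_att_output = self.visn_self_att(visn_att_output, visn_attention_mask)

    # Position-wise FFN per stream (intermediate + output with residual).
    lang_output = self.lang_output(self.lang_inter(lang_att_output), lang_att_output)
    visn_output = self.visn_output(self.visn_inter(visn_att_output), visn_att_output)
    return lang_output, visn_output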
def __init__(self, config):
    super().__init__()
    self.attention = MyBertAttention10(config)
    self.is_decoder = config.is_decoder
    if self.is_decoder:
        self.crossattention = BertAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = MyBertOutput10(config)
def __init__(self, config):
    super(BertGraphLayer, self).__init__()
    self.attention = BertGraphAttention(config)
    self.is_decoder = config.is_decoder
    if self.is_decoder:
        self.crossattention = BertGraphAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
def __init__(self, config):
    super(SpanAttentionLayer, self).__init__()
    # create modules
    self.attention = SpanAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
    # initialize weights
    self.init_weights()
def __init__(self, config):
    super(GLTPairCompose, self).__init__()
    self._non_compositional_reps = config.non_compositional_reps
    if config.non_compositional_reps:
        self.lstm = nn.LSTM(config.hidden_size, config.hidden_size,
                            num_layers=1, batch_first=True, bidirectional=True)

    self.attention = GLTPairComposeAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = GLTSelfOutput(config, dropout=True, dense=False, l_norm=True)
    self.weighted_output = GLTSelfOutput(config, dense=False, dropout=False)

    if config.grounded:
        self.meaning_query_2 = nn.Linear(config.hidden_size, 2)

    # Build the control gate over however many composition modules are enabled.
    self.control_gate = None
    n_options = 0
    if config.control_gate_add_skip:
        n_options += 1
    if config.control_gate_add_intersect:
        n_options += 1
    if config.control_gate_add_union:
        n_options += 1
    if config.control_gate_add_vis:
        n_options += 1
    if config.control_gate_add_extra_vis_module_left_branching:
        n_options += 1
    if n_options == 0:
        raise AttributeError("At least one module must be added")
    self.control_gate = nn.Linear(config.hidden_size, n_options)

    self.control_gate_add_skip = config.control_gate_add_skip
    self.control_gate_add_union = config.control_gate_add_union
    self.control_gate_add_intersect = config.control_gate_add_intersect
    self.control_gate_add_vis = config.control_gate_add_vis
    self.control_gate_add_extra_vis_module_left_branching = \
        config.control_gate_add_extra_vis_module_left_branching
    self.control_gate_set_vis_left_branching = config.control_gate_set_vis_left_branching

    if self.control_gate_add_vis:
        self.vis_text_text_comp = GLTVisualTextComp(config)

    self.constt_rep_lin = nn.Linear(config.hidden_size, config.hidden_size)
    self.activation = nn.GELU()
    self.dropout = nn.Dropout(config.layer_dropout_prob)
    # Deliberately aliased: both names refer to the same linear module.
    self.constt_energy = self.lin = nn.Linear(config.hidden_size, 1)
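# Hypothetical illustration of how a control gate like the one above is
# usually consumed: score the enabled composition modules from the pair
# representation and mix their outputs with softmax weights. The helper
# name and arguments below are placeholders, not taken from the GLT source.
import torch


def mix_module_outputs(control_gate, pair_rep, candidate_outputs):
    # pair_rep: (batch, hidden); candidate_outputs: one (batch, hidden)
    # tensor per enabled option, ordered as the gate's outputs were built.
    gate = torch.softmax(control_gate(pair_rep), dim=-1)  # (batch, n_options)
    stacked = torch.stack(candidate_outputs, dim=1)       # (batch, n_options, hidden)
    return (gate.unsqueeze(-1) * stacked).sum(dim=1)      # (batch, hidden)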
def __init__(self, config):
    super().__init__()
    self.chunk_size_feed_forward = config.chunk_size_feed_forward
    self.seq_len_dim = 1
    self.attention = BertAttention(config)
    self.is_decoder = config.is_decoder
    self.add_cross_attention = config.add_cross_attention
    if self.add_cross_attention:
        assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
        self.crossattention = BertAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
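# This __init__ mirrors the upstream transformers BertLayer. For context,
# a sketch of how chunk_size_feed_forward and seq_len_dim are consumed in
# the standard forward(): apply_chunking_to_forward (a real transformers
# helper; older releases export it from transformers.modeling_utils) runs
# the FFN slice-by-slice along the sequence dimension to bound peak
# activation memory. The method body follows the upstream pattern rather
# than being copied from this repository.
from transformers.pytorch_utils import apply_chunking_to_forward


def feed_forward_chunk(self, attention_output):
    intermediate_output = self.intermediate(attention_output)
    return self.output(intermediate_output, attention_output)

# inside forward(), once attention_output has been computed:
# layer_output = apply_chunking_to_forward(
#     self.feed_forward_chunk, self.chunk_size_feed_forward,
#     self.seq_len_dim, attention_output)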
def __init__(self, config):
    super(BertLayerOracleSparse, self).__init__()
    logger.info(
        f"Set Oracle Sparse with key_c:{config.key_c} and query_c:{config.query_c}!")
    self.attention = BertAttention(config)
    # Force the self-attention module to return its attention probabilities,
    # which this layer needs in order to apply the oracle sparsification.
    self.attention.self.output_attentions = True
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
    self.key_c = config.key_c
    self.query_c = config.query_c
    self.num_heads = config.num_attention_heads
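# A guess at the mechanism behind key_c/query_c, labelled as such: an
# "oracle" sparsifier that keeps only the key_c keys receiving the most
# total attention mass and renormalizes. This is illustrative only, not
# this repository's implementation; query-side pruning would be analogous.
import torch


def oracle_top_keys(attn_probs, key_c):
    # attn_probs: (batch, heads, query_len, key_len) softmax probabilities.
    key_mass = attn_probs.sum(dim=-2)                          # (batch, heads, key_len)
    topk = key_mass.topk(min(key_c, key_mass.size(-1)), dim=-1).indices
    keep = torch.zeros_like(key_mass).scatter_(-1, topk, 1.0)  # 1.0 for kept keys
    sparse = attn_probs * keep.unsqueeze(-2)
    return sparse / sparse.sum(dim=-1, keepdim=True).clamp_min(1e-9)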
def __init__(self, config):
    super(BertScanLayer, self).__init__()
    self.attention = BertAttention(config)
    self.scan_attention = BertScanAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
def __init__(self, config):
    super().__init__()
    self.attention = BertAttentionJit(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
class VariableNormTransformerLayer(nn.Module):
    def __init__(self, config):
        super(VariableNormTransformerLayer, self).__init__()
        self.config = config
        if self.config.norm_type == 'layer':
            self.attention_norm = nn.LayerNorm(config.hidden_size,
                                               eps=config.layer_norm_eps)
        elif self.config.norm_type == 'adanorm':
            self.attention_norm = AdaNorm(0.3, config.layer_norm_eps)
        elif self.config.norm_type == 'scalenorm':
            self.attention_norm = ScaleNorm(config.hidden_size ** 0.5)
        self.self_attention = BertSelfAttention(config)
        self.self_out = nn.Linear(config.hidden_size, config.hidden_size)
        self.self_dropout = nn.Dropout(config.hidden_dropout_prob)

        if self.config.norm_type == 'layer':
            self.ff_norm = nn.LayerNorm(config.hidden_size,
                                        eps=config.layer_norm_eps)
        elif self.config.norm_type == 'adanorm':
            self.ff_norm = AdaNorm(0.3, config.layer_norm_eps)
        elif self.config.norm_type == 'scalenorm':
            self.ff_norm = ScaleNorm(config.hidden_size ** 0.5)
        self.ff1 = BertIntermediate(config)
        self.ff2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.ff_dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, attention_mask=None, *args, **kwargs):
        residual = hidden_states
        if self.config.prenorm:
            hidden_states = self.attention_norm(hidden_states)

        # Self-attention sublayer
        if attention_mask is not None and attention_mask.ndim == 2:
            attention_mask = attention_mask[:, None, None, :]
        self_outputs = self.self_attention(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=self.config.output_attentions)
        hidden_states = self_outputs[0]
        # BertSelfAttention only appends the attention probabilities when
        # output_attentions is True, so index defensively instead of
        # unconditionally unpacking two values.
        attentions = self_outputs[1] if len(self_outputs) > 1 else None
        hidden_states = self.self_out(hidden_states)
        hidden_states = self.self_dropout(hidden_states) + residual
        if not self.config.prenorm:
            hidden_states = self.attention_norm(hidden_states)

        residual = hidden_states
        if self.config.prenorm:
            hidden_states = self.ff_norm(hidden_states)

        # Feed-forward sublayer. BertIntermediate already applies the
        # configured activation, so no extra GELU is needed between ff1
        # and ff2.
        hidden_states = self.ff1(hidden_states)
        hidden_states = self.ff2(hidden_states)
        hidden_states = self.ff_dropout(hidden_states) + residual
        if not self.config.prenorm:
            hidden_states = self.ff_norm(hidden_states)
        return hidden_states, attentions

    def load_from_bert(self, bert_layer):
        self.self_attention.load_state_dict(
            bert_layer.attention.self.state_dict())
        self.self_out.load_state_dict(
            bert_layer.attention.output.dense.state_dict())
        self.ff1.load_state_dict(bert_layer.intermediate.state_dict())
        self.ff2.load_state_dict(bert_layer.output.dense.state_dict())
        if self.config.norm_type == "layer":
            self.attention_norm.load_state_dict(
                bert_layer.attention.output.LayerNorm.state_dict())
            self.ff_norm.load_state_dict(
                bert_layer.output.LayerNorm.state_dict())
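# AdaNorm and ScaleNorm are referenced above but not defined in this file.
# Minimal sketches of both, matching the constructor signatures used above
# (AdaNorm(k, eps) and ScaleNorm(initial_scale)). They follow the papers
# (Xu et al., 2019; Nguyen & Salazar, 2019) rather than this repository's
# own implementations, so treat them as illustrative assumptions.
import torch
import torch.nn as nn


class ScaleNorm(nn.Module):
    """L2-normalize along the last dim, then apply one learned scalar gain."""

    def __init__(self, scale, eps=1e-5):
        super().__init__()
        self.g = nn.Parameter(torch.tensor(float(scale)))
        self.eps = eps

    def forward(self, x):
        norm = x.norm(dim=-1, keepdim=True).clamp(min=self.eps)
        return self.g * x / norm


class AdaNorm(nn.Module):
    """Standardize, then rescale by the input-adaptive factor C(1 - k*y).

    The factor is detached from the graph, as in the AdaNorm paper, so
    gradients flow only through y itself.
    """

    def __init__(self, k, eps=1e-12, scale=1.0):
        super().__init__()
        self.k = k
        self.eps = eps
        self.scale = scale

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        y = (x - mean) / (std + self.eps)
        return y * (self.scale * (1 - self.k * y)).detach()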
def __init__(self, config):
    super(MyBertAdapterLayer9, self).__init__()
    self.new_attention = MyBertAttention9(config)
    self.new_intermediate = BertIntermediate(config)
    self.new_output = MyBertOutput9(config)
    self.adapter = BertAdapter9(config)
def __init__(self, config):
    super(CaptionBertLayer, self).__init__(config)
    self.attention = CaptionBertAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
class TestBertIntermediate(unittest.TestCase):
    def init_data(self, use_cuda: bool) -> None:
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(1)

        torch.set_grad_enabled(False)
        self.cfg = BertConfig()

        self.torch_intermediate = BertIntermediate(self.cfg)
        if torch.cuda.is_available():
            self.torch_intermediate.to(self.test_device)
        self.torch_intermediate.eval()

        self.turbo_intermediate = turbo_transformers.BertIntermediate.from_torch(
            self.torch_intermediate)

    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda=use_cuda)
        device = "GPU" if use_cuda else "CPU"
        num_iter = 2
        hidden_size = self.cfg.hidden_size
        # batch_size and seq_length are module-level globals, swept across
        # several (batch, seq) pairs by the surrounding test suite.
        input_tensor = torch.rand(size=(batch_size, seq_length, hidden_size),
                                  dtype=torch.float32,
                                  device=self.test_device)

        turbo_model = lambda: self.turbo_intermediate(input_tensor)
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(
            f"BertIntermediate \"({batch_size},{seq_length:03})\" ",
            f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}")

        torch_model = lambda: self.torch_intermediate(input_tensor)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f"BertIntermediate \"({batch_size},{seq_length:03})\" ",
              f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

        torch_result = torch_result.cpu().numpy()
        turbo_result = turbo_result.cpu().numpy()

        self.assertTrue(
            numpy.allclose(torch_result, turbo_result, rtol=1e-4, atol=1e-3))

        # Log both frameworks' QPS side by side.
        with open("bert_intermediate_res.txt", "a") as fh:
            fh.write(
                f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n")

    def test_intermediate(self):
        self.check_torch_and_turbo(use_cuda=False)
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
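# check_torch_and_turbo reads batch_size and seq_length as module-level
# globals. A minimal, hypothetical driver (the original suite sweeps a
# grid of (batch, seq) pairs rather than hard-coding one combination):
batch_size, seq_length = 2, 40  # illustrative values, not from the source

if __name__ == "__main__":
    unittest.main()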
def __init__(self, config):
    super(EntityAwareLayer, self).__init__()
    self.attention = EntityAwareAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
def __init__(self, config):
    super(GramBertLayer, self).__init__()
    # attention + linear + dropout + residual connection + norm
    self.attention = GramBertAttention(config)
    # linear
    self.intermediate = BertIntermediate(config)
    # linear + dropout + residual connection + norm
    self.output = BertOutput(config)