Example #1
    def forward(self, hidden_states, span_mask, position_ids=None):
        seq_length = span_mask.size(1)
        if position_ids is None:
            position_ids = torch.arange(seq_length,
                                        dtype=torch.long,
                                        device=span_mask.device)
            position_ids = position_ids.unsqueeze(0).expand_as(span_mask)
        position_embeddings = self.position_embeddings(
            position_ids)  # bs,sl,hn

        # get span representation: boolean mask, True at positions inside a masked span
        span_mask = (span_mask > -1)  # bs,sl

        # fw_idxs[:, j] = index of the token just before the span containing j
        # (left boundary), computed by a left-to-right scan
        fw_idxs = torch.zeros_like(span_mask, dtype=torch.long)  # bs,sl
        for _idx_col in range(1, span_mask.size()[1]):
            fw_idxs[:, _idx_col] = torch.where(
                span_mask[:, _idx_col] & (~span_mask[:, _idx_col - 1]),
                torch.full_like(fw_idxs[:, _idx_col - 1], _idx_col - 1),
                fw_idxs[:, _idx_col - 1],
            )

        # bw_idxs[:, j] = index of the token just after the span containing j
        # (right boundary), computed by a right-to-left scan
        bw_idxs = torch.full_like(span_mask,
                                  span_mask.size(1) - 1,
                                  dtype=torch.long)  # bs,sl
        for _idx_col in range(span_mask.size(1) - 2, -1, -1):
            bw_idxs[:, _idx_col] = torch.where(
                span_mask[:, _idx_col] & ~span_mask[:, _idx_col + 1],
                torch.full_like(bw_idxs[:, _idx_col + 1], _idx_col + 1),
                bw_idxs[:, _idx_col + 1],
            )

        fw_idxs = fw_idxs.unsqueeze(-1).expand_as(hidden_states)  # bs,sl,hn
        bw_idxs = bw_idxs.unsqueeze(-1).expand_as(hidden_states)  # bs,sl,hn

        fw_hidden_states = torch.gather(hidden_states, 1, fw_idxs)  # bs,sl,hn
        bw_hidden_states = torch.gather(hidden_states, 1, bw_idxs)  # bs,sl,hn

        sbo_rep = torch.cat(
            [fw_hidden_states, bw_hidden_states, position_embeddings], dim=-1)
        sbo_rep = sbo_rep * span_mask.to(dtype=sbo_rep.dtype).unsqueeze(
            -1)  # bs,sl,3*hn

        mid_rep = self.layer_norm1(
            gelu(self.linear1(sbo_rep)).to(torch.float32))
        pre_logits = self.layer_norm2(
            gelu(self.linear2(mid_rep)).to(torch.float32))
        logits = self.decoder(pre_logits) + self.bias
        return logits
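This forward looks like a SpanBERT-style span boundary objective (SBO) head: for every position inside a masked span it gathers the hidden states of the tokens just outside the span's left and right boundaries and concatenates them with a position embedding. Below is a minimal toy check of the two boundary-index scans, assuming span_mask marks in-span positions as True; the batch size, sequence length and mask values are made up, not taken from the original.

import torch

span_mask = torch.tensor([[0, 0, 1, 1, 1, 0, 0, 0]]) > 0  # one span at positions 2..4

fw_idxs = torch.zeros_like(span_mask, dtype=torch.long)
for c in range(1, span_mask.size(1)):
    fw_idxs[:, c] = torch.where(
        span_mask[:, c] & ~span_mask[:, c - 1],
        torch.full_like(fw_idxs[:, c - 1], c - 1),
        fw_idxs[:, c - 1],
    )

bw_idxs = torch.full_like(span_mask, span_mask.size(1) - 1, dtype=torch.long)
for c in range(span_mask.size(1) - 2, -1, -1):
    bw_idxs[:, c] = torch.where(
        span_mask[:, c] & ~span_mask[:, c + 1],
        torch.full_like(bw_idxs[:, c + 1], c + 1),
        bw_idxs[:, c + 1],
    )

print(fw_idxs)  # tensor([[0, 0, 1, 1, 1, 1, 1, 1]])
print(bw_idxs)  # tensor([[5, 5, 5, 5, 5, 7, 7, 7]])
# Inside the span (positions 2..4) the indices point at the boundary tokens 1 and 5;
# values elsewhere are irrelevant because sbo_rep is zeroed outside the span.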
Example #2
    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.dropout(x)
        x = self.layer_norm(x)
        x = self.decoder(x)
        return x
Example #3
    def forward(self, x):
        x = self.dense(x)
        x = gelu(x)
        x = self.layer_norm(x)

        x = self.decoder(x)

        return x
Example #4
    def forward(self, features, weight, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        # x = self.dropout(x)
        x = self.layer_norm(x)
        x = x.matmul(weight.t())

        return x
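The weight argument in this variant lets the head share its output projection with the token embedding matrix (weight tying). A minimal sketch of a container such a forward could live in; the class name, sizes, and the use of F.gelu in place of the local gelu helper are assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TiedLMHead(nn.Module):
    # hypothetical container; attribute names match the forward above
    def __init__(self, hidden_size):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, features, weight, **kwargs):
        x = self.dense(features)
        x = F.gelu(x)
        x = self.layer_norm(x)
        # project onto the vocabulary by reusing the supplied embedding weight
        return x.matmul(weight.t())

# typical call site: pass the token embedding matrix as the tied weight
embedding = nn.Embedding(32000, 768)
head = TiedLMHead(768)
logits = head(torch.randn(2, 5, 768), embedding.weight)  # -> (2, 5, 32000)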
Example #5
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = layer(self.drop_out(x))
            # activation (and optional norm) after every layer except the last
            if i < len(self.layers) - 1:
                x = gelu(x)
                if len(self.norm_layers):
                    x = self.norm_layers[i](x)
        return x
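A sketch of the module this stacked-MLP forward might belong to, assuming the layers are nn.Linear blocks with one optional LayerNorm per hidden layer; the layer sizes and dropout rate are invented for illustration.

import torch
import torch.nn as nn
import torch.nn.functional as F

class StackedMLP(nn.Module):
    # hypothetical container; attribute names match the forward above
    def __init__(self, sizes=(768, 512, 256), dropout=0.1, use_norm=True):
        super().__init__()
        self.layers = nn.ModuleList(
            nn.Linear(d_in, d_out) for d_in, d_out in zip(sizes[:-1], sizes[1:]))
        # one norm per hidden (non-final) layer; may legitimately be empty
        self.norm_layers = nn.ModuleList(
            nn.LayerNorm(d) for d in sizes[1:-1]) if use_norm else nn.ModuleList()
        self.drop_out = nn.Dropout(dropout)

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = layer(self.drop_out(x))
            if i < len(self.layers) - 1:
                x = F.gelu(x)
                if len(self.norm_layers):
                    x = self.norm_layers[i](x)
        return x

out = StackedMLP()(torch.randn(4, 768))  # -> (4, 256)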
Example #6
    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x) + self.bias

        return x
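Examples #2, #3 and #6 are variants of the same masked-LM head pattern: dense projection, gelu, layer norm, then a vocabulary projection (here with a separate bias). A minimal sketch of the surrounding module; the hidden and vocabulary sizes and the exact-erf gelu helper are assumptions, not taken from the original codebases.

import math
import torch
import torch.nn as nn

def gelu(x):
    # exact GELU; some of the codebases above may use a tanh approximation instead
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))

class LMHead(nn.Module):
    # hypothetical container; attribute names match the forward above
    def __init__(self, hidden_size=768, vocab_size=32000):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)
        # project back to size of vocabulary with bias
        return self.decoder(x) + self.bias

logits = LMHead()(torch.randn(2, 5, 768))  # -> (2, 5, 32000)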
Example #7
    def forward(self, x):
        # position-wise feed-forward: linear -> GELU -> dropout -> linear
        return self.w_2(self.dropout(gelu(self.w_1(x))))
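A sketch of the Transformer position-wise feed-forward module this one-liner typically belongs to; d_model, d_ff, the dropout rate, and the use of F.gelu are assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionwiseFeedForward(nn.Module):
    # hypothetical container; attribute names match the forward above
    def __init__(self, d_model=512, d_ff=2048, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # expand -> GELU -> dropout -> project back
        return self.w_2(self.dropout(F.gelu(self.w_1(x))))

y = PositionwiseFeedForward()(torch.randn(2, 10, 512))  # -> (2, 10, 512)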