Example #1
    def __init__(self, bert_pretrained_weights, num_class, kernel_size,
                 kernel_nums):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)

        self.positional_encoding = PositionalEncoding(input_dim=768)
        # self.classifier = CNNClassifier(num_class=num_class,
        #                                 input_dim=768,
        #                                 kernel_nums=kernel_nums,
        #                                 kernel_sizes=kernel_size,
        #                                 max_kernel_size=kernel_size[-1])

        # self.essay_feature_extracter = CNNFeatureExtrater(
        #     input_dim=768,
        #     output_dim=300,
        #     kernel_nums=kernel_nums,
        #     kernel_sizes=kernel_size,
        #     max_kernel_size=kernel_size[-1]
        # )
        # self.prompt_feature_extracter = CNNFeatureExtrater(
        #     input_dim=768,
        #     output_dim=300,
        #     kernel_sizes=[2, 4, 8, 16, 32, 64, 128, 256],
        #     kernel_nums=[64, 64, 64, 64, 64, 64, 64, 64],
        #     max_kernel_size=kernel_size[-1]
        # )
        self.linear_layer = nn.Linear(768 * 2, num_class)

        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='mean')
Example #2
    def __init__(self, bert_pretrained_weights):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)

        self.linear_layer = nn.Linear(768 + 5 + 300, 1)
        self.dropout_layer = nn.Dropout(0.6)
        self.criterion = nn.MSELoss(reduction='sum')

        self.manual_feature_layer = nn.Linear(27, 5)

        self.prompt_global_attention = GlobalAttention(hid_dim=768,
                                                       key_size=768)
        self.prompt_doc_attention = BahdanauAttention(hid_dim=768,
                                                      key_size=768,
                                                      query_size=768)

        self.segment_encoder = RNNEncoder(embedding_dim=768,
                                          hid_dim=150,
                                          num_layers=1,
                                          dropout_rate=0.5)

        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)
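Every example in this listing constructs a PositionalEncoding module, but its definition is not part of the listing. The following is a minimal sinusoidal sketch, assuming the input_dim/max_len keyword names and the [batch size, seq len, input_dim] call convention seen in these snippets; the actual implementations in the source projects may differ.

import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (Vaswani et al., 2017) -- illustrative sketch."""
    def __init__(self, input_dim: int, max_len: int = 5000):
        super().__init__()
        # Precompute a [max_len, input_dim] table of sin/cos values.
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, input_dim, 2).float()
                             * (-math.log(10000.0) / input_dim))
        pe = torch.zeros(max_len, input_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, input_dim]

    def forward(self, x):
        # x: [batch size, seq len, input_dim]
        return x + self.pe[:, :x.size(1)]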
Example #3
    def __init__(self, bert_pretrained_weights):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)

        self.positional_encoding = PositionalEncoding(input_dim=768)

        self.linear_layer = nn.Linear(768 * 2, 1)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.MSELoss(reduction='sum')

        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)
Example #4
    def __init__(self, bert_pretrained_weights, num_class, kernel_nums, kernel_size):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)

        self.doc_feature_extracter = CNNFeatureExtrater(input_dim=768, output_dim=768, kernel_nums=kernel_nums, kernel_sizes=kernel_size)
        self.prompt_feature_extracter = CNNFeatureExtrater(input_dim=768, output_dim=768, kernel_nums=kernel_nums, kernel_sizes=kernel_size)

        self.linear_layer = nn.Linear(768 * 2, num_class)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='sum')

        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)
Example #5
File: lbl.py  Project: huanghonggit/ELMo
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_layers: int,
                 n_highway: int,
                 use_position: bool = False,
                 dropout: float = 0.0):
        super(LBLHighwayBiLmV2, self).__init__()
        self.use_position = use_position
        self.n_layers = n_layers
        self.n_highway = n_highway
        self.dropout = torch.nn.Dropout(p=dropout)

        self.width = width
        self.input_size = input_size
        self.hidden_size = hidden_size

        forward_scores, backward_scores = [], []
        forward_blocks, backward_blocks = [], []

        for _ in range(n_layers):
            forward_scores.append(torch.nn.Parameter(torch.randn(width + 1)))
            backward_scores.append(torch.nn.Parameter(torch.randn(width + 1)))

            forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
            backward_blocks.append(Highway(hidden_size, num_layers=n_highway))

        self.forward_weights = torch.nn.ParameterList(forward_scores)
        self.backward_weights = torch.nn.ParameterList(backward_scores)
        self.forward_blocks = torch.nn.ModuleList(forward_blocks)
        self.backward_blocks = torch.nn.ModuleList(backward_blocks)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)
Example #6
    def __init__(
        self,
        decoding_dim: int,
        target_embedding_dim: int,
        feedforward_hidden_dim: int,
        num_layers: int,
        num_attention_heads: int,
        use_positional_encoding: bool = True,
        positional_encoding_max_steps: int = 5000,
        dropout_prob: float = 0.1,
        residual_dropout_prob: float = 0.2,
        attention_dropout_prob: float = 0.1,
    ) -> None:

        super().__init__(decoding_dim=decoding_dim,
                         target_embedding_dim=target_embedding_dim,
                         decodes_parallel=True)

        attn = MultiHeadedAttention(num_attention_heads, decoding_dim,
                                    attention_dropout_prob)
        feed_forward = PositionwiseFeedForward(decoding_dim,
                                               feedforward_hidden_dim,
                                               dropout_prob)
        self._embed_scale = math.sqrt(decoding_dim)
        self._positional_embedder = PositionalEncoding(decoding_dim,
                                                       positional_encoding_max_steps) \
                                                       if use_positional_encoding else None
        self._dropout = nn.Dropout(dropout_prob)
        self._self_attention = Decoder(
            DecoderLayer(decoding_dim, deepcopy(attn), deepcopy(attn),
                         feed_forward, residual_dropout_prob), num_layers)
Example #7
    def __init__(self, bert_pretrained_weights, num_class):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)

        self.positional_encoding = PositionalEncoding(input_dim=768)

        self.linear_layer = nn.Linear(768 * 2 + 5, num_class)
        self.manual_feature_layer = nn.Linear(27, 5)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='mean')

        self.prompt_global_attention = GlobalAttention(hid_dim=768,
                                                       key_size=768)
        self.doc_global_attention = GlobalAttention(hid_dim=768, key_size=768)

        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)
Example #8
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_heads: int,
                 n_layers: int,
                 n_highway: int,
                 use_position: bool = False,
                 use_relative_position: bool = False,
                 dropout: float = 0.0):
        super(SelfAttentiveLBLBiLMV3, self).__init__()
        self.use_position = use_position
        self.use_relative_position_weights = use_relative_position
        self.n_layers = n_layers
        self.n_highway = n_highway
        self.n_heads = n_heads
        self.input_size = input_size
        self.width = width
        self.hidden_size = hidden_size

        forward_attns, backward_attns = [], []
        forward_blocks, backward_blocks = [], []

        for _ in range(n_layers):
            if self.use_relative_position_weights:
                forward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                    n_heads,
                    hidden_size,
                    width=width + 1,
                    left_to_right=True,
                    dropout=dropout)
                backward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                    n_heads,
                    hidden_size,
                    width=width + 1,
                    left_to_right=False,
                    dropout=dropout)
            else:
                forward_attn = MultiHeadedAttention(n_heads, hidden_size,
                                                    dropout)
                backward_attn = MultiHeadedAttention(n_heads, hidden_size,
                                                     dropout)

            forward_attns.append(forward_attn)
            backward_attns.append(backward_attn)
            forward_blocks.append(Highway(hidden_size, n_highway))
            backward_blocks.append(Highway(hidden_size, n_highway))

        self.forward_attns = torch.nn.ModuleList(forward_attns)
        self.backward_attns = torch.nn.ModuleList(backward_attns)

        self.forward_blocks = torch.nn.ModuleList(forward_blocks)
        self.backward_blocks = torch.nn.ModuleList(backward_blocks)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)
Example #9
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_layers: int,
                 n_highway: int,
                 use_position: bool = False,
                 dropout: float = 0.0):
        super(Bengio03HighwayBiLmV2, self).__init__()
        self.use_position = use_position
        self.n_layers = n_layers
        self.n_highway = n_highway

        self.dropout = torch.nn.Dropout(p=dropout)
        self.activation = torch.nn.ReLU()

        self.width = width
        self.input_size = input_size
        self.context_input_size = input_size * (width + 1)
        self.hidden_size = hidden_size

        self.forward_paddings = torch.nn.ModuleList([
            torch.nn.ConstantPad2d((0, 0, length, 0), 0)
            for length in range(width + 1)
        ])
        self.backward_paddings = torch.nn.ModuleList([
            torch.nn.ConstantPad2d((0, 0, 0, length), 0)
            for length in range(width + 1)
        ])

        forward_blocks = []
        backward_blocks = []
        for layer_index in range(self.n_layers):
            forward_layer = torch.nn.ModuleList([
                torch.nn.Linear(input_size, hidden_size, bias=False)
                for _ in range(width + 1)
            ])
            backward_layer = torch.nn.ModuleList([
                torch.nn.Linear(input_size, hidden_size, bias=False)
                for _ in range(width + 1)
            ])
            self.add_module('forward_layer_{}'.format(layer_index),
                            forward_layer)
            self.add_module('backward_layer_{}'.format(layer_index),
                            backward_layer)

            forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
            backward_blocks.append(Highway(hidden_size, num_layers=n_highway))

        self.forward_blocks = torch.nn.ModuleList(forward_blocks)
        self.backward_blocks = torch.nn.ModuleList(backward_blocks)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)
Example #10
File: lbl.py  Project: huanghonggit/ELMo
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_layers: int,
                 use_position: bool = False,
                 dropout: float = 0.0):
        super(LBLResNetBiLm, self).__init__()
        self.use_position = use_position

        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.ReLU()

        self.width = width
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        forward_paddings, backward_paddings = [], []
        forward_weights, backward_weights = [], []
        for _ in range(self.n_layers):
            forward_paddings.append(
                torch.nn.Parameter(
                    torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
            backward_paddings.append(
                torch.nn.Parameter(
                    torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
            forward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))
            backward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))

        self.forward_paddings = torch.nn.ParameterList(forward_paddings)
        self.backward_paddings = torch.nn.ParameterList(backward_paddings)
        # Wrap the per-layer weight vectors in ParameterList so they are registered as parameters.
        self.forward_weights = torch.nn.ParameterList(forward_weights)
        self.backward_weights = torch.nn.ParameterList(backward_weights)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)

        self.forward_linears = torch.nn.ModuleList([
            PositionwiseFeedForward(hidden_size, hidden_size, dropout)
            for _ in range(n_layers)
        ])
        self.backward_linears = torch.nn.ModuleList([
            PositionwiseFeedForward(hidden_size, hidden_size, dropout)
            for _ in range(n_layers)
        ])

        self.forward_blocks = torch.nn.ModuleList([
            SublayerConnection(hidden_size, dropout) for _ in range(n_layers)
        ])
        self.backward_blocks = torch.nn.ModuleList([
            SublayerConnection(hidden_size, dropout) for _ in range(n_layers)
        ])
Example #11
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_layers: int,
                 n_highway: int,
                 use_position: bool = False,
                 dropout: float = 0.0):
        super(Bengio03HighwayBiLm, self).__init__()
        self.use_position = use_position
        self.n_layers = n_layers
        self.n_highway = n_highway

        self.dropout = torch.nn.Dropout(p=dropout)
        self.activation = torch.nn.ReLU()

        self.width = width
        self.input_size = input_size
        self.context_input_size = input_size * (width + 1)
        self.hidden_size = hidden_size

        forward_paddings, backward_paddings = [], []
        forward_blocks, backward_blocks = [], []
        forward_projects, backward_projects = [], []
        for i in range(n_layers):
            forward_paddings.append(
                torch.nn.Parameter(torch.randn(width, hidden_size)))
            backward_paddings.append(
                torch.nn.Parameter(torch.randn(width, hidden_size)))

            forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
            backward_blocks.append(Highway(hidden_size, num_layers=n_highway))

            forward_projects.append(
                torch.nn.Linear(self.context_input_size, hidden_size))
            backward_projects.append(
                torch.nn.Linear(self.context_input_size, hidden_size))

        self.forward_projects = torch.nn.ModuleList(forward_projects)
        self.backward_projects = torch.nn.ModuleList(backward_projects)
        self.forward_paddings = torch.nn.ParameterList(forward_paddings)
        self.backward_paddings = torch.nn.ParameterList(backward_paddings)
        self.forward_blocks = torch.nn.ModuleList(forward_blocks)
        self.backward_blocks = torch.nn.ModuleList(backward_blocks)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)

        self.reset_parameters()
Example #12
    def __init__(
        self,
        num_layers: int,
        decoding_dim: int,
        target_embedding_dim: int,
        feedforward_hidden_dim: int,
        num_attention_heads: int,
        combiner: TransformerCombiner,
        num_sources: int,
        use_positional_encoding: bool = True,
        positional_encoding_max_steps: int = 5000,
        dropout_prob: float = 0.1,
        residual_dropout_prob: float = 0.2,
        attention_dropout_prob: float = 0.2,
    ) -> None:
        super().__init__(decoding_dim,
                         target_embedding_dim,
                         decodes_parallel=True)

        self._decoding_dim = decoding_dim
        self._embed_scale = math.sqrt(decoding_dim)

        self._positional_embedder = (PositionalEncoding(
            input_dim=decoding_dim, max_len=positional_encoding_max_steps)
                                     if use_positional_encoding else None)
        self._dropout = nn.Dropout(dropout_prob)

        generic_attn = MultiHeadedAttention(num_attention_heads, decoding_dim,
                                            attention_dropout_prob)
        combined_attn = AttentionCombiner(num_sources, generic_attn, combiner)
        feed_forward = PositionwiseFeedForward(decoding_dim,
                                               feedforward_hidden_dim,
                                               dropout_prob)

        layer = DecoderLayer(size=decoding_dim,
                             self_attn=deepcopy(generic_attn),
                             src_attn=deepcopy(combined_attn),
                             feed_forward=feed_forward,
                             dropout=residual_dropout_prob)

        self._self_attention_layers = _clones(layer, num_layers)
        self.norm = nn.LayerNorm(layer.size)
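The _clones helper used above is not shown in this listing. In annotated-Transformer-style codebases it is usually a deepcopy-based ModuleList factory; a minimal sketch under that assumption:

import copy
import torch.nn as nn

def _clones(module: nn.Module, n: int) -> nn.ModuleList:
    # n independent deep copies of a layer, registered as submodules.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])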
Example #13
class BertGlobalAttentionClassifier(nn.Module):
    def __init__(self, bert_pretrained_weights, num_class):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)

        self.positional_encoding = PositionalEncoding(input_dim=768)

        self.linear_layer = nn.Linear(768 * 2 + 5, num_class)
        self.manual_feature_layer = nn.Linear(27, 5)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='mean')

        self.prompt_global_attention = GlobalAttention(hid_dim=768,
                                                       key_size=768)
        self.doc_global_attention = GlobalAttention(hid_dim=768, key_size=768)

        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)

    def forward(self,
                inputs,
                mask,
                sent_counts,
                sent_lens,
                prompt_inputs,
                prompt_mask,
                prompt_sent_counts,
                prompt_sent_lens,
                manual_feature,
                label=None):
        """

        :param prompt_sent_lens:
        :param prompt_sent_counts:
        :param prompt_inputs:
        :param prompt_mask:
        :param inputs:  [batch size, max sent count, max sent len]
        :param mask:    [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])

        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs,
                                       attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size,
                                                     max_sent_count,
                                                     max_sent_length, -1)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]

        docs = []
        lens = []
        for i in range(0, batch_size):
            doc = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]

            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)

            # mean for a doc
            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)

            lens.append(doc_vec.shape[1])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)

        batch_max_len = max(lens)
        for i, doc in enumerate(docs):
            if doc.shape[1] < batch_max_len:
                pd = (0, 0, 0, batch_max_len - doc.shape[1])
                m = nn.ConstantPad2d(pd, 0)
                doc = m(doc)

            docs[i] = doc

        # [batch size, bert embedding dim]
        docs = torch.cat(docs, 0)
        docs_mask = get_mask_from_sequence_lengths(
            torch.tensor(lens), max_length=batch_max_len).to(docs.device)

        prompt = []
        for j in range(prompt_sent_counts):
            length = prompt_sent_lens[0][j]
            sent = prompt_hidden_states[j, :length, :]
            prompt.append(sent)

        prompt_vec = torch.cat(prompt, dim=0).unsqueeze(0)
        prompt_vec = self.positional_encoding.forward(prompt_vec)
        prompt_len = prompt_vec.shape[1]
        prompt_attention_mask = get_mask_from_sequence_lengths(
            torch.tensor([prompt_len]),
            max_length=prompt_len).to(prompt_vec.device)
        # [1, seq len]
        prompt_vec_weights = self.prompt_global_attention(
            prompt_vec, prompt_attention_mask)
        # [1, bert hidden size]
        prompt_vec = torch.bmm(prompt_vec_weights.unsqueeze(1),
                               prompt_vec).squeeze(1)

        doc_weights = self.doc_global_attention(docs, docs_mask)
        doc_vec = torch.bmm(doc_weights.unsqueeze(1), docs).squeeze(1)

        doc_feature = self.dropout_layer(torch.tanh(doc_vec))
        prompt_feature = self.dropout_layer(
            torch.tanh(prompt_vec.expand_as(doc_feature)))
        feature = torch.cat([doc_feature, prompt_feature], dim=-1)

        log_probs = torch.log_softmax(self.linear_layer(feature), dim=-1)

        # log_probs = self.classifier(docs)
        if label is not None:
            loss = self.criterion(input=log_probs.contiguous().view(
                -1, log_probs.shape[-1]),
                                  target=label.contiguous().view(-1))
        else:
            loss = None

        prediction = torch.max(log_probs, dim=1)[1]
        return {'loss': loss, 'prediction': prediction}
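The per-document padding loop in the forward above (nn.ConstantPad2d plus torch.cat) can equivalently be written with torch.nn.utils.rnn.pad_sequence. A sketch under the same assumptions (docs is a list of [1, doc len, 768] tensors, lens holds their true lengths, and get_mask_from_sequence_lengths is the helper already used above):

from torch.nn.utils.rnn import pad_sequence

# pad_sequence pads along the first dimension, so drop the leading batch dim first.
docs = pad_sequence([d.squeeze(0) for d in docs],
                    batch_first=True)  # [batch size, batch max len, 768]
docs_mask = get_mask_from_sequence_lengths(
    torch.tensor(lens), max_length=docs.shape[1]).to(docs.device)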
Example #14
class BertClassifier(nn.Module):
    def __init__(self, bert_pretrained_weights, num_class, kernel_size,
                 kernel_nums):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)

        self.positional_encoding = PositionalEncoding(input_dim=768)
        # self.classifier = CNNClassifier(num_class=num_class,
        #                                 input_dim=768,
        #                                 kernel_nums=kernel_nums,
        #                                 kernel_sizes=kernel_size,
        #                                 max_kernel_size=kernel_size[-1])

        # self.essay_feature_extracter = CNNFeatureExtrater(
        #     input_dim=768,
        #     output_dim=300,
        #     kernel_nums=kernel_nums,
        #     kernel_sizes=kernel_size,
        #     max_kernel_size=kernel_size[-1]
        # )
        # self.prompt_feature_extracter = CNNFeatureExtrater(
        #     input_dim=768,
        #     output_dim=300,
        #     kernel_sizes=[2, 4, 8, 16, 32, 64, 128, 256],
        #     kernel_nums=[64, 64, 64, 64, 64, 64, 64, 64],
        #     max_kernel_size=kernel_size[-1]
        # )
        self.linear_layer = nn.Linear(768 * 2, num_class)

        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='mean')

    def forward(self,
                inputs,
                mask,
                sent_counts,
                sent_lens,
                prompt_inputs,
                prompt_mask,
                prompt_sent_counts,
                prompt_sent_lens,
                label=None):
        """

        :param prompt_sent_lens:
        :param prompt_sent_counts:
        :param prompt_inputs:
        :param prompt_mask:
        :param inputs:  [batch size, max sent count, max sent len]
        :param mask:    [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])

        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs,
                                       attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size,
                                                     max_sent_count,
                                                     max_sent_length, -1)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]

        docs = []
        lens = []
        for i in range(0, batch_size):
            doc = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]

            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)

            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)
            doc_vec = torch.mean(doc_vec, dim=1)

            lens.append(doc_vec.shape[0])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)

        # batch_max_len = max(lens)
        # for i, doc in enumerate(docs):
        #     if doc.shape[0] < batch_max_len:
        #         pd = (0, 0, 0, batch_max_len - doc.shape[0])
        #         m = nn.ConstantPad2d(pd, 0)
        #         doc = m(doc)
        #
        #     docs[i] = doc.unsqueeze(0)

        docs = torch.cat(docs, 0)
        # print(docs.shape)
        # docs = self.positional_encoding.forward(docs)
        # [batch size, num_class]

        prompt = []
        for j in range(prompt_sent_counts):
            length = prompt_sent_lens[0][j]
            sent = prompt_hidden_states[j, :length, :]
            prompt.append(sent)

        prompt_vec = torch.cat(prompt, dim=0).unsqueeze(0)
        prompt_vec = self.positional_encoding.forward(prompt_vec)
        prompt_vec = torch.mean(prompt_vec, dim=1)

        # [batch size, feature size]
        # doc_feature = self.essay_feature_extracter(docs)
        # prompt_feature = self.prompt_feature_extracter(prompt_vec)
        # prompt_feature = prompt_feature.expand_as(doc_feature)

        doc_feature = self.dropout_layer(torch.tanh(docs))
        prompt_feature = self.dropout_layer(
            torch.tanh(prompt_vec.expand_as(doc_feature)))

        feature = torch.cat([doc_feature, prompt_feature], dim=-1)
        log_probs = torch.log_softmax(self.linear_layer(feature), dim=-1)

        # log_probs = self.classifier(docs)
        if label is not None:
            loss = self.criterion(input=log_probs.contiguous().view(
                -1, log_probs.shape[-1]),
                                  target=label.contiguous().view(-1))
        else:
            loss = None

        prediction = torch.max(log_probs, dim=1)[1]
        return {'loss': loss, 'prediction': prediction}
Example #15
class MixBertRecurrentAttentionRegressor(nn.Module):
    def __init__(self, bert_pretrained_weights):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)

        self.linear_layer = nn.Linear(768 + 5 + 300, 1)
        self.dropout_layer = nn.Dropout(0.6)
        self.criterion = nn.MSELoss(reduction='sum')

        self.manual_feature_layer = nn.Linear(27, 5)

        self.prompt_global_attention = GlobalAttention(hid_dim=768,
                                                       key_size=768)
        self.prompt_doc_attention = BahdanauAttention(hid_dim=768,
                                                      key_size=768,
                                                      query_size=768)

        self.segment_encoder = RNNEncoder(embedding_dim=768,
                                          hid_dim=150,
                                          num_layers=1,
                                          dropout_rate=0.5)

        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)

    def forward(self,
                inputs,
                mask,
                sent_counts,
                sent_lens,
                prompt_inputs,
                prompt_mask,
                prompt_sent_counts,
                prompt_sent_lens,
                min_score,
                max_score,
                manual_feature,
                label=None):
        """

        :param manual_feature: [batch size]
        :param max_score: [batch size]
        :param min_score: [batch size]
        :param prompt_sent_lens: [batch size, max sent count]
        :param prompt_sent_counts: [batch size]
        :param prompt_inputs:   [batch size, max sent count, max sent len]
        :param prompt_mask: [batch size, max sent count, max sent len]
        :param inputs:  [batch size, max sent count, max sent len]
        :param mask:    [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]

        max_prompt_sent_count = prompt_inputs.shape[1]
        max_prompt_sent_length = prompt_inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])

        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs,
                                       attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size,
                                                     max_sent_count,
                                                     max_sent_length, -1)
        last_hidden_states = self.dropout_layer(last_hidden_states)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]
        prompt_hidden_states = prompt_hidden_states.view(
            batch_size, max_prompt_sent_count, max_prompt_sent_length, -1)
        prompt_hidden_states = self.dropout_layer(prompt_hidden_states)

        docs = []
        lens = []
        doc_segments = []
        for i in range(0, batch_size):
            doc = []
            doc_segment = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]

            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                mean_cur_sent = torch.mean(cur_sent, dim=0)
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)
                doc_segment.append(mean_cur_sent.unsqueeze(0))

            # [1, len, hid size]
            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)

            lens.append(doc_vec.shape[1])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)
            doc_segments.append(doc_segment)

        batch_max_len = max(lens)
        for i, doc in enumerate(docs):
            if doc.shape[1] < batch_max_len:
                pd = (0, 0, 0, batch_max_len - doc.shape[1])
                m = nn.ConstantPad2d(pd, 0)
                doc = m(doc)

            docs[i] = doc

        # [batch size, bert embedding dim]
        docs = torch.cat(docs, 0)
        docs_mask = get_mask_from_sequence_lengths(
            torch.tensor(lens), max_length=batch_max_len).to(docs.device)
        # print('lens ', lens)
        # print('docs shape', docs.shape)

        prompt_docs = []
        prompt_lens = []
        for i in range(0, batch_size):
            prompt_doc = []
            prompt_sent_count = prompt_sent_counts[i]
            prompt_sent_len = prompt_sent_lens[i]

            for j in range(prompt_sent_count):
                length = prompt_sent_len[j]
                cur_sent = prompt_hidden_states[i, j, :length, :]
                prompt_doc.append(cur_sent)

            prompt_doc_vec = torch.cat(prompt_doc, dim=0).unsqueeze(0)
            prompt_doc_vec = self.positional_encoding.forward(prompt_doc_vec)

            prompt_lens.append(prompt_doc_vec.shape[1])
            prompt_docs.append(prompt_doc_vec)

        prompt_batch_max_len = max(prompt_lens)
        for i, doc in enumerate(prompt_docs):
            if doc.shape[1] < prompt_batch_max_len:
                pd = (0, 0, 0, prompt_batch_max_len - doc.shape[1])
                m = nn.ConstantPad2d(pd, 0)
                doc = m(doc)

            prompt_docs[i] = doc

        prompt_docs = torch.cat(prompt_docs, 0)
        prompt_attention_mask = get_mask_from_sequence_lengths(
            torch.tensor(prompt_lens),
            max_length=prompt_batch_max_len).to(docs.device)
        # [batch size, max seq len]
        prompt_vec_weights = self.prompt_global_attention(
            prompt_docs, prompt_attention_mask)

        # [batch size, bert hidden size]
        prompt_vec = torch.bmm(prompt_vec_weights.unsqueeze(1),
                               prompt_docs).squeeze(1)
        # print('prompt len', prompt_len)

        doc_weights = self.prompt_doc_attention(query=prompt_vec,
                                                key=docs,
                                                mask=docs_mask)
        doc_vec = torch.bmm(doc_weights.unsqueeze(1), docs).squeeze(1)
        doc_feature = self.dropout_layer(torch.tanh(doc_vec))
        manual_feature = torch.tanh(
            self.manual_feature_layer(self.dropout_layer(manual_feature)))

        # rnn segments encoder
        sorted_index = sorted(range(len(sent_counts)),
                              key=lambda i: sent_counts[i],
                              reverse=True)
        max_count = max_sent_count
        for idx, doc in enumerate(doc_segments):
            for i in range(max_count - len(doc)):
                doc.append(torch.zeros_like(doc[0]))
            doc_segments[idx] = torch.cat(doc, dim=0).unsqueeze(0)
        doc_segments = torch.cat(doc_segments, dim=0)

        sorted_doc_segments = doc_segments[sorted_index]
        sorted_batch_counts = sent_counts[sorted_index]
        final_hidden_states = self.segment_encoder(
            sorted_doc_segments, sorted_batch_counts)['final_hidden_states']
        final_hidden_states[sorted_index] = final_hidden_states
        final_hidden_states = torch.tanh(final_hidden_states)
        final_hidden_states = self.dropout_layer(final_hidden_states)

        # feature = self.dropout_layer(torch.tanh(doc_vec))
        # prompt_feature = self.dropout_layer(torch.tanh(prompt_vec.expand_as(doc_feature)))
        feature = torch.cat([doc_feature, manual_feature, final_hidden_states],
                            dim=-1)

        grade = self.linear_layer(feature)
        if label is not None:
            # print('label ', label)
            # print('min score ', min_score)
            # print('max score ', max_score)
            # grade = grade * (max_score - min_score) + min_score
            label = (label.type_as(grade) - min_score.type_as(grade)) / (
                max_score.type_as(grade) - min_score.type_as(grade))
            loss = self.criterion(
                input=grade.contiguous().view(-1),
                target=label.type_as(grade).contiguous().view(-1))
        else:
            loss = None

        prediction = grade * (max_score.type_as(grade) - min_score.type_as(
            grade)) + min_score.type_as(grade)
        return {'loss': loss, 'prediction': prediction}
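The regressor above trains on labels normalized to [0, 1] and maps predictions back to the original score range. A tiny numeric illustration of that min-max rescaling, with made-up score bounds:

# Illustrative numbers only: an essay scored 8 on a 2..12 prompt.
min_score, max_score, label = 2.0, 12.0, 8.0
normalized_label = (label - min_score) / (max_score - min_score)  # 0.6, the regression target
prediction = 0.6 * (max_score - min_score) + min_score            # 8.0, mapped back to the score range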
Example #16
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_heads: int,
                 n_layers: int,
                 n_highway: int,
                 use_position: bool = False,
                 use_relative_position: bool = False,
                 dropout: float = 0.0):
        super(SelfAttentiveLBLBiLM, self).__init__()
        self.use_position = use_position
        self.use_relative_position_weights = use_relative_position
        self.n_layers = n_layers
        self.n_highway = n_highway
        self.n_heads = n_heads
        self.input_size = input_size
        self.width = width
        self.hidden_size = hidden_size

        forward_attns, backward_attns = [], []
        forward_paddings, backward_paddings = [], []
        forward_blocks, backward_blocks = [], []
        forward_weights, backward_weights = [], []

        for _ in range(n_layers):
            forward_attns.append(
                MultiHeadedAttention(n_heads, hidden_size, dropout))
            backward_attns.append(
                MultiHeadedAttention(n_heads, hidden_size, dropout))

            forward_paddings.append(
                torch.nn.Parameter(
                    torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
            backward_paddings.append(
                torch.nn.Parameter(
                    torch.randn(width, hidden_size) / np.sqrt(hidden_size)))

            forward_blocks.append(Highway(hidden_size, n_highway))
            backward_blocks.append(Highway(hidden_size, n_highway))

            if self.use_relative_position_weights:
                forward_weights.append(
                    torch.nn.Parameter(torch.randn(width + 1)))
                backward_weights.append(
                    torch.nn.Parameter(torch.randn(width + 1)))

        self.forward_attns = torch.nn.ModuleList(forward_attns)
        self.backward_attns = torch.nn.ModuleList(backward_attns)

        self.forward_paddings = torch.nn.ParameterList(forward_paddings)
        self.backward_paddings = torch.nn.ParameterList(backward_paddings)

        self.forward_blocks = torch.nn.ModuleList(forward_blocks)
        self.backward_blocks = torch.nn.ModuleList(backward_blocks)

        if self.use_relative_position_weights:
            self.forward_weights = torch.nn.ParameterList(forward_weights)
            self.backward_weights = torch.nn.ParameterList(backward_weights)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)
Example #17
class BertSimpleClassifier(nn.Module):
    def __init__(self, bert_pretrained_weights, num_class):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)

        # self.linear_doc = nn.Linear(768, 768)
        # self.linear_prompt = nn.Linear(768, 768)

        self.linear_layer = nn.Linear(768 * 2, num_class)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='sum')

        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)

    def forward(self,
                inputs,
                mask,
                sent_counts,
                sent_lens,
                prompt_inputs,
                prompt_mask,
                prompt_sent_counts,
                prompt_sent_lens,
                label=None):
        """

        :param prompt_sent_lens:
        :param prompt_sent_counts:
        :param prompt_inputs:
        :param prompt_mask:
        :param inputs:  [batch size, max sent count, max sent len]
        :param mask:    [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])

        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs,
                                       attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size,
                                                     max_sent_count,
                                                     max_sent_length, -1)
        last_hidden_states = self.dropout_layer(last_hidden_states)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]
        prompt_hidden_states = self.dropout_layer(prompt_hidden_states)

        docs = []
        lens = []
        for i in range(0, batch_size):
            doc = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]

            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)

            # mean for a doc
            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)
            doc_vec = torch.mean(doc_vec, dim=1)

            lens.append(doc_vec.shape[0])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)

        # [batch size, bert embedding dim]
        docs = torch.cat(docs, 0)

        prompt = []
        for j in range(prompt_sent_counts):
            length = prompt_sent_lens[0][j]
            sent = prompt_hidden_states[j, :length, :]
            prompt.append(sent)

        prompt_vec = torch.cat(prompt, dim=0).unsqueeze(0)
        prompt_vec = self.positional_encoding.forward(prompt_vec)
        # mean [1, bert embedding dim]
        prompt_vec = torch.mean(prompt_vec, dim=1)
        # prompt_vec = self.linear_prompt(prompt_vec)

        doc_feature = docs
        prompt_feature = prompt_vec.expand_as(doc_feature)

        feature = torch.cat([doc_feature, prompt_feature], dim=-1)
        log_probs = torch.log_softmax(torch.tanh(self.linear_layer(feature)),
                                      dim=-1)

        # log_probs = self.classifier(docs)
        if label is not None:
            loss = self.criterion(input=log_probs.contiguous().view(
                -1, log_probs.shape[-1]),
                                  target=label.contiguous().view(-1))
        else:
            loss = None

        prediction = torch.max(log_probs, dim=1)[1]
        return {'loss': loss, 'prediction': prediction}
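A toy smoke-test sketch for the classifier above, assuming BertSimpleClassifier and its dependencies are importable from the surrounding module. The checkpoint name 'bert-base-uncased', all tensor values, and the sentence counts/lengths are illustrative only; shapes follow the docstring of forward.

import torch

model = BertSimpleClassifier('bert-base-uncased', num_class=4)
batch_size, max_sent_count, max_sent_len = 2, 3, 16
inputs = torch.randint(100, 30000, (batch_size, max_sent_count, max_sent_len))
mask = torch.ones(batch_size, max_sent_count, max_sent_len, dtype=torch.long)
sent_counts = torch.tensor([3, 2])                    # sentences per essay
sent_lens = torch.tensor([[16, 12, 10], [16, 8, 0]])  # tokens per sentence
prompt_inputs = torch.randint(100, 30000, (1, 2, max_sent_len))
prompt_mask = torch.ones(1, 2, max_sent_len, dtype=torch.long)
output = model(inputs, mask, sent_counts, sent_lens,
               prompt_inputs, prompt_mask,
               prompt_sent_counts=2, prompt_sent_lens=[[16, 10]],
               label=torch.tensor([1, 3]))
print(output['loss'], output['prediction'])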