def __init__(self, bert_pretrained_weights):
    super().__init__()
    self.bert = BertModel.from_pretrained(bert_pretrained_weights)
    self.positional_encoding = PositionalEncoding(input_dim=768)
    self.linear_layer = nn.Linear(768 * 2, 1)
    self.dropout_layer = nn.Dropout(0.5)
    self.criterion = nn.MSELoss(reduction='sum')
    nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
    nn.init.zeros_(self.linear_layer.bias.data)
def __init__(self, bert_pretrained_weights, num_class, kernel_nums, kernel_size):
    super().__init__()
    self.bert = BertModel.from_pretrained(bert_pretrained_weights)
    self.positional_encoding = PositionalEncoding(input_dim=768)
    self.doc_feature_extracter = CNNFeatureExtrater(input_dim=768,
                                                    output_dim=768,
                                                    kernel_nums=kernel_nums,
                                                    kernel_sizes=kernel_size)
    self.prompt_feature_extracter = CNNFeatureExtrater(input_dim=768,
                                                       output_dim=768,
                                                       kernel_nums=kernel_nums,
                                                       kernel_sizes=kernel_size)
    self.linear_layer = nn.Linear(768 * 2, num_class)
    self.dropout_layer = nn.Dropout(0.5)
    self.criterion = nn.NLLLoss(reduction='sum')
    nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
    nn.init.zeros_(self.linear_layer.bias.data)
def __init__(self, width: int, input_size: int, hidden_size: int, n_layers: int,
             n_highway: int, use_position: bool = False, dropout: float = 0.0):
    super(LBLHighwayBiLmV2, self).__init__()
    self.use_position = use_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.dropout = torch.nn.Dropout(p=dropout)
    self.width = width
    self.input_size = input_size
    self.hidden_size = hidden_size

    forward_scores, backward_scores = [], []
    forward_blocks, backward_blocks = [], []
    for _ in range(n_layers):
        forward_scores.append(torch.nn.Parameter(torch.randn(width + 1)))
        backward_scores.append(torch.nn.Parameter(torch.randn(width + 1)))
        forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        backward_blocks.append(Highway(hidden_size, num_layers=n_highway))

    self.forward_weights = torch.nn.ParameterList(forward_scores)
    self.backward_weights = torch.nn.ParameterList(backward_scores)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)
def __init__(
    self,
    decoding_dim: int,
    target_embedding_dim: int,
    feedforward_hidden_dim: int,
    num_layers: int,
    num_attention_heads: int,
    use_positional_encoding: bool = True,
    positional_encoding_max_steps: int = 5000,
    dropout_prob: float = 0.1,
    residual_dropout_prob: float = 0.2,
    attention_dropout_prob: float = 0.1,
) -> None:
    super().__init__(decoding_dim=decoding_dim,
                     target_embedding_dim=target_embedding_dim,
                     decodes_parallel=True)

    attn = MultiHeadedAttention(num_attention_heads, decoding_dim,
                                attention_dropout_prob)
    feed_forward = PositionwiseFeedForward(decoding_dim, feedforward_hidden_dim,
                                           dropout_prob)
    self._embed_scale = math.sqrt(decoding_dim)
    self._positional_embedder = (
        PositionalEncoding(decoding_dim, positional_encoding_max_steps)
        if use_positional_encoding else None)
    self._dropout = nn.Dropout(dropout_prob)
    self._self_attention = Decoder(
        DecoderLayer(decoding_dim, deepcopy(attn), deepcopy(attn), feed_forward,
                     residual_dropout_prob),
        num_layers)
def __init__(self, width: int, input_size: int, hidden_size: int, n_heads: int,
             n_layers: int, n_highway: int, use_position: bool = False,
             use_relative_position: bool = False, dropout: float = 0.0):
    super(SelfAttentiveLBLBiLMV3, self).__init__()
    self.use_position = use_position
    self.use_relative_position_weights = use_relative_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.n_heads = n_heads
    self.input_size = input_size
    self.width = width
    self.hidden_size = hidden_size

    forward_attns, backward_attns = [], []
    forward_blocks, backward_blocks = [], []
    for _ in range(n_layers):
        if self.use_relative_position_weights:
            forward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                n_heads, hidden_size, width=width + 1, left_to_right=True,
                dropout=dropout)
            backward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                n_heads, hidden_size, width=width + 1, left_to_right=False,
                dropout=dropout)
        else:
            forward_attn = MultiHeadedAttention(n_heads, hidden_size, dropout)
            backward_attn = MultiHeadedAttention(n_heads, hidden_size, dropout)

        forward_attns.append(forward_attn)
        backward_attns.append(backward_attn)
        forward_blocks.append(Highway(hidden_size, n_highway))
        backward_blocks.append(Highway(hidden_size, n_highway))

    self.forward_attns = torch.nn.ModuleList(forward_attns)
    self.backward_attns = torch.nn.ModuleList(backward_attns)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)
def __init__(self, width: int, input_size: int, hidden_size: int, n_layers: int,
             n_highway: int, use_position: bool = False, dropout: float = 0.0):
    super(Bengio03HighwayBiLmV2, self).__init__()
    self.use_position = use_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.dropout = torch.nn.Dropout(p=dropout)
    self.activation = torch.nn.ReLU()
    self.width = width
    self.input_size = input_size
    self.context_input_size = input_size * (width + 1)
    self.hidden_size = hidden_size

    self.forward_paddings = torch.nn.ModuleList([
        torch.nn.ConstantPad2d((0, 0, length, 0), 0)
        for length in range(width + 1)
    ])
    self.backward_paddings = torch.nn.ModuleList([
        torch.nn.ConstantPad2d((0, 0, 0, length), 0)
        for length in range(width + 1)
    ])

    forward_blocks = []
    backward_blocks = []
    for layer_index in range(self.n_layers):
        forward_layer = torch.nn.ModuleList([
            torch.nn.Linear(input_size, hidden_size, bias=False)
            for _ in range(width + 1)
        ])
        backward_layer = torch.nn.ModuleList([
            torch.nn.Linear(input_size, hidden_size, bias=False)
            for _ in range(width + 1)
        ])
        self.add_module('forward_layer_{}'.format(layer_index), forward_layer)
        self.add_module('backward_layer_{}'.format(layer_index), backward_layer)
        forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        backward_blocks.append(Highway(hidden_size, num_layers=n_highway))

    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)
def __init__(self, width: int, input_size: int, hidden_size: int, n_layers: int,
             use_position: bool = False, dropout: float = 0.0):
    super(LBLResNetBiLm, self).__init__()
    self.use_position = use_position
    self.dropout = torch.nn.Dropout(dropout)
    self.activation = torch.nn.ReLU()
    self.width = width
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers

    forward_paddings, backward_paddings = [], []
    forward_weights, backward_weights = [], []
    for _ in range(self.n_layers):
        forward_paddings.append(
            torch.nn.Parameter(
                torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
        backward_paddings.append(
            torch.nn.Parameter(
                torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
        forward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))
        backward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))

    self.forward_paddings = torch.nn.ParameterList(forward_paddings)
    self.backward_paddings = torch.nn.ParameterList(backward_paddings)
    # forward_weights / backward_weights are Python lists of per-layer Parameters,
    # so they must be registered with ParameterList rather than Parameter.
    self.forward_weights = torch.nn.ParameterList(forward_weights)
    self.backward_weights = torch.nn.ParameterList(backward_weights)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)

    self.forward_linears = torch.nn.ModuleList([
        PositionwiseFeedForward(hidden_size, hidden_size, dropout)
        for _ in range(n_layers)
    ])
    self.backward_linears = torch.nn.ModuleList([
        PositionwiseFeedForward(hidden_size, hidden_size, dropout)
        for _ in range(n_layers)
    ])
    self.forward_blocks = torch.nn.ModuleList([
        SublayerConnection(hidden_size, dropout) for _ in range(n_layers)
    ])
    self.backward_blocks = torch.nn.ModuleList([
        SublayerConnection(hidden_size, dropout) for _ in range(n_layers)
    ])
def __init__(self, width: int, input_size: int, hidden_size: int, n_layers: int,
             n_highway: int, use_position: bool = False, dropout: float = 0.0):
    super(Bengio03HighwayBiLm, self).__init__()
    self.use_position = use_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.dropout = torch.nn.Dropout(p=dropout)
    self.activation = torch.nn.ReLU()
    self.width = width
    self.input_size = input_size
    self.context_input_size = input_size * (width + 1)
    self.hidden_size = hidden_size

    forward_paddings, backward_paddings = [], []
    forward_blocks, backward_blocks = [], []
    forward_projects, backward_projects = [], []
    for i in range(n_layers):
        forward_paddings.append(
            torch.nn.Parameter(torch.randn(width, hidden_size)))
        backward_paddings.append(
            torch.nn.Parameter(torch.randn(width, hidden_size)))
        forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        backward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        forward_projects.append(
            torch.nn.Linear(self.context_input_size, hidden_size))
        backward_projects.append(
            torch.nn.Linear(self.context_input_size, hidden_size))

    self.forward_projects = torch.nn.ModuleList(forward_projects)
    self.backward_projects = torch.nn.ModuleList(backward_projects)
    self.forward_paddings = torch.nn.ParameterList(forward_paddings)
    self.backward_paddings = torch.nn.ParameterList(backward_paddings)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)

    self.reset_parameters()
def __init__(
    self,
    num_layers: int,
    decoding_dim: int,
    target_embedding_dim: int,
    feedforward_hidden_dim: int,
    num_attention_heads: int,
    combiner: TransformerCombiner,
    num_sources: int,
    use_positional_encoding: bool = True,
    positional_encoding_max_steps: int = 5000,
    dropout_prob: float = 0.1,
    residual_dropout_prob: float = 0.2,
    attention_dropout_prob: float = 0.2,
) -> None:
    super().__init__(decoding_dim, target_embedding_dim, decodes_parallel=True)

    self._decoding_dim = decoding_dim
    self._embed_scale = math.sqrt(decoding_dim)
    self._positional_embedder = (PositionalEncoding(
        input_dim=decoding_dim, max_len=positional_encoding_max_steps)
        if use_positional_encoding else None)
    self._dropout = nn.Dropout(dropout_prob)

    generic_attn = MultiHeadedAttention(num_attention_heads, decoding_dim,
                                        attention_dropout_prob)
    combined_attn = AttentionCombiner(num_sources, generic_attn, combiner)
    feed_forward = PositionwiseFeedForward(decoding_dim, feedforward_hidden_dim,
                                           dropout_prob)
    layer = DecoderLayer(size=decoding_dim,
                         self_attn=deepcopy(generic_attn),
                         src_attn=deepcopy(combined_attn),
                         feed_forward=feed_forward,
                         dropout=residual_dropout_prob)
    self._self_attention_layers = _clones(layer, num_layers)
    self.norm = nn.LayerNorm(layer.size)
class BertGlobalAttentionClassifier(nn.Module):
    def __init__(self, bert_pretrained_weights, num_class):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)
        self.linear_layer = nn.Linear(768 * 2 + 5, num_class)
        self.manual_feature_layer = nn.Linear(27, 5)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='mean')
        self.prompt_global_attention = GlobalAttention(hid_dim=768, key_size=768)
        self.doc_global_attention = GlobalAttention(hid_dim=768, key_size=768)
        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)

    def forward(self, inputs, mask, sent_counts, sent_lens, prompt_inputs,
                prompt_mask, prompt_sent_counts, prompt_sent_lens,
                manual_feature, label=None):
        """
        :param prompt_sent_lens:
        :param prompt_sent_counts:
        :param prompt_inputs:
        :param prompt_mask:
        :param inputs: [batch size, max sent count, max sent len]
        :param mask: [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])
        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs, attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size, max_sent_count,
                                                     max_sent_length, -1)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]

        docs = []
        lens = []
        for i in range(0, batch_size):
            doc = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]
            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)
            # concatenate the sentences of a doc into one token sequence
            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)
            lens.append(doc_vec.shape[1])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)

        batch_max_len = max(lens)
        for i, doc in enumerate(docs):
            if doc.shape[1] < batch_max_len:
                pd = (0, 0, 0, batch_max_len - doc.shape[1])
                m = nn.ConstantPad2d(pd, 0)
                doc = m(doc)
            docs[i] = doc
        # [batch size, batch max len, bert embedding dim]
        docs = torch.cat(docs, 0)
        docs_mask = get_mask_from_sequence_lengths(
            torch.tensor(lens), max_length=batch_max_len).to(docs.device)

        prompt = []
        for j in range(prompt_sent_counts):
            length = prompt_sent_lens[0][j]
            sent = prompt_hidden_states[j, :length, :]
            prompt.append(sent)
        prompt_vec = torch.cat(prompt, dim=0).unsqueeze(0)
        prompt_vec = self.positional_encoding.forward(prompt_vec)
        prompt_len = prompt_vec.shape[1]
        prompt_attention_mask = get_mask_from_sequence_lengths(
            torch.tensor([prompt_len]),
            max_length=prompt_len).to(prompt_vec.device)
        # [1, seq len]
        prompt_vec_weights = self.prompt_global_attention(prompt_vec,
                                                          prompt_attention_mask)
        # [1, bert hidden size]
        prompt_vec = torch.bmm(prompt_vec_weights.unsqueeze(1),
                               prompt_vec).squeeze(1)

        doc_weights = self.doc_global_attention(docs, docs_mask)
        doc_vec = torch.bmm(doc_weights.unsqueeze(1), docs).squeeze(1)

        doc_feature = self.dropout_layer(torch.tanh(doc_vec))
        prompt_feature = self.dropout_layer(
            torch.tanh(prompt_vec.expand_as(doc_feature)))
        # project the 27 hand-crafted features to 5 dims so the concatenated
        # feature matches the 768 * 2 + 5 input of the classifier layer
        manual_feature = torch.tanh(
            self.manual_feature_layer(self.dropout_layer(manual_feature)))
        feature = torch.cat([doc_feature, prompt_feature, manual_feature], dim=-1)

        log_probs = torch.log_softmax(self.linear_layer(feature), dim=-1)
        # log_probs = self.classifier(docs)

        if label is not None:
            loss = self.criterion(
                input=log_probs.contiguous().view(-1, log_probs.shape[-1]),
                target=label.contiguous().view(-1))
        else:
            loss = None
        prediction = torch.max(log_probs, dim=1)[1]

        return {'loss': loss, 'prediction': prediction}
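# A minimal usage sketch for BertGlobalAttentionClassifier (not part of the
# original source): the 'bert-base-uncased' checkpoint, the toy shapes, and the
# helper name _demo_bert_global_attention_classifier are assumptions for
# illustration; it relies on the module-level imports and helpers used above
# (torch, nn, BertModel, GlobalAttention, get_mask_from_sequence_lengths).
def _demo_bert_global_attention_classifier():
    model = BertGlobalAttentionClassifier('bert-base-uncased', num_class=4)
    model.eval()
    # two essays, at most three sentences of ten word pieces each
    inputs = torch.randint(100, 5000, (2, 3, 10))
    mask = torch.ones_like(inputs)
    sent_counts = torch.tensor([3, 2])
    sent_lens = torch.tensor([[10, 10, 10], [10, 10, 0]])
    # a single prompt with two sentences; the forward flattens it to [2, 10]
    prompt_inputs = torch.randint(100, 5000, (2, 10))
    prompt_mask = torch.ones_like(prompt_inputs)
    prompt_sent_counts = 2
    prompt_sent_lens = torch.tensor([[10, 10]])
    manual_feature = torch.randn(2, 27)
    label = torch.tensor([1, 3])
    with torch.no_grad():
        output = model(inputs, mask, sent_counts, sent_lens, prompt_inputs,
                       prompt_mask, prompt_sent_counts, prompt_sent_lens,
                       manual_feature, label)
    print(output['loss'], output['prediction'])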
class BertClassifier(nn.Module):
    def __init__(self, bert_pretrained_weights, num_class, kernel_size, kernel_nums):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)
        # self.classifier = CNNClassifier(num_class=num_class,
        #                                 input_dim=768,
        #                                 kernel_nums=kernel_nums,
        #                                 kernel_sizes=kernel_size,
        #                                 max_kernel_size=kernel_size[-1])
        # self.essay_feature_extracter = CNNFeatureExtrater(
        #     input_dim=768, output_dim=300,
        #     kernel_nums=kernel_nums, kernel_sizes=kernel_size,
        #     max_kernel_size=kernel_size[-1])
        # self.prompt_feature_extracter = CNNFeatureExtrater(
        #     input_dim=768, output_dim=300,
        #     kernel_sizes=[2, 4, 8, 16, 32, 64, 128, 256],
        #     kernel_nums=[64, 64, 64, 64, 64, 64, 64, 64],
        #     max_kernel_size=kernel_size[-1])
        self.linear_layer = nn.Linear(768 * 2, num_class)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='mean')

    def forward(self, inputs, mask, sent_counts, sent_lens, prompt_inputs,
                prompt_mask, prompt_sent_counts, prompt_sent_lens, label=None):
        """
        :param prompt_sent_lens:
        :param prompt_sent_counts:
        :param prompt_inputs:
        :param prompt_mask:
        :param inputs: [batch size, max sent count, max sent len]
        :param mask: [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])
        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs, attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size, max_sent_count,
                                                     max_sent_length, -1)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]

        docs = []
        lens = []
        for i in range(0, batch_size):
            doc = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]
            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)
            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)
            # mean over all tokens of a doc
            doc_vec = torch.mean(doc_vec, dim=1)
            lens.append(doc_vec.shape[0])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)

        # batch_max_len = max(lens)
        # for i, doc in enumerate(docs):
        #     if doc.shape[0] < batch_max_len:
        #         pd = (0, 0, 0, batch_max_len - doc.shape[0])
        #         m = nn.ConstantPad2d(pd, 0)
        #         doc = m(doc)
        #     docs[i] = doc.unsqueeze(0)

        # [batch size, bert embedding dim]
        docs = torch.cat(docs, 0)
        # print(docs.shape)
        # docs = self.positional_encoding.forward(docs)
        # [batch size, num_class]

        prompt = []
        for j in range(prompt_sent_counts):
            length = prompt_sent_lens[0][j]
            sent = prompt_hidden_states[j, :length, :]
            prompt.append(sent)
        prompt_vec = torch.cat(prompt, dim=0).unsqueeze(0)
        prompt_vec = self.positional_encoding.forward(prompt_vec)
        prompt_vec = torch.mean(prompt_vec, dim=1)

        # [batch size, feature size]
        # doc_feature = self.essay_feature_extracter(docs)
        # prompt_feature = self.prompt_feature_extracter(prompt_vec)
        # prompt_feature = prompt_feature.expand_as(doc_feature)
        doc_feature = self.dropout_layer(torch.tanh(docs))
        prompt_feature = self.dropout_layer(
            torch.tanh(prompt_vec.expand_as(doc_feature)))
        feature = torch.cat([doc_feature, prompt_feature], dim=-1)

        log_probs = torch.log_softmax(self.linear_layer(feature), dim=-1)
        # log_probs = self.classifier(docs)

        if label is not None:
            loss = self.criterion(
                input=log_probs.contiguous().view(-1, log_probs.shape[-1]),
                target=label.contiguous().view(-1))
        else:
            loss = None
        prediction = torch.max(log_probs, dim=1)[1]

        return {'loss': loss, 'prediction': prediction}
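# A minimal usage sketch for BertClassifier (not from the original source), with
# the same assumed toy shapes as the sketch above; kernel_size / kernel_nums are
# only needed by the commented-out CNN branches, so placeholder values suffice.
def _demo_bert_classifier():
    model = BertClassifier('bert-base-uncased', num_class=4,
                           kernel_size=[2, 3, 4], kernel_nums=[64, 64, 64])
    model.eval()
    inputs = torch.randint(100, 5000, (2, 3, 10))
    mask = torch.ones_like(inputs)
    sent_counts = torch.tensor([3, 2])
    sent_lens = torch.tensor([[10, 10, 10], [10, 10, 0]])
    prompt_inputs = torch.randint(100, 5000, (2, 10))
    prompt_mask = torch.ones_like(prompt_inputs)
    with torch.no_grad():
        output = model(inputs, mask, sent_counts, sent_lens, prompt_inputs,
                       prompt_mask, prompt_sent_counts=2,
                       prompt_sent_lens=torch.tensor([[10, 10]]),
                       label=torch.tensor([0, 2]))
    print(output['loss'], output['prediction'])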
class MixBertRecurrentAttentionRegressor(nn.Module):
    def __init__(self, bert_pretrained_weights):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)
        self.linear_layer = nn.Linear(768 + 5 + 300, 1)
        self.dropout_layer = nn.Dropout(0.6)
        self.criterion = nn.MSELoss(reduction='sum')
        self.manual_feature_layer = nn.Linear(27, 5)
        self.prompt_global_attention = GlobalAttention(hid_dim=768, key_size=768)
        self.prompt_doc_attention = BahdanauAttention(hid_dim=768, key_size=768,
                                                      query_size=768)
        self.segment_encoder = RNNEncoder(embedding_dim=768, hid_dim=150,
                                          num_layers=1, dropout_rate=0.5)
        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)

    def forward(self, inputs, mask, sent_counts, sent_lens, prompt_inputs,
                prompt_mask, prompt_sent_counts, prompt_sent_lens, min_score,
                max_score, manual_feature, label=None):
        """
        :param manual_feature: [batch size, 27]
        :param max_score: [batch size]
        :param min_score: [batch size]
        :param prompt_sent_lens: [batch size, max sent count]
        :param prompt_sent_counts: [batch size]
        :param prompt_inputs: [batch size, max sent count, max sent len]
        :param prompt_mask: [batch size, max sent count, max sent len]
        :param inputs: [batch size, max sent count, max sent len]
        :param mask: [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]
        max_prompt_sent_count = prompt_inputs.shape[1]
        max_prompt_sent_length = prompt_inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])
        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs, attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size, max_sent_count,
                                                     max_sent_length, -1)
        last_hidden_states = self.dropout_layer(last_hidden_states)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]
        prompt_hidden_states = prompt_hidden_states.view(
            batch_size, max_prompt_sent_count, max_prompt_sent_length, -1)
        prompt_hidden_states = self.dropout_layer(prompt_hidden_states)

        docs = []
        lens = []
        doc_segments = []
        for i in range(0, batch_size):
            doc = []
            doc_segment = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]
            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                mean_cur_sent = torch.mean(cur_sent, dim=0)
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)
                doc_segment.append(mean_cur_sent.unsqueeze(0))
            # [1, len, hid size]
            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)
            lens.append(doc_vec.shape[1])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)
            doc_segments.append(doc_segment)

        batch_max_len = max(lens)
        for i, doc in enumerate(docs):
            if doc.shape[1] < batch_max_len:
                pd = (0, 0, 0, batch_max_len - doc.shape[1])
                m = nn.ConstantPad2d(pd, 0)
                doc = m(doc)
            docs[i] = doc
        # [batch size, batch max len, bert embedding dim]
        docs = torch.cat(docs, 0)
        docs_mask = get_mask_from_sequence_lengths(
            torch.tensor(lens), max_length=batch_max_len).to(docs.device)
        # print('lens ', lens)
        # print('docs shape', docs.shape)

        prompt_docs = []
        prompt_lens = []
        for i in range(0, batch_size):
            prompt_doc = []
            prompt_sent_count = prompt_sent_counts[i]
            prompt_sent_len = prompt_sent_lens[i]
            for j in range(prompt_sent_count):
                length = prompt_sent_len[j]
                cur_sent = prompt_hidden_states[i, j, :length, :]
                prompt_doc.append(cur_sent)
            prompt_doc_vec = torch.cat(prompt_doc, dim=0).unsqueeze(0)
            prompt_doc_vec = self.positional_encoding.forward(prompt_doc_vec)
            prompt_lens.append(prompt_doc_vec.shape[1])
            prompt_docs.append(prompt_doc_vec)

        prompt_batch_max_len = max(prompt_lens)
        for i, doc in enumerate(prompt_docs):
            if doc.shape[1] < prompt_batch_max_len:
                pd = (0, 0, 0, prompt_batch_max_len - doc.shape[1])
                m = nn.ConstantPad2d(pd, 0)
                doc = m(doc)
            prompt_docs[i] = doc
        prompt_docs = torch.cat(prompt_docs, 0)
        prompt_attention_mask = get_mask_from_sequence_lengths(
            torch.tensor(prompt_lens),
            max_length=prompt_batch_max_len).to(docs.device)

        # [batch size, max seq len]
        prompt_vec_weights = self.prompt_global_attention(prompt_docs,
                                                          prompt_attention_mask)
        # [batch size, bert hidden size]
        prompt_vec = torch.bmm(prompt_vec_weights.unsqueeze(1),
                               prompt_docs).squeeze(1)

        doc_weights = self.prompt_doc_attention(query=prompt_vec, key=docs,
                                                mask=docs_mask)
        doc_vec = torch.bmm(doc_weights.unsqueeze(1), docs).squeeze(1)

        doc_feature = self.dropout_layer(torch.tanh(doc_vec))
        manual_feature = torch.tanh(
            self.manual_feature_layer(self.dropout_layer(manual_feature)))

        # rnn segments encoder
        sorted_index = sorted(range(len(sent_counts)),
                              key=lambda i: sent_counts[i],
                              reverse=True)
        max_count = max_sent_count
        for idx, doc in enumerate(doc_segments):
            for i in range(max_count - len(doc)):
                doc.append(torch.zeros_like(doc[0]))
            doc_segments[idx] = torch.cat(doc, dim=0).unsqueeze(0)
        doc_segments = torch.cat(doc_segments, dim=0)
        sorted_doc_segments = doc_segments[sorted_index]
        sorted_batch_counts = sent_counts[sorted_index]
        final_hidden_states = self.segment_encoder(
            sorted_doc_segments, sorted_batch_counts)['final_hidden_states']
        # scatter the encoder outputs back into the original batch order
        final_hidden_states[sorted_index] = final_hidden_states
        final_hidden_states = torch.tanh(final_hidden_states)
        final_hidden_states = self.dropout_layer(final_hidden_states)

        # feature = self.dropout_layer(torch.tanh(doc_vec))
        # prompt_feature = self.dropout_layer(torch.tanh(prompt_vec.expand_as(doc_feature)))
        feature = torch.cat([doc_feature, manual_feature, final_hidden_states],
                            dim=-1)
        grade = self.linear_layer(feature)

        if label is not None:
            # print('label ', label)
            # print('min score ', min_score)
            # print('max score ', max_score)
            # grade = grade * (max_score - min_score) + min_score
            label = (label.type_as(grade) - min_score.type_as(grade)) / (
                max_score.type_as(grade) - min_score.type_as(grade))
            loss = self.criterion(
                input=grade.contiguous().view(-1),
                target=label.type_as(grade).contiguous().view(-1))
        else:
            loss = None
        prediction = grade * (max_score.type_as(grade) -
                              min_score.type_as(grade)) + min_score.type_as(grade)

        return {'loss': loss, 'prediction': prediction}
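# A minimal usage sketch for MixBertRecurrentAttentionRegressor (not from the
# original source): it assumes the 'bert-base-uncased' checkpoint, toy shapes,
# and that RNNEncoder returns a 300-dim 'final_hidden_states' entry matching the
# 768 + 5 + 300 regressor input.
def _demo_mix_bert_recurrent_attention_regressor():
    model = MixBertRecurrentAttentionRegressor('bert-base-uncased')
    model.eval()
    inputs = torch.randint(100, 5000, (2, 3, 10))
    mask = torch.ones_like(inputs)
    sent_counts = torch.tensor([3, 2])
    sent_lens = torch.tensor([[10, 10, 10], [10, 10, 0]])
    # unlike the classifiers above, this model expects a batched 3-d prompt tensor
    prompt_inputs = torch.randint(100, 5000, (2, 2, 10))
    prompt_mask = torch.ones_like(prompt_inputs)
    prompt_sent_counts = torch.tensor([2, 2])
    prompt_sent_lens = torch.tensor([[10, 10], [10, 10]])
    manual_feature = torch.randn(2, 27)
    min_score = torch.tensor([0.0, 0.0])
    max_score = torch.tensor([10.0, 10.0])
    label = torch.tensor([7.0, 4.0])
    with torch.no_grad():
        output = model(inputs, mask, sent_counts, sent_lens, prompt_inputs,
                       prompt_mask, prompt_sent_counts, prompt_sent_lens,
                       min_score, max_score, manual_feature, label)
    print(output['loss'], output['prediction'])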
def __init__(self, width: int, input_size: int, hidden_size: int, n_heads: int,
             n_layers: int, n_highway: int, use_position: bool = False,
             use_relative_position: bool = False, dropout: float = 0.0):
    super(SelfAttentiveLBLBiLM, self).__init__()
    self.use_position = use_position
    self.use_relative_position_weights = use_relative_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.n_heads = n_heads
    self.input_size = input_size
    self.width = width
    self.hidden_size = hidden_size

    forward_attns, backward_attns = [], []
    forward_paddings, backward_paddings = [], []
    forward_blocks, backward_blocks = [], []
    forward_weights, backward_weights = [], []
    for _ in range(n_layers):
        forward_attns.append(MultiHeadedAttention(n_heads, hidden_size, dropout))
        backward_attns.append(MultiHeadedAttention(n_heads, hidden_size, dropout))
        forward_paddings.append(
            torch.nn.Parameter(
                torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
        backward_paddings.append(
            torch.nn.Parameter(
                torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
        forward_blocks.append(Highway(hidden_size, n_highway))
        backward_blocks.append(Highway(hidden_size, n_highway))
        if self.use_relative_position_weights:
            forward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))
            backward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))

    self.forward_attns = torch.nn.ModuleList(forward_attns)
    self.backward_attns = torch.nn.ModuleList(backward_attns)
    self.forward_paddings = torch.nn.ParameterList(forward_paddings)
    self.backward_paddings = torch.nn.ParameterList(backward_paddings)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)
    if self.use_relative_position_weights:
        self.forward_weights = torch.nn.ParameterList(forward_weights)
        self.backward_weights = torch.nn.ParameterList(backward_weights)
    if self.use_position:
        self.position = PositionalEncoding(hidden_size)
class BertSimpleClassifier(nn.Module):
    def __init__(self, bert_pretrained_weights, num_class):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_pretrained_weights)
        self.positional_encoding = PositionalEncoding(input_dim=768)
        # self.linear_doc = nn.Linear(768, 768)
        # self.linear_prompt = nn.Linear(768, 768)
        self.linear_layer = nn.Linear(768 * 2, num_class)
        self.dropout_layer = nn.Dropout(0.5)
        self.criterion = nn.NLLLoss(reduction='sum')
        nn.init.uniform_(self.linear_layer.weight.data, -0.1, 0.1)
        nn.init.zeros_(self.linear_layer.bias.data)

    def forward(self, inputs, mask, sent_counts, sent_lens, prompt_inputs,
                prompt_mask, prompt_sent_counts, prompt_sent_lens, label=None):
        """
        :param prompt_sent_lens:
        :param prompt_sent_counts:
        :param prompt_inputs:
        :param prompt_mask:
        :param inputs: [batch size, max sent count, max sent len]
        :param mask: [batch size, max sent count, max sent len]
        :param sent_counts: [batch size]
        :param sent_lens: [batch size, max sent count]
        :param label: [batch size]
        :return:
        """
        batch_size = inputs.shape[0]
        max_sent_count = inputs.shape[1]
        max_sent_length = inputs.shape[2]

        inputs = inputs.view(-1, inputs.shape[-1])
        mask = mask.view(-1, mask.shape[-1])
        # [batch size * max sent count, max sent len, hid size]
        last_hidden_states = self.bert(input_ids=inputs, attention_mask=mask)[0]
        last_hidden_states = last_hidden_states.view(batch_size, max_sent_count,
                                                     max_sent_length, -1)
        last_hidden_states = self.dropout_layer(last_hidden_states)

        prompt_inputs = prompt_inputs.view(-1, prompt_inputs.shape[-1])
        prompt_mask = prompt_mask.view(-1, prompt_mask.shape[-1])
        prompt_hidden_states = self.bert(input_ids=prompt_inputs,
                                         attention_mask=prompt_mask)[0]
        prompt_hidden_states = self.dropout_layer(prompt_hidden_states)

        docs = []
        lens = []
        for i in range(0, batch_size):
            doc = []
            sent_count = sent_counts[i]
            sent_len = sent_lens[i]
            for j in range(sent_count):
                length = sent_len[j]
                cur_sent = last_hidden_states[i, j, :length, :]
                # print('cur sent shape', cur_sent.shape)
                doc.append(cur_sent)
            # mean for a doc
            doc_vec = torch.cat(doc, dim=0).unsqueeze(0)
            doc_vec = self.positional_encoding.forward(doc_vec)
            doc_vec = torch.mean(doc_vec, dim=1)
            lens.append(doc_vec.shape[0])
            # print(i, 'doc shape', doc_vec.shape)
            docs.append(doc_vec)

        # [batch size, bert embedding dim]
        docs = torch.cat(docs, 0)

        prompt = []
        for j in range(prompt_sent_counts):
            length = prompt_sent_lens[0][j]
            sent = prompt_hidden_states[j, :length, :]
            prompt.append(sent)
        prompt_vec = torch.cat(prompt, dim=0).unsqueeze(0)
        prompt_vec = self.positional_encoding.forward(prompt_vec)
        # mean [1, bert embedding dim]
        prompt_vec = torch.mean(prompt_vec, dim=1)
        # prompt_vec = self.linear_prompt(prompt_vec)

        doc_feature = docs
        prompt_feature = prompt_vec.expand_as(doc_feature)
        feature = torch.cat([doc_feature, prompt_feature], dim=-1)

        log_probs = torch.log_softmax(torch.tanh(self.linear_layer(feature)),
                                      dim=-1)
        # log_probs = self.classifier(docs)

        if label is not None:
            loss = self.criterion(
                input=log_probs.contiguous().view(-1, log_probs.shape[-1]),
                target=label.contiguous().view(-1))
        else:
            loss = None
        prediction = torch.max(log_probs, dim=1)[1]

        return {'loss': loss, 'prediction': prediction}
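# A minimal usage sketch for BertSimpleClassifier (not from the original source),
# mirroring the assumed toy inputs used for the classifiers above.
def _demo_bert_simple_classifier():
    model = BertSimpleClassifier('bert-base-uncased', num_class=4)
    model.eval()
    inputs = torch.randint(100, 5000, (2, 3, 10))
    mask = torch.ones_like(inputs)
    sent_counts = torch.tensor([3, 2])
    sent_lens = torch.tensor([[10, 10, 10], [10, 10, 0]])
    prompt_inputs = torch.randint(100, 5000, (2, 10))
    prompt_mask = torch.ones_like(prompt_inputs)
    with torch.no_grad():
        output = model(inputs, mask, sent_counts, sent_lens, prompt_inputs,
                       prompt_mask, prompt_sent_counts=2,
                       prompt_sent_lens=torch.tensor([[10, 10]]),
                       label=torch.tensor([0, 2]))
    print(output['loss'], output['prediction'])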