def test_bidaf_trilinear_similarity(self):
    linear = LinearMatrixAttention(2, 2, combination="x,y,x*y")
    linear._weight_vector = Parameter(torch.FloatTensor([-0.3, 0.5, 2.0, -1.0, 1, 1]))
    linear._bias = Parameter(torch.FloatTensor([0.0]))
    output = linear(
        torch.FloatTensor([[[0, 0], [4, 5]], [[-7, -8], [10, 11]]]),
        torch.FloatTensor([[[1, 2], [4, 5]], [[7, 8], [10, 11]]]),
    )
    assert_almost_equal(
        output.data.numpy(),
        numpy.array(
            [
                [
                    [0 + 0 + 2 + -2 + 0 + 0, 0 + 0 + 8 + -5 + 0 + 0],
                    [-1.2 + 2.5 + 2 + -2 + 4 + 10, -1.2 + 2.5 + 8 + -5 + 16 + 25],
                ],
                [
                    [2.1 + -4 + 14 + -8 + -49 + -64, 2.1 + -4 + 20 + -11 + -70 + -88],
                    [-3 + 5.5 + 14 + -8 + 70 + 88, -3 + 5.5 + 20 + -11 + 100 + 121],
                ],
            ]
        ),
        decimal=2,
    )
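# A minimal hand-check (not part of the original test file) of the expected values
# above. With combination "x,y,x*y", the score for a pair (x, y) is
# w . [x; y; x*y] + b. For batch 0, x = [4, 5] scored against y = [4, 5] this gives
# -1.2 + 2.5 + 8 - 5 + 16 + 25 = 45.3, matching the test.
import numpy

w = numpy.array([-0.3, 0.5, 2.0, -1.0, 1.0, 1.0])
bias = 0.0
x = numpy.array([4.0, 5.0])
y = numpy.array([4.0, 5.0])
print(w @ numpy.concatenate([x, y, x * y]) + bias)  # 45.3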
def test_linear_similarity(self):
    linear = LinearMatrixAttention(3, 3)
    linear._weight_vector = Parameter(torch.FloatTensor([-.3, .5, 2.0, -1.0, 1, 1]))
    linear._bias = Parameter(torch.FloatTensor([.1]))
    output = linear(
        Variable(torch.FloatTensor([[[0, 0, 0], [4, 5, 6]], [[-7, -8, -9], [10, 11, 12]]])),
        Variable(torch.FloatTensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])),
    )
    assert_almost_equal(
        output.data.numpy(),
        numpy.array([[[4.1000, 7.1000], [17.4000, 20.4000]],
                     [[-9.8000, -6.8000], [36.6000, 39.6000]]]),
        decimal=2,
    )
def __init__(self,
             input_dim: int,
             output_dim: int,
             num_heads: int = 1,
             activation: Activation = None,
             input_dropout: float = 0.0,
             att_dropout: float = 0.0) -> None:
    super().__init__()
    self._hidden_dim = output_dim
    self._weight_vector = nn.Parameter(torch.FloatTensor(input_dim, self._hidden_dim))
    self.activation = activation or (lambda x: x)

    attention_dim = output_dim / num_heads
    assert attention_dim.is_integer(), "output dim must be divisible by number of heads"
    self._attention_dim = int(attention_dim)
    self._num_heads = num_heads

    self.matrix_attention = LinearMatrixAttention(tensor_1_dim=self._attention_dim,
                                                  tensor_2_dim=self._attention_dim,
                                                  combination='x,y')

    if input_dropout is not None and input_dropout > 0:
        self.input_dropout = nn.Dropout(input_dropout)
    else:
        self.input_dropout = lambda x: x

    if att_dropout is not None and att_dropout > 0:
        self.att_dropout = nn.Dropout(att_dropout)
    else:
        self.att_dropout = lambda x: x

    self.reset_parameters()
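# A minimal sketch of how the per-head attention above could be applied (an
# assumption for illustration, not the repository's actual forward pass): the
# hidden state is split into `num_heads` chunks of size `attention_dim`, and the
# shared LinearMatrixAttention scores each chunk against itself.
import torch
from allennlp.modules.matrix_attention import LinearMatrixAttention

batch, seq_len, output_dim, num_heads = 2, 5, 8, 4
attention_dim = output_dim // num_heads  # must divide evenly, as asserted above

attn = LinearMatrixAttention(tensor_1_dim=attention_dim,
                             tensor_2_dim=attention_dim,
                             combination='x,y')

x = torch.rand(batch, seq_len, output_dim)
# (batch, seq, heads, head_dim) -> (batch * heads, seq, head_dim)
per_head = (x.view(batch, seq_len, num_heads, attention_dim)
             .permute(0, 2, 1, 3)
             .reshape(batch * num_heads, seq_len, attention_dim))
scores = attn(per_head, per_head)
print(scores.shape)  # torch.Size([8, 5, 5])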
def __init__(self, config):
    super(RobertaForRopes, self).__init__(config)

    self.roberta = RobertaModel(config)
    self.find_object1 = MLP(config.hidden_size, 1)
    self.find_object2 = MLP(config.hidden_size, 1)
    self.find_TP = MLP(config.hidden_size, 1)
    self.bb_matrix_attention = LinearMatrixAttention(tensor_1_dim=config.hidden_size,
                                                     tensor_2_dim=config.hidden_size,
                                                     combination="x,y,x*y")
    self.bs_bilinear_imilairty = BilinearMatrixAttention(matrix_1_dim=config.hidden_size,
                                                         matrix_2_dim=config.hidden_size)
    self.ss_matrix_attention = LinearMatrixAttention(tensor_1_dim=config.hidden_size,
                                                     tensor_2_dim=config.hidden_size,
                                                     combination="x,y,x*y")
    self.rel_SPo1_SPo2 = Relevance(config)
    self.pol_TP_SP = MLP(config.hidden_size * 2, 2)
    # self.rel_TPo1_TPo2 = None
    # self.answer_according_to_question = None

    self.init_weights()
def __init__(self,
             span_dim,
             max_decoding_steps=5,
             predict_eos=True,
             cell='lstm',
             train_helper="sample",
             val_helper="beamsearch",
             beam_size=3,
             aux_input_dim=None,  # 200
             pass_label=False):
    super().__init__()
    self.evd_decoder = PointerNetDecoder(LinearMatrixAttention(span_dim, span_dim, "x,y,x*y"),
                                         memory_dim=span_dim,
                                         aux_input_dim=aux_input_dim,
                                         train_helper=train_helper,
                                         val_helper=val_helper,
                                         beam_size=beam_size,
                                         max_decoding_steps=max_decoding_steps,
                                         predict_eos=predict_eos,
                                         cell=cell)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             binary_feature_dim: int,
             embedding_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             label_smoothing: float = None,
             ignore_span_metric: bool = False,
             srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
             image_embedding_size: int = 2052,
             lamb: float = 0.2) -> None:
    super().__init__(
        vocab,
        text_field_embedder,
        encoder,
        binary_feature_dim,
        embedding_dropout,
        initializer,
        regularizer,
        label_smoothing,
        ignore_span_metric,
        srl_eval_path,
    )
    self.image_embedding_size = image_embedding_size
    self.embed_dim = self.encoder.get_output_dim()
    self.img_enc = EncoderImagePrecomp(self.image_embedding_size,
                                       self.embed_dim,
                                       no_imgnorm=False)
    self.vse_loss = ContrastiveLoss(margin=0.2)
    self.attention = LinearMatrixAttention(self.embed_dim, self.embed_dim)  # tune it
    self.lamb = lamb
    self.lamb = torch.tensor(self.lamb)
    self.tag_projection_layer = TimeDistributed(Linear(self.embed_dim * 2, self.num_classes))
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 300
    NUM_FILTERS = 60
    NGRAM_FILTER_SIZES = (2, 3, 4, 5, 6)
    # out_dim for char = len(NGRAM_FILTER_SIZES) * NUM_FILTERS
    F_OUT = 200

    elmo_options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    elmo_weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file)

    character_embedding = Embedding(vocab=vocab,
                                    embedding_dim=EMBEDDING_DIM,
                                    vocab_namespace='character_vocab')
    cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             ngram_filter_sizes=NGRAM_FILTER_SIZES)
    token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

    pos_tag_embedding = Embedding(vocab=vocab,
                                  embedding_dim=EMBEDDING_DIM,
                                  vocab_namespace='pos_tag_vocab')
    ner_tag_embedding = Embedding(vocab=vocab,
                                  embedding_dim=EMBEDDING_DIM,
                                  vocab_namespace='ner_tag_vocab')
    word_embedding = Embedding(vocab=vocab,
                               embedding_dim=EMBEDDING_DIM,
                               vocab_namespace='token_vocab')

    utterance_embedder = BasicTextFieldEmbedder(
        token_embedders={
            'elmo_tokens': elmo_embedding,
            'token_characters': token_encoder,
            'pos_tags': pos_tag_embedding,
            'ner_tags': ner_tag_embedding
        })

    # slot embedder
    slot_embedder = BasicTextFieldEmbedder(
        token_embedders={
            'elmo_tokens': elmo_embedding,
            'token_characters': token_encoder,
        })

    utterance_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * EMBEDDING_DIM + 1024 + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM,
                      num_layers=2,
                      batch_first=True,
                      bidirectional=True))
    slot_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(1024 + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM,
                      num_layers=2,
                      batch_first=True,
                      bidirectional=True))

    similarity = LinearMatrixAttention(tensor_1_dim=2 * HIDDEN_DIM,
                                       tensor_2_dim=2 * HIDDEN_DIM,
                                       combination="x,y,x*y",
                                       activation=Activation.by_name('tanh')())

    modeling_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * 5 * HIDDEN_DIM,  # bi-direction
                      HIDDEN_DIM,
                      num_layers=2,
                      batch_first=True,
                      bidirectional=True))

    # step-1 utterance
    utterance_embedder2 = BasicTextFieldEmbedder(
        token_embedders={
            'elmo_tokens': elmo_embedding,
            'token_characters': token_encoder,
            'pos_tags': pos_tag_embedding,
            'ner_tags': ner_tag_embedding
        })
    utterance_lstm2 = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * EMBEDDING_DIM + 1024 + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM,
                      num_layers=2,
                      batch_first=True,
                      bidirectional=True))

    # feed-forward layer that combines the two LSTM outputs
    final_linear_layer = FeedForward(2 * HIDDEN_DIM, 2, [HIDDEN_DIM, F_OUT],
                                     torch.nn.ReLU(), 0.3)

    # CRF model
    model = CrfTagger(vocab=vocab,
                      utterance_embedder=utterance_embedder,
                      utterance_embedder2=utterance_embedder2,
                      slot_embedder=slot_embedder,
                      utterance_encoder=utterance_lstm,
                      utterance_encoder2=utterance_lstm2,
                      slot_encoder=slot_lstm,
                      matrix_attention=similarity,
                      modeling_layer=modeling_lstm,
                      fc_ff_layer=final_linear_layer)
    return model
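# A small sanity check of the LSTM input sizes used above (an assumption that the
# concatenated embedder outputs add up as follows; ELMo's 1024-dim output and the
# char-CNN output are hard-coded in the original builder).
EMBEDDING_DIM = 300                   # pos_tag and ner_tag embedding dims
ELMO_DIM = 1024                       # 2x4096_512_2048cnn_2xhighway ELMo output
CHAR_DIM = len((2, 3, 4, 5, 6)) * 60  # CnnEncoder output: 5 ngram sizes * 60 filters

utterance_lstm_input = 2 * EMBEDDING_DIM + ELMO_DIM + CHAR_DIM  # 600 + 1024 + 300 = 1924
slot_lstm_input = ELMO_DIM + CHAR_DIM                           # 1024 + 300 = 1324
print(utterance_lstm_input, slot_lstm_input)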
def __init__(
    self,
    question_encoding_dim: int,
    passage_encoding_dim: int,
    passage_attention_to_span: Seq2SeqEncoder,
    passage_startend_predictor,
    question_attention_to_span: Seq2SeqEncoder,
    passage_attention_to_count: Seq2SeqEncoder,
    num_implicit_nums: int = None,
    passage_count_predictor=None,
    passage_count_hidden2logits=None,
    dropout: float = 0.0,
):
    super().__init__()

    self.num_counts = 10
    self.passage_attention_scalingvals = [1, 2, 5, 10]

    # Parameters for answer start/end prediction from PassageAttention
    self.passage_attention_to_span = passage_attention_to_span
    self.passage_startend_predictor = passage_startend_predictor
    # torch.nn.Linear(self.passage_attention_to_span.get_output_dim(), 2)

    # Parameters for answer start/end prediction directly from the passage encoding
    # (direct PassageSpanAnswer from a one-step program)
    self.oneshot_psa_startend_predictor = torch.nn.Linear(passage_encoding_dim, 2)

    self.question_attention_to_span = question_attention_to_span
    self.question_startend_predictor = torch.nn.Linear(
        self.question_attention_to_span.get_output_dim(), 2)

    self.passage_attention_to_count = passage_attention_to_count
    # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
    #                                                self.num_counts)
    self.passage_count_predictor = passage_count_predictor
    # Linear from self.passage_attention_to_count.output_dim --> 1
    self.passage_count_hidden2logits = passage_count_hidden2logits

    self.dotprod_matrix_attn = DotProductMatrixAttention()

    self.implicit_num_embeddings = torch.nn.Parameter(
        torch.FloatTensor(num_implicit_nums, passage_encoding_dim))
    torch.nn.init.normal_(self.implicit_num_embeddings, mean=0.0, std=0.001)
    self.implicitnum_bilinear_attention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # self.filter_matrix_attention = LinearMatrixAttention(
    #     tensor_1_dim=question_encoding_dim, tensor_2_dim=passage_encoding_dim, combination="x,y,x*y"
    # )
    self.filter_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=question_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    self._endpoint_span_extractor = EndpointSpanExtractor(
        input_dim=passage_encoding_dim, combination="x,y")

    # We will sum the passage-token-repr with the weighted-q-repr -- to use the x*y combination
    self.relocate_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=passage_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # This computes a passage-to-passage attention, hopefully putting, for each token,
    # a weight on the date tokens that are related to it.
    self.passage_to_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_start_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_end_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # This computes a passage-to-passage attention, hopefully putting, for each token,
    # a weight on the number tokens that are related to it.
    self.passage_to_num_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    combination='x,y', activation=tanh)
output = attention(vector, matrix)
print('Output from LinearAttention:', output)

# MatrixAttention
sequence_length1 = 10
sequence_length2 = 15

# dot product attention only allows matrices of the same size
matrix1 = torch.rand((batch_size, sequence_length1, embedding_dim1))
matrix2 = torch.rand((batch_size, sequence_length2, embedding_dim1))

matrix_attention = DotProductMatrixAttention()
output = matrix_attention(matrix1, matrix2)
print('Output shape of DotProductMatrixAttention:', output.shape)

# bilinear & linear attention allows inputs of different sizes
matrix1 = torch.rand((1, sequence_length1, embedding_dim1))
matrix2 = torch.rand((1, sequence_length2, embedding_dim2))

matrix_attention = BilinearMatrixAttention(matrix_1_dim=embedding_dim1,
                                           matrix_2_dim=embedding_dim2)
output = matrix_attention(matrix1, matrix2)
print('Output shape of BilinearMatrixAttention:', output.shape)

matrix_attention = LinearMatrixAttention(tensor_1_dim=embedding_dim1,
                                         tensor_2_dim=embedding_dim2,
                                         combination='x,y',
                                         activation=tanh)
output = matrix_attention(matrix1, matrix2)
print('Output shape of LinearMatrixAttention:', output.shape)
def __init__(self,
             question_encoding_dim: int,
             passage_encoding_dim: int,
             passage_attention_to_span: Seq2SeqEncoder,
             question_attention_to_span: Seq2SeqEncoder,
             passage_attention_to_count: Seq2SeqEncoder,
             passage_count_predictor=None,
             passage_count_hidden2logits=None,
             dropout: float = 0.0):
    super().__init__()

    self.num_counts = 10
    self.passage_attention_scalingvals = [1, 2, 5, 10]

    self.passage_attention_to_span = passage_attention_to_span
    self.passage_startend_predictor = torch.nn.Linear(
        self.passage_attention_to_span.get_output_dim(), 2)

    self.question_attention_to_span = question_attention_to_span
    self.question_startend_predictor = torch.nn.Linear(
        self.question_attention_to_span.get_output_dim(), 2)

    self.passage_attention_to_count = passage_attention_to_count
    # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
    #                                                self.num_counts)
    self.passage_count_predictor = passage_count_predictor
    # Linear from self.passage_attention_to_count.output_dim --> 1
    self.passage_count_hidden2logits = passage_count_hidden2logits

    self.dotprod_matrix_attn = DotProductMatrixAttention()

    self.filter_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=question_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # We will sum the passage-token-repr with the weighted-q-repr -- to use the x*y combination
    self.relocate_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=passage_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # This computes a passage-to-passage attention, hopefully putting, for each token,
    # a weight on the date tokens that are related to it.
    self.passage_to_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_start_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_end_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # This computes a passage-to-passage attention, hopefully putting, for each token,
    # a weight on the number tokens that are related to it.
    self.passage_to_num_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
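# A shape sketch for the question-to-passage filter attention above (hypothetical
# dims, not the model's actual configuration): it produces one score per
# (question token, passage token) pair, i.e. a (batch, question_len, passage_len)
# matrix. Note that the "x,y,x*y" combination requires the two input dims to match.
import torch
from allennlp.modules.matrix_attention import LinearMatrixAttention

q_dim = p_dim = 16
filter_attn = LinearMatrixAttention(tensor_1_dim=q_dim,
                                    tensor_2_dim=p_dim,
                                    combination="x,y,x*y")
question = torch.rand(2, 7, q_dim)   # (batch, question_len, q_dim)
passage = torch.rand(2, 30, p_dim)   # (batch, passage_len, p_dim)
print(filter_attn(question, passage).shape)  # torch.Size([2, 7, 30])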
def __init__(self,
             vocab: Vocabulary,
             text_encoder: Seq2SeqEncoder,
             word_embedder: TextFieldEmbedder,
             enable_training_log: bool = False,
             inp_drop_rate: float = 0.2,
             out_drop_rate: float = 0.2,
             loss_weights: List = (0.2, 0.4, 0.4),
             super_mode: str = 'before',
             backbone: str = 'unet',
             unet_down_channel: int = 256,
             feature_sel: int = 127):
    super(UnifiedFollowUp, self).__init__(vocab)
    self.text_encoder = text_encoder
    self.word_embedder = word_embedder

    """
    Define model arch choices
    """
    self.backbone = backbone

    # input dropout
    if inp_drop_rate > 0:
        self.var_inp_dropout = InputVariationalDropout(p=inp_drop_rate)
    else:
        self.var_inp_dropout = lambda x: x
    # output dropout
    if out_drop_rate > 0:
        self.var_out_dropout = InputVariationalDropout(p=out_drop_rate)
    else:
        self.var_out_dropout = lambda x: x

    self.hidden_size = text_encoder.get_output_dim() // 2 if text_encoder.is_bidirectional() \
        else text_encoder.get_output_dim()
    self.output_size = text_encoder.get_output_dim()

    # ele -> element-wise multiply
    # dot -> dot product
    # cos -> cosine similarity
    # emb_dot -> embedding dot product
    # emb_cos -> embedding cosine similarity
    # linear -> linear similarity
    # bilinear -> bilinear similarity
    feature_sel = feature_sel
    sel_arr = "{0:07b}".format(int(feature_sel))
    nni_choices = ['ele', 'dot', 'cos', 'emb_dot', 'emb_cos', 'linear', 'bilinear']

    self.segment_choices = [nni_choices[i] for i in range(7) if sel_arr[i] == '1']
    # if expanding bi-direction, we regard forward/backward as two channels
    self.expand_bidir = False

    self.similar_function = ModuleDict({
        'ele': ElementWiseMatrixAttention(),
        'dot': DotProductMatrixAttention(),
        'cos': CosineMatrixAttention(),
        'emb_dot': DotProductMatrixAttention(),
        'emb_cos': CosineMatrixAttention(),
        'bilinear': BilinearMatrixAttention(matrix_1_dim=self.output_size,
                                            matrix_2_dim=self.output_size),
        'linear': LinearMatrixAttention(tensor_1_dim=self.output_size,
                                        tensor_2_dim=self.output_size)
    })

    self.attn_channel = 0
    for choice in self.segment_choices:
        if choice == 'ele':
            self.attn_channel += self.output_size
        elif choice in ['dot', 'cos', 'emb_dot', 'emb_cos', 'bilinear', 'linear']:
            if self.expand_bidir:
                self.attn_channel += 2
            else:
                self.attn_channel += 1

    self.class_mapping: Dict[str, int] = get_class_mapping(super_mode=super_mode)

    # Here we have two choices for now: one is an MLP, the other is a UNet
    if self.backbone == 'unet':
        self.segmentation_net = AttentionUNet(input_channels=self.attn_channel,
                                              class_number=len(self.class_mapping.keys()),
                                              down_channel=unet_down_channel)
    else:
        raise Exception("Currently we do not support other arches.")

    class_zero_weight = loss_weights[0]
    class_one_weight = loss_weights[1]

    self.register_buffer('weight_tensor',
                         torch.tensor([class_zero_weight,
                                       class_one_weight,
                                       1 - class_zero_weight - class_one_weight]))
    self.loss = nn.CrossEntropyLoss(ignore_index=-1, weight=self.weight_tensor)

    # initialize metrics measurement
    self.metrics = {'ROUGE': BatchAverage(),
                    '_ROUGE1': BatchAverage(),
                    '_ROUGE2': BatchAverage(),
                    # TODO: You can speed up the code by disabling BLEU, since
                    # the corpus-based BLEU metric is very time-consuming.
                    'BLEU': CorpusBLEUMetric(),
                    'EM': BatchAverage(),
                    'F1': FScoreMetric(prefix="1"),
                    'F2': FScoreMetric(prefix="2"),
                    'F3': FScoreMetric(prefix="3")}

    parameter_num = count_parameters(self)
    print(parameter_num)

    self.min_width = 8
    self.min_height = 8
    self.enable_training_log = enable_training_log
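# A small illustration of the feature_sel bit mask above (it mirrors the list
# comprehension that builds self.segment_choices): the default 127 == 0b1111111
# enables all seven similarity channels.
nni_choices = ['ele', 'dot', 'cos', 'emb_dot', 'emb_cos', 'linear', 'bilinear']

def selected_choices(feature_sel: int):
    sel_arr = "{0:07b}".format(int(feature_sel))
    return [nni_choices[i] for i in range(7) if sel_arr[i] == '1']

print(selected_choices(127))  # all seven similarity functions
print(selected_choices(3))    # ['linear', 'bilinear'] -- only the last two bits set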