def __init__(self, word_embeddings: TextFieldEmbedder, bin_count: int):
    super(DRMM, self).__init__()

    self.word_embeddings = word_embeddings
    self.cosine_module = CosineMatrixAttention()
    self.bin_count = bin_count

    self.matching_classifier = FeedForward(
        input_dim=bin_count,
        num_layers=2,
        hidden_dims=[bin_count, 1],
        activations=[Activation.by_name('tanh')(),
                     Activation.by_name('tanh')()])

    self.query_gate = FeedForward(
        input_dim=self.word_embeddings.get_output_dim(),
        num_layers=2,
        hidden_dims=[self.word_embeddings.get_output_dim(), 1],
        activations=[Activation.by_name('tanh')(),
                     Activation.by_name('tanh')()])

    self.query_softmax = MaskedSoftmax()
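# The constructor above does not show how the bin_count-dimensional input to
# `matching_classifier` is produced. The sketch below is an assumption (not taken from
# this repository) of the usual DRMM recipe: the cosine similarities of each query term
# against all document terms are binned into a log-count histogram. `torch.histc` is not
# batched, so this toy version simply loops for clarity.
import torch

def log_count_histograms(cosine_matrix: torch.Tensor, bin_count: int) -> torch.Tensor:
    # cosine_matrix: (batch, query_len, doc_len), values in [-1, 1]
    batch, query_len, _ = cosine_matrix.shape
    histograms = torch.zeros(batch, query_len, bin_count)
    for b in range(batch):
        for q in range(query_len):
            counts = torch.histc(cosine_matrix[b, q], bins=bin_count, min=-1.0, max=1.0)
            histograms[b, q] = torch.log1p(counts)  # log(1 + count) keeps empty bins finite
    return histograms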
def __init__(self,
             unified_query_length: int,
             unified_document_length: int,
             max_conv_kernel_size: int,   # 2 to n
             conv_output_size: int,       # conv output channels
             kmax_pooling_size: int):     # per query k-max pooling
    super(PACRR, self).__init__()

    self.cosine_module = CosineMatrixAttention()
    self.unified_query_length = unified_query_length
    self.unified_document_length = unified_document_length

    self.convolutions = []
    for i in range(2, max_conv_kernel_size + 1):
        self.convolutions.append(
            nn.Sequential(
                # outputs [batch, 1, unified_query_length + i - 1, unified_document_length + i - 1]
                nn.ConstantPad2d((0, i - 1, 0, i - 1), 0),
                # outputs [batch, conv_output_size, unified_query_length, unified_document_length]
                nn.Conv2d(kernel_size=i, in_channels=1, out_channels=conv_output_size),
                # max over the channel dimension -> [batch, 1, unified_query_length, unified_document_length]
                nn.MaxPool3d(kernel_size=(conv_output_size, 1, 1))
            ))
    self.convolutions = nn.ModuleList(self.convolutions)  # register convs as part of the model

    self.masked_softmax = MaskedSoftmax()
    self.kmax_pooling_size = kmax_pooling_size

    self.dense = nn.Linear(kmax_pooling_size * unified_query_length * max_conv_kernel_size,
                           out_features=100, bias=True)
    self.dense2 = nn.Linear(100, out_features=10, bias=True)
    self.dense3 = nn.Linear(10, out_features=1, bias=False)
def __init__(self, word_embeddings: TextFieldEmbedder, n_grams: int, n_kernels: int, conv_out_dim: int):
    super(Conv_KNRM, self).__init__()

    self.word_embeddings = word_embeddings

    # static - kernel size & magnitude variables
    self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.cuda.FloatTensor(self.kernel_sigmas(n_kernels)),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    self.convolutions = []
    for i in range(1, n_grams + 1):
        self.convolutions.append(
            nn.Sequential(
                nn.ConstantPad1d((0, i - 1), 0),
                nn.Conv1d(kernel_size=i,
                          in_channels=word_embeddings.get_output_dim(),
                          out_channels=conv_out_dim),
                nn.ReLU()))
    self.convolutions = nn.ModuleList(self.convolutions)  # register convs as part of the model

    # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()

    # input size is n_kernels * n_grams * n_grams (e.g. 9 kernel blocks for n_grams == 3),
    # because we concatenate all n-gram cross-match sums before the dense layer
    self.dense = nn.Linear(n_kernels * n_grams * n_grams, 1, bias=False)

    # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
    torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)  # inits taken from matchzoo
def __init__(self, _embsize: int, kernels_mu: List[float], kernels_sigma: List[float], att_heads: int, att_layer: int, att_proj_dim: int, att_ff_dim: int, win_size: int, max_windows: int): super(TK_v2, self).__init__() n_kernels = len(kernels_mu) if len(kernels_mu) != len(kernels_sigma): raise Exception("len(kernels_mu) != len(kernels_sigma)") # static - kernel size & magnitude variables self.mu = Variable(torch.cuda.FloatTensor(kernels_mu), requires_grad=False).view(1, 1, 1, n_kernels) self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma), requires_grad=False).view(1, 1, 1, n_kernels) self.mixer = nn.Parameter( torch.full([1, 1, 1], 0.5, dtype=torch.float32, requires_grad=True)) self.stacked_att = StackedSelfAttentionEncoder( input_dim=_embsize, hidden_dim=_embsize, projection_dim=att_proj_dim, feedforward_hidden_dim=att_ff_dim, num_layers=att_layer, num_attention_heads=att_heads, dropout_prob=0, residual_dropout_prob=0, attention_dropout_prob=0) # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() self.nn_scaler = nn.ParameterList([ nn.Parameter( torch.full([1], 0.01, dtype=torch.float32, requires_grad=True)) for w in win_size ]) self.kernel_weights = nn.ModuleList( [nn.Linear(n_kernels, 1, bias=False) for w in win_size]) self.window_size = win_size self.window_scorer = [] for w in max_windows: l = nn.Linear(w, 1, bias=False) torch.nn.init.constant_(l.weight, 1 / w) self.window_scorer.append(l) self.window_scorer = nn.ModuleList(self.window_scorer) self.window_merger = nn.Linear(len(self.window_size), 1, bias=False)
def __init__(self, _embsize: int, kernels_mu: List[float], kernels_sigma: List[float],
             att_heads: int, att_layer: int, att_proj_dim: int, att_ff_dim: int):
    super(TK_v1, self).__init__()

    n_kernels = len(kernels_mu)
    if len(kernels_mu) != len(kernels_sigma):
        raise Exception("len(kernels_mu) != len(kernels_sigma)")

    # static - kernel size & magnitude variables
    self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    self.nn_scaler = nn.Parameter(torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
    self.mixer = nn.Parameter(torch.full([1, 1, 1], 0.5, dtype=torch.float32, requires_grad=True))

    self.stacked_att = StackedSelfAttentionEncoder(
        input_dim=_embsize,
        hidden_dim=_embsize,
        projection_dim=att_proj_dim,
        feedforward_hidden_dim=att_ff_dim,
        num_layers=att_layer,
        num_attention_heads=att_heads,
        dropout_prob=0,
        residual_dropout_prob=0,
        attention_dropout_prob=0)

    # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()

    # bias is set to True in the original code (we found it does not help - how could it?)
    self.dense = nn.Linear(n_kernels, 1, bias=False)
    self.dense_mean = nn.Linear(n_kernels, 1, bias=False)
    self.dense_comb = nn.Linear(2, 1, bias=False)

    # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
    torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)       # inits taken from matchzoo
    torch.nn.init.uniform_(self.dense_mean.weight, -0.014, 0.014)  # inits taken from matchzoo
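# Hedged sketch of how the modules declared above typically interact in TK's scoring
# (an illustration only - this is not the repository's actual forward()): raw and
# contextualized embeddings are blended with `mixer`, the cosine match matrix is
# kernel-pooled, and a log path (`dense`) plus a length-normalized mean path
# (`dense_mean`) are combined by `dense_comb`.
def tk_score_sketch(self, query_emb, doc_emb, query_mask, doc_mask):
    # blend original embeddings with their contextualized version (learned mixer weight)
    query = self.mixer * query_emb + (1 - self.mixer) * self.stacked_att(query_emb, query_mask)
    doc = self.mixer * doc_emb + (1 - self.mixer) * self.stacked_att(doc_emb, doc_mask)

    # (batch, query_len, doc_len, 1) cosine similarities
    cosine = self.cosine_module.forward(query, doc).unsqueeze(-1)

    # Gaussian kernels -> per-query-term soft-TF counts, masked for document padding
    kernels = torch.exp(-torch.pow(cosine - self.mu, 2) / (2 * torch.pow(self.sigma, 2)))
    kernels = kernels * doc_mask.unsqueeze(1).unsqueeze(-1)
    per_query = kernels.sum(dim=2)                                    # (batch, query_len, n_kernels)

    # log-scaled path and document-length-normalized path, masked for query padding
    log_path = torch.log2(torch.clamp(per_query, min=1e-10)) * self.nn_scaler
    mean_path = per_query / doc_mask.sum(dim=1).clamp(min=1).view(-1, 1, 1)
    log_path = log_path * query_mask.unsqueeze(-1)
    mean_path = mean_path * query_mask.unsqueeze(-1)

    score_log = self.dense(log_path.sum(dim=1))                       # (batch, 1)
    score_mean = self.dense_mean(mean_path.sum(dim=1))                # (batch, 1)
    return self.dense_comb(torch.cat([score_log, score_mean], dim=1)).squeeze(-1)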
def __init__(self, word_embeddings_out_dim: int): super(Duet, self).__init__() NUM_HIDDEN_NODES = word_embeddings_out_dim POOLING_KERNEL_WIDTH_QUERY = 18 POOLING_KERNEL_WIDTH_DOC = 100 DROPOUT_RATE = 0 NUM_POOLING_WINDOWS_DOC = 99 MAX_DOC_TERMS = 200 MAX_QUERY_TERMS = 30 self.cosine_module = CosineMatrixAttention() self.duet_local = nn.Sequential( nn.Conv1d(MAX_DOC_TERMS, NUM_HIDDEN_NODES, kernel_size=1), nn.ReLU(), Flatten(), nn.Dropout(p=DROPOUT_RATE), nn.Linear(NUM_HIDDEN_NODES * MAX_QUERY_TERMS, NUM_HIDDEN_NODES), nn.ReLU(), nn.Dropout(p=DROPOUT_RATE), nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(), nn.Dropout(p=DROPOUT_RATE)) self.duet_dist_q = nn.Sequential( nn.Conv1d(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES, kernel_size=3), nn.ReLU(), nn.MaxPool1d(POOLING_KERNEL_WIDTH_QUERY), Flatten(), nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU()) self.duet_dist_d = nn.Sequential( nn.Conv1d(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES, kernel_size=3), nn.ReLU(), nn.MaxPool1d(POOLING_KERNEL_WIDTH_DOC, stride=1), nn.Conv1d(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES, kernel_size=1), nn.ReLU()) self.duet_dist = nn.Sequential( Flatten(), nn.Dropout(p=DROPOUT_RATE), nn.Linear(NUM_HIDDEN_NODES * NUM_POOLING_WINDOWS_DOC, NUM_HIDDEN_NODES), nn.ReLU(), nn.Dropout(p=DROPOUT_RATE), nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(), nn.Dropout(p=DROPOUT_RATE)) self.duet_comb = nn.Sequential( nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(), nn.Dropout(p=DROPOUT_RATE), nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(), nn.Dropout(p=DROPOUT_RATE), nn.Linear(NUM_HIDDEN_NODES, 1), nn.ReLU()) #self.scale = nn.Parameter(torch.tensor([0.1]), requires_grad=True) def init_normal(m): if type(m) == nn.Linear: nn.init.uniform_(m.weight, 0, 0.01) self.duet_comb.apply(init_normal)
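# The `duet_local` branch above starts with nn.Conv1d(MAX_DOC_TERMS, ...), which suggests
# it consumes a (batch, MAX_DOC_TERMS, MAX_QUERY_TERMS) interaction matrix. The helper
# below is an assumption (not part of this code base) showing the usual Duet-style binary
# exact-match matrix with padding masked out.
import torch

def local_interaction_matrix(query_ids: torch.Tensor, doc_ids: torch.Tensor) -> torch.Tensor:
    # query_ids: (batch, MAX_QUERY_TERMS), doc_ids: (batch, MAX_DOC_TERMS); 0 = padding
    matches = (doc_ids.unsqueeze(-1) == query_ids.unsqueeze(1)).float()
    pad_mask = (doc_ids.unsqueeze(-1) > 0).float() * (query_ids.unsqueeze(1) > 0).float()
    return matches * pad_mask  # shape: (batch, MAX_DOC_TERMS, MAX_QUERY_TERMS)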
def __init__(self, word_embeddings: TextFieldEmbedder, n_kernels: int):
    super(KNRM, self).__init__()

    self.word_embeddings = word_embeddings

    # static - kernel size & magnitude variables
    self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    # cosine matrix
    self.cosine_module = CosineMatrixAttention()

    # linear layer that maps the pooled kernel features to the final score
    self.transform = nn.Linear(n_kernels, out_features=1, bias=True)
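# Hedged sketch of KNRM's kernel pooling using the `mu` / `sigma` buffers and the
# `transform` layer declared above (illustrative only - the class's real forward() is not
# part of this snippet; `query_emb` / `doc_emb` are assumed to already come from
# `self.word_embeddings`):
#   K_k(M_i) = sum_j exp(-(M_ij - mu_k)^2 / (2 * sigma_k^2)),  score = transform(sum_i log K(M_i))
def knrm_score_sketch(self, query_emb, doc_emb, query_mask, doc_mask):
    cosine = self.cosine_module.forward(query_emb, doc_emb).unsqueeze(-1)   # (batch, q, d, 1)
    kernels = torch.exp(-torch.pow(cosine - self.mu, 2) / (2 * torch.pow(self.sigma, 2)))
    kernels = kernels * doc_mask.unsqueeze(1).unsqueeze(-1)                 # mask padded doc terms
    per_query = kernels.sum(dim=2)                                          # soft-TF per query term
    log_tf = torch.log(torch.clamp(per_query, min=1e-10)) * 0.01            # small scaler keeps values tame
    log_tf = log_tf * query_mask.unsqueeze(-1)                              # mask padded query terms
    return self.transform(log_tf.sum(dim=1)).squeeze(-1)                    # (batch,)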
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             conv_output_size: List[int],
             conv_kernel_size: List[Tuple[int, int]],
             adaptive_pooling_size: List[Tuple[int, int]]):
    super(MatchPyramid, self).__init__()

    self.word_embeddings = word_embeddings
    self.cosine_module = CosineMatrixAttention()
    #self.cosine_module = DotProductMatrixAttention()

    if len(conv_output_size) != len(conv_kernel_size) or len(conv_output_size) != len(adaptive_pooling_size):
        raise Exception("conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length")

    conv_layer_dict = OrderedDict()
    last_channel_out = 1
    for i in range(len(conv_output_size)):
        conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
            (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1), 0)
        conv_layer_dict["conv " + str(i)] = nn.Conv2d(
            kernel_size=conv_kernel_size[i],
            in_channels=last_channel_out,
            out_channels=conv_output_size[i])
        conv_layer_dict["relu " + str(i)] = nn.ReLU()
        # pooling after every convolution seems odd (one would expect to pool only at the end),
        # but this is how the paper describes it
        conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(adaptive_pooling_size[i])
        last_channel_out = conv_output_size[i]

    self.conv_layers = nn.Sequential(conv_layer_dict)

    #self.dropout = nn.Dropout(0)

    self.dense = nn.Linear(
        conv_output_size[-1] * adaptive_pooling_size[-1][0] * adaptive_pooling_size[-1][1],
        out_features=100, bias=True)
    self.dense2 = nn.Linear(100, out_features=10, bias=True)
    self.dense3 = nn.Linear(10, out_features=1, bias=False)
def __init__(self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, num_highway_layers: int, phrase_layer: Seq2SeqEncoder, modeling_layer: Seq2SeqEncoder, span_end_encoder: Seq2SeqEncoder, dropout: float = 0.2, mask_lstms: bool = True, initializer: InitializerApplicator = InitializerApplicator(), regularizer: RegularizerApplicator = RegularizerApplicator()): super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer) self._text_field_embedder = text_field_embedder self._highway_layer = TimeDistributed( Highway(text_field_embedder.get_output_dim(), num_highway_layers)) self._phrase_layer = phrase_layer self._matrix_attention = CosineMatrixAttention() self._modeling_layer = modeling_layer self._span_end_encoder = span_end_encoder encoding_dim = phrase_layer.get_output_dim() modeling_dim = modeling_layer.get_output_dim() span_start_input_dim = encoding_dim * 4 + modeling_dim self._span_start_predictor = TimeDistributed( torch.nn.Linear(span_start_input_dim, 1)) span_end_encoding_dim = span_end_encoder.get_output_dim() span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim self._span_end_predictor = TimeDistributed( torch.nn.Linear(span_end_input_dim, 1)) # Bidaf has lots of layer dimensions which need to match up - these aren't necessarily # obvious from the configuration files, so we check here. check_dimensions_match(modeling_layer.get_input_dim(), 4 * encoding_dim, "modeling layer input dim", "4 * encoding dim") check_dimensions_match(text_field_embedder.get_output_dim(), phrase_layer.get_input_dim(), "text field embedder output dim", "phrase layer input dim") check_dimensions_match(span_end_encoder.get_input_dim(), 4 * encoding_dim + 3 * modeling_dim, "span end encoder input dim", "4 * encoding dim + 3 * modeling dim") self._span_start_accuracy = CategoricalAccuracy() self._span_end_accuracy = CategoricalAccuracy() self._span_accuracy = BooleanAccuracy() self._squad_metrics = SquadEmAndF1() if dropout > 0: self._dropout = torch.nn.Dropout(p=dropout) else: self._dropout = lambda x: x self._mask_lstms = mask_lstms initializer(self)
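# Hedged sketch (illustrative, not this model's forward()) of how the similarity matrix
# from `_matrix_attention` is commonly turned into BiDAF's passage-to-question attention:
# a masked softmax over question terms, then a weighted sum of question encodings for
# every passage position.
def c2q_attention_sketch(self, passage_enc, question_enc, question_mask):
    # passage_enc: (batch, p_len, dim), question_enc: (batch, q_len, dim), question_mask: (batch, q_len)
    similarity = self._matrix_attention(passage_enc, question_enc)      # (batch, p_len, q_len)
    similarity = similarity.masked_fill(question_mask.unsqueeze(1) == 0, -1e32)
    attention = torch.softmax(similarity, dim=-1)                       # attend over question terms
    return torch.bmm(attention, question_enc)                           # (batch, p_len, dim)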
def __init__(self, n_kernels: int):
    super(KNRM, self).__init__()

    # static - kernel size & magnitude variables
    self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.cuda.FloatTensor(self.kernel_sigmas(n_kernels)),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()

    # bias is set to True in the original code (we found it does not help - how could it?)
    self.dense = nn.Linear(n_kernels, 1, bias=False)

    # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
    torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)  # inits taken from matchzoo
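# The constructors above call `self.kernel_mus` / `self.kernel_sigmas`, which are not part
# of this snippet. A typical KNRM-style definition looks like the sketch below (an
# assumption based on the common reference implementation, meant to live as methods on the
# model class): one exact-match kernel at mu = 1.0 with a near-zero sigma, plus
# (n_kernels - 1) kernels evenly spread over the cosine range [-1, 1].
def kernel_mus(self, n_kernels: int):
    mus = [1.0]                                  # exact-match kernel
    if n_kernels == 1:
        return mus
    bin_size = 2.0 / (n_kernels - 1)             # similarity range is [-1, 1]
    mus.append(1.0 - bin_size / 2.0)             # first soft-match kernel center
    for i in range(1, n_kernels - 1):
        mus.append(mus[i] - bin_size)
    return mus

def kernel_sigmas(self, n_kernels: int):
    sigmas = [0.001]                             # near-delta for the exact-match kernel
    if n_kernels == 1:
        return sigmas
    return sigmas + [0.1] * (n_kernels - 1)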
def __init__(self, word_embeddings: TextFieldEmbedder, n_grams: int, n_kernels: int, conv_out_dim: int):
    super(Conv_KNRM, self).__init__()

    self.word_embeddings = word_embeddings

    # static - kernel size & magnitude variables
    self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    # one 1-dimensional CNN per n-gram size, each followed by a ReLU activation
    self.convolutions = []
    for i in range(1, n_grams + 1):
        self.convolutions.append(nn.Sequential(
            nn.ConstantPad1d((0, i - 1), 0),
            # the kernel size of the convolution equals the current i-gram size (uni, bi, tri, ...)
            nn.Conv1d(kernel_size=i,
                      in_channels=word_embeddings.get_output_dim(),
                      out_channels=conv_out_dim),
            nn.ReLU()))
    # register convs as part of the model
    self.convolutions = nn.ModuleList(self.convolutions)

    # cosine similarity matrix
    self.cosine_module = CosineMatrixAttention()

    # final linear scoring layer:
    # input size = number of soft-TF features = n_kernels * n_grams * n_grams,
    # i.e. all combinations of match matrices between query and document n-gram embeddings;
    # output is a single score; bias follows the paper formula (True by default, stated explicitly here)
    self.transform = nn.Linear(in_features=n_kernels * n_grams * n_grams, out_features=1, bias=True)
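# Hedged sketch (illustration only, not this class's forward()) of Conv-KNRM's
# cross-matching: each query n-gram representation is matched against each document
# n-gram representation, every match matrix is kernel-pooled, and the concatenated
# soft-TF features feed `self.transform`.
def conv_knrm_score_sketch(self, query_emb, doc_emb, query_mask, doc_mask):
    # Conv1d expects (batch, emb_dim, seq_len); padding keeps the sequence length unchanged
    query_grams = [conv(query_emb.transpose(1, 2)).transpose(1, 2) for conv in self.convolutions]
    doc_grams = [conv(doc_emb.transpose(1, 2)).transpose(1, 2) for conv in self.convolutions]

    features = []
    for q in query_grams:
        for d in doc_grams:
            cosine = self.cosine_module.forward(q, d).unsqueeze(-1)
            kernels = torch.exp(-torch.pow(cosine - self.mu, 2) / (2 * torch.pow(self.sigma, 2)))
            kernels = kernels * doc_mask.unsqueeze(1).unsqueeze(-1)
            soft_tf = torch.log(torch.clamp(kernels.sum(dim=2), min=1e-10)) * 0.01
            features.append((soft_tf * query_mask.unsqueeze(-1)).sum(dim=1))  # (batch, n_kernels)

    return self.transform(torch.cat(features, dim=-1)).squeeze(-1)            # (batch,)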
def __init__(self,
             conv_output_size: List[int],
             conv_kernel_size: List[Tuple[int, int]],
             adaptive_pooling_size: List[Tuple[int, int]]):
    super(MatchPyramid, self).__init__()

    self.cosine_module = CosineMatrixAttention()

    if len(conv_output_size) != len(conv_kernel_size) or len(conv_output_size) != len(adaptive_pooling_size):
        raise Exception("conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length")

    conv_layer_dict = OrderedDict()
    last_channel_out = 1
    for i in range(len(conv_output_size)):
        conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
            (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1), 0)
        conv_layer_dict["conv " + str(i)] = nn.Conv2d(
            kernel_size=conv_kernel_size[i],
            in_channels=last_channel_out,
            out_channels=conv_output_size[i])
        conv_layer_dict["relu " + str(i)] = nn.ReLU()
        conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(adaptive_pooling_size[i])
        last_channel_out = conv_output_size[i]

    self.conv_layers = nn.Sequential(conv_layer_dict)

    self.dense = nn.Linear(
        conv_output_size[-1] * adaptive_pooling_size[-1][0] * adaptive_pooling_size[-1][1],
        out_features=100, bias=True)
    self.dense2 = nn.Linear(100, out_features=10, bias=True)
    self.dense3 = nn.Linear(10, out_features=1, bias=False)
def __init__(self, word_embeddings: TextFieldEmbedder, vocab: Vocabulary,
             lstm_hidden_dim: int, top_k: int, cuda_device: int) -> None:
    super().__init__(vocab)

    self.word_embeddings = word_embeddings

    self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim,
                             batch_first=True, bidirectional=True)
    self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim,
                           batch_first=True, bidirectional=True)

    # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()

    self.top_k = top_k

    self.dense = nn.Linear(top_k, out_features=20, bias=True)
    self.dense2 = nn.Linear(20, out_features=20, bias=True)
    self.dense3 = nn.Linear(20, out_features=1, bias=False)
def __init__(self, vocab: Vocabulary, char_embedder: TextFieldEmbedder, word_embedder: TextFieldEmbedder, tokens_encoder: Seq2SeqEncoder, model_args, inp_drop_rate: float = 0.5, initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None) -> None: """ :param vocab: vocabulary from train and dev dataset :param char_embedder: character embedding + cnn encoder :param word_embedder: word embedding :param tokens_encoder: Bi-LSTM backbone for split :param model_args: model arguments :param inp_drop_rate: input dropout rate """ super(FollowUpSnippetModel, self).__init__(vocab, regularizer) self.tokens_encoder = tokens_encoder self.projection_layer = torch.nn.Linear( in_features=word_embedder.get_output_dim() + 1 + char_embedder.get_output_dim(), out_features=self.tokens_encoder.get_input_dim(), bias=False) # integer to mark field, 0 or 1 self.num_classes = 2 self.num_conflicts = 2 self._non_linear = torch.nn.PReLU() self.hidden_size = int(self.tokens_encoder.get_output_dim() / 2) self.policy_net = PolicyNet(self.tokens_encoder.get_output_dim() * 3, self.num_classes) self.token_field_embedding = word_embedder self.char_field_embedding = char_embedder self._scaled_value = 1.0 self._self_attention = CosineMatrixAttention() self.margin_loss = MarginRankingLoss(margin=model_args.margin) # calculate span similarity self.cosine_similar = CosineSimilarity(dim=0) if inp_drop_rate > 0: self._variational_dropout = InputVariationalDropout(p=inp_drop_rate) else: self._variational_dropout = lambda x: x self.metrics = { "bleu": BLEUScore(), "reward": RewardScore(), "symbol": SymbolScore(), "reward_var": RewardScore(), "overall": RewardScore() } initializer(self)
class MV_LSTM(Model): ''' Paper: A Deep Architecture for Semantic Matching with Multiple Positional Sentence Representations, Wan et al., AAAI'16 Reference code (paper author): https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/mvlstm.py (but in tensorflow) ''' def __init__(self, word_embeddings: TextFieldEmbedder, vocab: Vocabulary, lstm_hidden_dim: int, top_k: int, cuda_device: int) -> None: super().__init__(vocab) self.word_embeddings = word_embeddings self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True) self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True) # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() self.top_k = top_k self.dense = nn.Linear(top_k, out_features=20, bias=True) self.dense2 = nn.Linear(20, out_features=20, bias=True) self.dense3 = nn.Linear(20, out_features=1, bias=False) def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor], query_length: torch.Tensor, document_length: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ # # prepare embedding tensors & paddings masks # ------------------------------------------------------- # we assume 1 is the unknown token, 0 is padding - both need to be removed if len(query["tokens"].shape) == 2: # (embedding lookup matrix) # shape: (batch, query_max) query_pad_oov_mask = (query["tokens"] > 1).float() # shape: (batch, doc_max) document_pad_oov_mask = (document["tokens"] > 1).float() else: # == 3 (elmo characters per word) # shape: (batch, query_max) query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float() # shape: (batch, doc_max) document_pad_oov_mask = (torch.sum(document["tokens"], 2) > 0).float() # shape: (batch, query_max,emb_dim) query_embeddings = self.word_embeddings( query) * query_pad_oov_mask.unsqueeze(-1) # shape: (batch, document_max,emb_dim) document_embeddings = self.word_embeddings( document) * document_pad_oov_mask.unsqueeze(-1) # # conextualized rep (via lstms) # ------------------------------------------------------- #hidden_d = torch.randn(()) query_rep, hidden_q = self.query_rep(query_embeddings) document_rep, hidden_d = self.doc_rep(document_embeddings) # # cosine matrix # ------------------------------------------------------- # shape: (batch, query_max, doc_max) cosine_matrix = self.cosine_module.forward(query_rep, document_rep) # # topk pooling # ------------------------------------------------------- cosine_flat = cosine_matrix.view(cosine_matrix.shape[0], -1) top_k_elments = torch.topk(cosine_flat, k=self.top_k, sorted=True)[0] ## ## "MLP" layer ## ------------------------------------------------------- dense_out = F.relu(self.dense(top_k_elments)) dense_out = F.relu(self.dense2(dense_out)) dense_out = self.dense3(dense_out) output = torch.squeeze(dense_out, 1) return output
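# Small self-contained example (hypothetical shapes) of the top-k pooling step used in
# MV_LSTM.forward above: the full query x document cosine matrix is flattened per batch
# entry and only the k strongest interactions feed the MLP.
import torch

cosine_matrix = torch.rand(2, 14, 180) * 2 - 1           # (batch=2, query_max=14, doc_max=180)
cosine_flat = cosine_matrix.view(cosine_matrix.shape[0], -1)
top_k_elements = torch.topk(cosine_flat, k=10, sorted=True)[0]
print(top_k_elements.shape)                               # torch.Size([2, 10])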
def __init__(self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, contextualizer: Seq2SeqEncoder, labeler: Seq2SeqEncoder, projection_size: int, bidirectional: bool = False, use_hypothesis: bool = True, attention: str = "", # "" - none / cosine / bilinear initializer: InitializerApplicator = None, classifier_dir = "", del_perc_lambda = 1, del_perc = 0.3, del_metric_threshold = 0.1, teacher_lambda = 0.0, coverage_lambda = 0.0, transition_lamb = 0.0, gumbel = True, neutral_label = "") -> None: super().__init__(vocab) self._text_field_embedder = text_field_embedder if contextualizer.is_bidirectional() is not bidirectional: raise ConfigurationError( "Bidirectionality of contextualizer must match bidirectionality of " "language model. " f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, " f"language model bidirectional: {bidirectional}") self.classifier_dir = classifier_dir self.classifier = None self.coverage_lambda = coverage_lambda self.del_perc_lambda = del_perc_lambda self.del_perc = del_perc self.teacher_lambda = teacher_lambda self.transition_lamb = transition_lamb self.gumbel = gumbel if classifier_dir != "": overrides = '{"model": {"dropout": 0, "output_feedforward": {"dropout": 0}}}' overrides = "" archive = load_archive(classifier_dir, overrides=overrides) self.classifier = archive.model # Freeze parameters for p in self.classifier.parameters(): p.requires_grad = False # A hack that prevents allennlp from crushing when running extend on all submodules def foo(*x, **y): return 1 self.classifier._text_field_embedder.token_embedder_tokens.extend_vocab = foo self.classifier.eval() # get index of the neutral label self.neutral_ind = self.classifier.vocab.get_token_index(neutral_label, 'labels') self.criterion = torch.nn.CrossEntropyLoss() self._contextualizer = contextualizer self._labeler = labeler self._bidirectional = bidirectional self.use_hypothesis = use_hypothesis self.attention = attention self.projection_size = projection_size # hypothesis aggr self.w_prem = torch.nn.Linear(contextualizer.get_output_dim(), projection_size) if use_hypothesis: self.w_hyp = torch.nn.Linear(contextualizer.get_output_dim(), projection_size) self._contextual_dim = contextualizer.get_output_dim() # The dimension for making predictions just in the forward # (or backward) direction. if self._bidirectional: self._forward_dim = self._contextual_dim // 2 else: self._forward_dim = self._contextual_dim if self.attention: if self.attention == "cosine": self.attention_mat = CosineMatrixAttention() elif self.attention == "bilinear": self.attention_mat = BilinearMatrixAttention(self._forward_dim, self._forward_dim) else: raise ConfigurationError("Undefined attention type") self.mask_linear = torch.nn.Linear(self._labeler.get_output_dim(), 2) self._accuracy = CategoricalAccuracy() self._avg_perc_masked = Average() self._avg_transition = Average() self._acc_vs_del = AccuracyVSDeletion(del_threshold=del_metric_threshold) self._acc_plus_del = AccuracyVSDeletion(del_threshold=0, aggr="sum") self._f1_deletions = F1SequenceMeasure(positive_label=1) if initializer is not None: initializer(self)
class CO_PACRR(nn.Module): ''' Paper: Co-PACRR: A Context-Aware Neural IR Model for Ad-hoc Retrieval, Hui et al., WSDM'18 Reference code (but in tensorflow): * first-hand: https://github.com/khui/copacrr/blob/master/models/pacrr.py differences to pacrr: * (1) context vector (query avg, document rolling window avg pool) * (2) cascade k-max pooling * (3) shuffling query terms at the end ''' @staticmethod def from_config(config, word_embeddings_out_dim): return CO_PACRR( unified_query_length=config["pacrr_unified_query_length"], unified_document_length=config["pacrr_unified_document_length"], max_conv_kernel_size=config["pacrr_max_conv_kernel_size"], conv_output_size=config["pacrr_conv_output_size"], kmax_pooling_size=config["pacrr_kmax_pooling_size"]) def __init__( self, unified_query_length: int, unified_document_length: int, max_conv_kernel_size: int, # 2 to n conv_output_size: int, # conv output channels kmax_pooling_size: int): # per query k-max pooling super(CO_PACRR, self).__init__() self.cosine_module = CosineMatrixAttention() self.unified_query_length = unified_query_length self.unified_document_length = unified_document_length self.convolutions = [] for i in range(2, max_conv_kernel_size + 1): self.convolutions.append( nn.Sequential( nn.ConstantPad2d( (0, i - 1, 0, i - 1), 0 ), # this outputs [batch,1,unified_query_length + i - 1 ,unified_document_length + i - 1] nn.Conv2d( kernel_size=i, in_channels=1, out_channels=conv_output_size ), # this outputs [batch,32,unified_query_length,unified_document_length] nn.MaxPool3d( kernel_size=(conv_output_size, 1, 1) ) # this outputs [batch,1,unified_query_length,unified_document_length] )) self.convolutions = nn.ModuleList( self.convolutions) # register conv as part of the model context_pool_size = 6 self.doc_context_pool = nn.Sequential( nn.ConstantPad1d((0, context_pool_size - 1), 0), nn.AvgPool1d(kernel_size=context_pool_size, stride=1)) self.masked_softmax = MaskedSoftmax() self.kmax_pooling_size = kmax_pooling_size kmax_pooling_view_percent = [0.25, 0.5, 0.75, 1] self.kmax_pooling_views = [ int(unified_document_length * x) for x in kmax_pooling_view_percent ] self.dense = nn.Linear(len(self.kmax_pooling_views) * 2 * kmax_pooling_size * unified_query_length * max_conv_kernel_size, out_features=100, bias=True) self.dense2 = nn.Linear(100, out_features=10, bias=True) self.dense3 = nn.Linear(10, out_features=1, bias=False) def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor, query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor, query_idfs: torch.Tensor, document_idfs: torch.Tensor, output_secondary_output: bool = False) -> torch.Tensor: # # similarity matrix # ------------------------------------------------------- # create sim matrix cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) # shape: (batch, 1, query_max, doc_max) for the input of conv_2d cosine_matrix = cosine_matrix[:, None, :, :] # # generate query and doc contexts # ------------------------------------------------------- query_context = torch.mean(query_embeddings, dim=1) document_context = self.doc_context_pool( document_embeddings.transpose(1, 2)).transpose(1, 2) cosine_matrix_context = self.cosine_module.forward( query_context.unsqueeze(dim=1), document_context).squeeze(1) # # duplicate cosine_matrix -> n-gram convolutions, then top-k pooling # ---------------------------------------------- conv_results = [] # # 1x1 cosine matrix (extra without convolutions) # cr_kmax_result = [[], []] for 
view_size in self.kmax_pooling_views: val, idx = torch.topk(cosine_matrix.squeeze(dim=1)[:, :, 0:view_size], k=self.kmax_pooling_size, sorted=True) cr_kmax_result[0].append(val) cr_kmax_result[1].append(idx) cr_kmax_result[0] = torch.cat(cr_kmax_result[0], dim=-1) cr_kmax_result[1] = torch.cat(cr_kmax_result[1], dim=-1) # incorporate context sims here, by selecting them from the kmax of the non-context sims flat_context = cosine_matrix_context.view(-1) index_offset = cr_kmax_result[1] + torch.arange( 0, cr_kmax_result[1].shape[0] * cosine_matrix_context.shape[1], cosine_matrix_context.shape[1], device=cr_kmax_result[1].device).unsqueeze(-1).unsqueeze(-1) selected_context = flat_context.index_select( dim=0, index=index_offset.view(-1)).view(cr_kmax_result[1].shape[0], cr_kmax_result[1].shape[1], -1) conv_results.append( torch.cat([cr_kmax_result[0], selected_context], dim=2)) # # nxn n-gram cosine matrices # for conv in self.convolutions: cr = conv(cosine_matrix) # # (2) take the kmax at multiple views of the cosine matrix - always starting # cr_kmax_result = [[], []] for view_size in self.kmax_pooling_views: val, idx = torch.topk(cr.squeeze(dim=1)[:, :, 0:view_size], k=self.kmax_pooling_size, sorted=True) cr_kmax_result[0].append(val) cr_kmax_result[1].append(idx) cr_kmax_result[0] = torch.cat(cr_kmax_result[0], dim=-1) cr_kmax_result[1] = torch.cat(cr_kmax_result[1], dim=-1) # # (1) incorporate context sims here, by selecting them from the kmax of the non-context sims # flat_context = cosine_matrix_context.view(-1) index_offset = cr_kmax_result[1] + torch.arange( 0, cr_kmax_result[1].shape[0] * cosine_matrix_context.shape[1], cosine_matrix_context.shape[1], device=cr_kmax_result[1].device).unsqueeze(-1).unsqueeze(-1) selected_context = flat_context.index_select( dim=0, index=index_offset.view(-1)).view(cr_kmax_result[1].shape[0], cr_kmax_result[1].shape[1], -1) conv_results.append( torch.cat([cr_kmax_result[0], selected_context], dim=2)) # # flatten all paths together & weight by query idf # ------------------------------------------------------- per_query_results = torch.cat(conv_results, dim=-1) weighted_per_query = per_query_results * self.masked_softmax( query_idfs, query_pad_oov_mask.unsqueeze(-1)) # # (3) shuffle component # if self.training: weighted_per_query = weighted_per_query[:, torch. randperm(weighted_per_query .shape[1]), :] all_flat = per_query_results.view(weighted_per_query.shape[0], -1) # # dense layer # ------------------------------------------------------- dense_out = F.relu(self.dense(all_flat)) dense_out = F.relu(self.dense2(dense_out)) dense_out = self.dense3(dense_out) output = torch.squeeze(dense_out, 1) if output_secondary_output: return output, {} return output def get_param_stats(self): return "CO-PACRR: / "
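# Self-contained toy example (hypothetical sizes) of the flat-index trick used in
# CO_PACRR.forward above to pick context similarities at the k-max positions: each row's
# indices are shifted by row * row_length so a single index_select on the flattened tensor
# gathers per-row values.
import torch

context = torch.rand(3, 8)                                 # (batch, doc_len)
idx = torch.randint(0, 8, (3, 5))                          # k-max indices per batch entry
offset = idx + torch.arange(0, 3 * 8, 8).unsqueeze(-1)     # shift each row into the flat range
selected = context.view(-1).index_select(0, offset.view(-1)).view(3, 5)
assert torch.equal(selected, torch.gather(context, 1, idx))  # equivalent to a batched gather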
class PACRR(nn.Module): ''' Paper: PACRR: A Position-Aware Neural IR Model for Relevance Matching, Hui et al., EMNLP'17 Reference code (but in tensorflow): * first-hand: https://github.com/khui/copacrr/blob/master/models/pacrr.py ''' @staticmethod def from_config(config,word_embeddings_out_dim): return PACRR(unified_query_length=config["pacrr_unified_query_length"], unified_document_length=config["pacrr_unified_document_length"], max_conv_kernel_size=config["pacrr_max_conv_kernel_size"], conv_output_size=config["pacrr_conv_output_size"], kmax_pooling_size=config["pacrr_kmax_pooling_size"]) def __init__(self, unified_query_length:int, unified_document_length:int, max_conv_kernel_size: int, # 2 to n conv_output_size: int, # conv output channels kmax_pooling_size: int): # per query k-max pooling super(PACRR,self).__init__() self.cosine_module = CosineMatrixAttention() self.unified_query_length = unified_query_length self.unified_document_length = unified_document_length self.convolutions = [] for i in range(2, max_conv_kernel_size + 1): self.convolutions.append( nn.Sequential( nn.ConstantPad2d((0,i - 1,0, i - 1), 0), # this outputs [batch,1,unified_query_length + i - 1 ,unified_document_length + i - 1] nn.Conv2d(kernel_size=i, in_channels=1, out_channels=conv_output_size), # this outputs [batch,32,unified_query_length,unified_document_length] nn.MaxPool3d(kernel_size=(conv_output_size,1,1)) # this outputs [batch,1,unified_query_length,unified_document_length] )) self.convolutions = nn.ModuleList(self.convolutions) # register conv as part of the model self.masked_softmax = MaskedSoftmax() self.kmax_pooling_size = kmax_pooling_size self.dense = nn.Linear(kmax_pooling_size * unified_query_length * max_conv_kernel_size, out_features=100, bias=True) self.dense2 = nn.Linear(100, out_features=10, bias=True) self.dense3 = nn.Linear(10, out_features=1, bias=False) def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor, query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor, query_idfs: torch.Tensor, document_idfs: torch.Tensor, output_secondary_output: bool = False) -> torch.Tensor: # # similarity matrix # ------------------------------------------------------- # create sim matrix cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) # shape: (batch, 1, query_max, doc_max) for the input of conv_2d cosine_matrix = cosine_matrix[:,None,:,:] # # duplicate cosine_matrix -> n-gram convolutions, then top-k pooling # ---------------------------------------------- conv_results = [] conv_results.append(torch.topk(cosine_matrix.squeeze(),k=self.kmax_pooling_size,sorted=True)[0]) for conv in self.convolutions: cr = conv(cosine_matrix) cr_kmax_result = torch.topk(cr.squeeze(),k=self.kmax_pooling_size,sorted=True)[0] conv_results.append(cr_kmax_result) # # flatten all paths together & weight by query idf # ------------------------------------------------------- per_query_results = torch.cat(conv_results,dim=-1) weigthed_per_query = per_query_results * self.masked_softmax(query_idfs, query_pad_oov_mask.unsqueeze(-1)) all_flat = per_query_results.view(weigthed_per_query.shape[0],-1) # # dense layer # ------------------------------------------------------- dense_out = F.relu(self.dense(all_flat)) dense_out = F.relu(self.dense2(dense_out)) dense_out = self.dense3(dense_out) output = torch.squeeze(dense_out, 1) return output def get_param_stats(self): return "PACRR: / "
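# Hypothetical usage sketch: the keys below mirror the `from_config` accessor above; the
# concrete values are made up for illustration.
config = {
    "pacrr_unified_query_length": 30,
    "pacrr_unified_document_length": 200,
    "pacrr_max_conv_kernel_size": 3,
    "pacrr_conv_output_size": 32,
    "pacrr_kmax_pooling_size": 5,
}
model = PACRR.from_config(config, word_embeddings_out_dim=300)
# forward() then expects pre-embedded query/document tensors plus padding masks and idf weights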
class MatchPyramid(nn.Module): ''' Paper: Text Matching as Image Recognition, Pang et al., AAAI'16 ''' def __init__( self, #The embedding layer is specified as an AllenNLP TextFieldEmbedder. word_embeddings: TextFieldEmbedder, #the size of output channels conv_output_size: List[int], #the size of input channels conv_kernel_size: List[Tuple[int, int]], # the size of pooling layers to reduce the dimension of the feature maps adaptive_pooling_size: List[Tuple[int, int]]): super(MatchPyramid, self).__init__() self.word_embeddings = word_embeddings self.cosine_module = CosineMatrixAttention() if len(conv_output_size) != len(conv_kernel_size) or len( conv_output_size) != len(adaptive_pooling_size): raise Exception( "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length" ) #define the dictionary of convolution layers conv_layer_dict = OrderedDict() last_channel_out = 1 for i in range(len(conv_output_size)): #pads the input tensor boundaries with a constant value #padding((padding_left, padding_right,padding_bottom),tuple) conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d( (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1), 0) #applies a 2D convolution conv_layer_dict["conv " + str(i)] = nn.Conv2d( kernel_size=conv_kernel_size[i], in_channels=last_channel_out, out_channels=conv_output_size[i]) #applies a ReLU activation function conv_layer_dict["relu " + str(i)] = nn.ReLU() #applies a 2D adaptive max pooling conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d( adaptive_pooling_size[i]) last_channel_out = conv_output_size[i] #add the layers to the model self.conv_layers = nn.Sequential(conv_layer_dict) ##adding FC layers self.dense = nn.Linear(conv_output_size[-1] * adaptive_pooling_size[-1][0] * adaptive_pooling_size[-1][1], out_features=100, bias=True) self.dense2 = nn.Linear(100, out_features=10, bias=True) self.dense3 = nn.Linear(10, out_features=1, bias=False) #initialize weights (values are taken from matchzoo) torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) #initialize biases self.dense.bias.data.fill_(0.0) def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor]) -> torch.Tensor: # # prepare embedding tensors # ------------------------------------------------------- # shape: (batch, query_max) query_pad_oov_mask = (query["tokens"] > 0).float() # shape: (batch, doc_max) document_pad_oov_mask = (document["tokens"] > 0).float() # shape: (batch, query_max,emb_dim) query_embeddings = self.word_embeddings( query) * query_pad_oov_mask.unsqueeze(-1) # shape: (batch, document_max,emb_dim) document_embeddings = self.word_embeddings( document) * document_pad_oov_mask.unsqueeze(-1) #similarity matrix #shape: (batch, 1, query_max, doc_max) for the input of conv_2d cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) cosine_matrix = cosine_matrix[:, None, :, :] #convolution #shape: (batch, conv_output_size, query_max, doc_max) conv_result = self.conv_layers(cosine_matrix) #dynamic pooling #flatten the output of dynamic pooling #shape: (batch, conv_output_size * pool_h * pool_w) conv_result_flat = conv_result.view(conv_result.size(0), -1) # # Learning to rank layer # ------------------------------------------------------- dense_out = F.relu(self.dense(conv_result_flat)) dense_out = F.relu(self.dense2(dense_out)) dense_out = self.dense3(dense_out) output = torch.squeeze(dense_out, 1) return output
class MatchPyramid(nn.Module): ''' Paper: Text Matching as Image Recognition, Pang et al., AAAI'16 Reference code (but in tensorflow): * first-hand: https://github.com/pl8787/MatchPyramid-TensorFlow/blob/master/model/model_mp.py * somewhat-third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/matchpyramid.py ''' def __init__(self, word_embeddings: TextFieldEmbedder, conv_output_size: List[int], conv_kernel_size: List[Tuple[int, int]], adaptive_pooling_size: List[Tuple[int, int]]): super(MatchPyramid, self).__init__() self.word_embeddings = word_embeddings self.cosine_module = CosineMatrixAttention() #self.cosine_module = DotProductMatrixAttention() if len(conv_output_size) != len(conv_kernel_size) or len( conv_output_size) != len(adaptive_pooling_size): raise Exception( "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length" ) conv_layer_dict = OrderedDict() last_channel_out = 1 for i in range(len(conv_output_size)): conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d( (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1), 0) conv_layer_dict["conv " + str(i)] = nn.Conv2d( kernel_size=conv_kernel_size[i], in_channels=last_channel_out, out_channels=conv_output_size[i]) conv_layer_dict["relu " + str(i)] = nn.ReLU() conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d( adaptive_pooling_size[i] ) # this is strange - but so written in the paper # would think only to pool at the end ?? last_channel_out = conv_output_size[i] self.conv_layers = nn.Sequential(conv_layer_dict) #self.dropout = nn.Dropout(0) self.dense = nn.Linear(conv_output_size[-1] * adaptive_pooling_size[-1][0] * adaptive_pooling_size[-1][1], out_features=100, bias=True) self.dense2 = nn.Linear(100, out_features=10, bias=True) self.dense3 = nn.Linear(10, out_features=1, bias=False) # init with small weights, otherwise the dense output is way to high for the tanh -> resulting in loss == 1 all the time #torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo #self.dense.bias.data.fill_(0.0) def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor], query_length: torch.Tensor, document_length: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ # # prepare embedding tensors # ------------------------------------------------------- # we assume 1 is the unknown token, 0 is padding - both need to be removed if len(query["tokens"].shape) == 2: # (embedding lookup matrix) # shape: (batch, query_max) query_pad_oov_mask = (query["tokens"] > 1).float() # shape: (batch, doc_max) document_pad_oov_mask = (document["tokens"] > 1).float() else: # == 3 (elmo characters per word) # shape: (batch, query_max) query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float() # shape: (batch, doc_max) document_pad_oov_mask = (torch.sum(document["tokens"], 2) > 0).float() # shape: (batch, query_max,emb_dim) query_embeddings = self.word_embeddings( query) * query_pad_oov_mask.unsqueeze(-1) # shape: (batch, document_max,emb_dim) document_embeddings = self.word_embeddings( document) * document_pad_oov_mask.unsqueeze(-1) # # similarity matrix # ------------------------------------------------------- cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) # shape: (batch, 1, query_max, doc_max) for the input of conv_2d cosine_matrix = cosine_matrix[:, None, :, :] # # convolution # ------------------------------------------------------- # shape: (batch, conv_output_size, query_max, 
doc_max) conv_result = self.conv_layers(cosine_matrix) # # dynamic pooling # ------------------------------------------------------- # flatten the output of dynamic pooling # shape: (batch, conv_output_size * pool_h * pool_w) conv_result_flat = conv_result.view(conv_result.size(0), -1) #conv_result_flat = self.dropout(conv_result_flat) # # Learning to rank layer # ------------------------------------------------------- dense_out = F.relu(self.dense(conv_result_flat)) dense_out = F.relu(self.dense2(dense_out)) dense_out = self.dense3(dense_out) #tanh_out = torch.tanh(dense_out) output = torch.squeeze(dense_out, 1) return output
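# Self-contained shape walk-through (hypothetical hyper-parameters) of the padding /
# convolution / adaptive-pooling stack built in MatchPyramid.__init__ above, applied to a
# random "cosine matrix" instead of real embeddings.
import torch
import torch.nn as nn

stack = nn.Sequential(
    nn.ConstantPad2d((0, 2, 0, 2), 0),             # pad right/bottom for a 3x3 kernel
    nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3)),
    nn.ReLU(),
    nn.AdaptiveMaxPool2d((10, 20)))                # dynamic pooling to a fixed grid

fake_cosine = torch.rand(2, 1, 14, 180)            # (batch, 1, query_max, doc_max)
out = stack(fake_cosine)
print(out.shape)                                   # torch.Size([2, 16, 10, 20])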
def __init__( self, #The embedding layer is specified as an AllenNLP TextFieldEmbedder. word_embeddings: TextFieldEmbedder, #the size of output channels conv_output_size: List[int], #the size of input channels conv_kernel_size: List[Tuple[int, int]], # the size of pooling layers to reduce the dimension of the feature maps adaptive_pooling_size: List[Tuple[int, int]]): super(MatchPyramid, self).__init__() self.word_embeddings = word_embeddings self.cosine_module = CosineMatrixAttention() if len(conv_output_size) != len(conv_kernel_size) or len( conv_output_size) != len(adaptive_pooling_size): raise Exception( "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length" ) #define the dictionary of convolution layers conv_layer_dict = OrderedDict() last_channel_out = 1 for i in range(len(conv_output_size)): #pads the input tensor boundaries with a constant value #padding((padding_left, padding_right,padding_bottom),tuple) conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d( (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1), 0) #applies a 2D convolution conv_layer_dict["conv " + str(i)] = nn.Conv2d( kernel_size=conv_kernel_size[i], in_channels=last_channel_out, out_channels=conv_output_size[i]) #applies a ReLU activation function conv_layer_dict["relu " + str(i)] = nn.ReLU() #applies a 2D adaptive max pooling conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d( adaptive_pooling_size[i]) last_channel_out = conv_output_size[i] #add the layers to the model self.conv_layers = nn.Sequential(conv_layer_dict) ##adding FC layers self.dense = nn.Linear(conv_output_size[-1] * adaptive_pooling_size[-1][0] * adaptive_pooling_size[-1][1], out_features=100, bias=True) self.dense2 = nn.Linear(100, out_features=10, bias=True) self.dense3 = nn.Linear(10, out_features=1, bias=False) #initialize weights (values are taken from matchzoo) torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) #initialize biases self.dense.bias.data.fill_(0.0)
class TKL_sigir20(nn.Module): ''' TKL is a neural IR model for long documents ''' @staticmethod def from_config(config,word_embeddings_out_dim): return TKL_sigir20(word_embeddings_out_dim, kernels_mu = config["tk_kernels_mu"], kernels_sigma = config["tk_kernels_sigma"], att_heads = config["tk_att_heads"], att_layer = config["tk_att_layer"], att_proj_dim = config["tk_att_proj_dim"], att_ff_dim = config["tk_att_ff_dim"], max_length = config["max_doc_length"], use_pos_encoding = config["tk_use_pos_encoding"], use_diff_posencoding = config["tk_use_diff_posencoding"], saturation_type= config["tk_saturation_type"], ) def __init__(self, _embsize:int, kernels_mu: List[float], kernels_sigma: List[float], att_heads: int, att_layer: int, att_proj_dim: int, att_ff_dim: int, max_length, use_pos_encoding, use_diff_posencoding, saturation_type, ): super(TKL_sigir20, self).__init__() n_kernels = len(kernels_mu) self.use_pos_encoding = use_pos_encoding self.use_diff_posencoding = use_diff_posencoding self.re_use_encoding = True self.chunk_size = 40 self.overlap = 5 self.extended_chunk_size = self.chunk_size + 2 * self.overlap self.sliding_window_size = 30 self.top_k_chunks = 3 self.use_idf_sat = saturation_type == "idf" self.use_embedding_sat = saturation_type == "embedding" self.use_linear_sat = saturation_type == "linear" self.use_log_sat = saturation_type == "log" if len(kernels_mu) != len(kernels_sigma): raise Exception("len(kernels_mu) != len(kernels_sigma)") # static - kernel size & magnitude variables self.mu = nn.Parameter(torch.cuda.FloatTensor(kernels_mu), requires_grad=False)#.view(1, 1, 1, n_kernels) self.sigma = nn.Parameter(torch.cuda.FloatTensor(kernels_sigma), requires_grad=False)#.view(1, 1, 1, n_kernels) #self.mu.data.requires_grad=True #self.sigma.data.requires_grad=True pos_f = self.get_positional_features(_embsize, 30) #max_timescale=100000 pos_f.requires_grad = True self.positional_features_q = nn.Parameter(pos_f) self.positional_features_q.requires_grad = True if self.use_diff_posencoding == True: pos_f = self.get_positional_features(_embsize,2000+500+self.extended_chunk_size)[:,500:,:].clone() #max_timescale=100000 pos_f.requires_grad = True self.positional_features_d = nn.Parameter(pos_f) self.positional_features_d.requires_grad = True else: self.positional_features_d = self.positional_features_q self.mixer = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True)) self.mixer_sat = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True)) #self.emb_reducer = nn.Linear(_embsize, 300, bias=True) encoder_layer = nn.TransformerEncoderLayer(_embsize, att_heads, dim_feedforward=att_ff_dim, dropout=0) self.contextualizer = nn.TransformerEncoder(encoder_layer, att_layer, norm=None) # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() self.saturation_linear = nn.Linear(2, 1, bias=True) torch.nn.init.constant_(self.saturation_linear.bias, 100) torch.nn.init.uniform_(self.saturation_linear.weight, -0.014, 0.014) self.saturation_linear2 = nn.Linear(2, 1, bias=True) torch.nn.init.constant_(self.saturation_linear2.bias, 100) torch.nn.init.uniform_(self.saturation_linear2.weight, -0.014, 0.014) self.saturation_linear3 = nn.Linear(2, 1, bias=True) torch.nn.init.constant_(self.saturation_linear3.bias, 100) torch.nn.init.uniform_(self.saturation_linear3.weight, -0.014, 0.014) self.sat_normer = nn.LayerNorm(2,elementwise_affine=True) #self.sat_emb_reduce1 = 
nn.Linear(_embsize,_embsize, bias=False) self.sat_emb_reduce1 = nn.Linear(_embsize, 1, bias=False) #torch.nn.init.constant_(self.sat_emb_reduce1.bias, 2) self.kernel_mult = nn.Parameter(torch.full([4,1,1,1,n_kernels], 1, dtype=torch.float32, requires_grad=True)) #self.length_normer = nn.Parameter(torch.full([1,1,1,1], 30, dtype=torch.float32, requires_grad=True)) #self.max_chunks = int(max_length / self.chunk_size + 1) self.chunk_scoring = nn.Parameter(torch.full([1,self.top_k_chunks*5], 1, dtype=torch.float32, requires_grad=True)) self.mixer_end = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True)) self.dense = nn.Linear(n_kernels, 1, bias=False) torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor, query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor, query_idfs: torch.Tensor, document_idfs: torch.Tensor, output_secondary_output: bool = False) -> torch.Tensor: # pylint: disable=arguments-differ # # contextualization # ------------------------------------------------------- query_embeddings_original = query_embeddings query_embeddings, query_embeddings_tf_output = self.forward_representation(query_embeddings, query_pad_oov_mask, self.positional_features_q[:,:query_embeddings.shape[1],:]) if document_pad_oov_mask.shape[1] > self.overlap: needed_padding = self.extended_chunk_size - ((document_pad_oov_mask.shape[1] - self.overlap) % self.chunk_size) else: needed_padding = self.extended_chunk_size - self.overlap - document_pad_oov_mask.shape[1] document_embeddings = nn.functional.pad(document_embeddings,(0,0,self.overlap, needed_padding)) document_pad_oov_mask = nn.functional.pad(document_pad_oov_mask,(self.overlap, needed_padding)) chunked_docs = document_embeddings.unfold(1,self.extended_chunk_size,self.chunk_size).transpose(-1,-2) chunked_pad = document_pad_oov_mask.unfold(1,self.extended_chunk_size,self.chunk_size) batch_size = chunked_docs.shape[0] chunk_pieces = chunked_docs.shape[1] chunked_docs2=chunked_docs.reshape(-1,self.extended_chunk_size,document_embeddings.shape[-1]) chunked_pad2=chunked_pad.reshape(-1,self.extended_chunk_size) packed_indices = chunked_pad2[:,self.overlap:-self.overlap].sum(-1) != 0 documents_packed = chunked_docs2[packed_indices] padding_packed = chunked_pad2[packed_indices] if self.re_use_encoding: document_pos_encoding = self.positional_features_d[:,:documents_packed.shape[1],:] else: document_pos_encoding = self.positional_features_d[:,:document_embeddings.shape[1],:] document_pos_encoding = document_pos_encoding.unfold(1,self.extended_chunk_size,self.chunk_size).transpose(-1,-2) document_pos_encoding = document_pos_encoding.squeeze(0) document_pos_encoding = document_pos_encoding.repeat(document_embeddings.shape[0],1,1)[packed_indices] documents_packed,_ = self.forward_representation(documents_packed, padding_packed, document_pos_encoding) documents_unique_again = documents_packed[:,self.overlap:-self.overlap,:] document_mask_packed_unique = padding_packed[:,self.overlap:-self.overlap] # # cosine matrix # ------------------------------------------------------- packed_query_embeddings = query_embeddings.unsqueeze(1).expand(-1,chunk_pieces,-1,-1).reshape(-1,query_embeddings.shape[1],query_embeddings.shape[-1])[packed_indices] packed_query_mask = query_pad_oov_mask.unsqueeze(1).expand(-1,chunk_pieces,-1).reshape(-1,query_embeddings.shape[1])[packed_indices] # shape: (batch, query_max, doc_max) 
cosine_matrix = self.cosine_module.forward(packed_query_embeddings, documents_unique_again) # # gaussian kernels & soft-TF # # first run through kernel, then sum on doc dim then sum on query dim # ------------------------------------------------------- cosine_matrix_extradim = cosine_matrix.unsqueeze(-1) raw_kernel_results = torch.exp(- torch.pow(cosine_matrix_extradim - self.mu.view(1, 1, 1, -1), 2) / (2 * torch.pow(self.sigma.view(1, 1, 1, -1), 2))) kernel_results_masked = raw_kernel_results * document_mask_packed_unique.unsqueeze(1).unsqueeze(-1) kerne_activations_per_doc = torch.zeros((chunked_docs2.shape[0],query_embeddings.shape[1],documents_unique_again.shape[1],kernel_results_masked.shape[-1]), dtype=chunked_docs2.dtype, layout=chunked_docs2.layout, device=chunked_docs2.device) kerne_activations_per_doc[packed_indices] = kernel_results_masked kerne_activations_per_doc = kerne_activations_per_doc.transpose(1,2).reshape(batch_size,-1,query_embeddings.shape[1],kernel_results_masked.shape[-1]).transpose(2,1) # # kernel-pooling # ------------------------------------------------------- if kerne_activations_per_doc.shape[2] < self.sliding_window_size: kerne_activations_per_doc = nn.functional.pad(kerne_activations_per_doc,(0,0,0, self.sliding_window_size - kerne_activations_per_doc.shape[2])) unrolled_kernel_activations = kerne_activations_per_doc.unfold(2,self.sliding_window_size,2).transpose(-1,-2) unrolled_kernel_activation_lengths = torch.sum(unrolled_kernel_activations.sum(dim=-1) != 0,dim=-1) per_kernel_query = torch.sum(unrolled_kernel_activations, -2) if self.use_idf_sat: sat_influencer = torch.cat([torch.relu(query_idfs.expand_as(unrolled_kernel_activation_lengths).unsqueeze(-1)), unrolled_kernel_activation_lengths.float().unsqueeze(-1)],dim=-1) sat1 = self.saturation_linear(sat_influencer) sat2 = 1 / self.saturation_linear2(sat_influencer) sat3 = self.saturation_linear3(sat_influencer) sat_per_kernel_query = sat1 * (torch.clamp(per_kernel_query, min=1e-10) ** sat2) - sat3 elif self.use_embedding_sat: sat_influencer = torch.cat([self.sat_emb_reduce1(query_embeddings).expand_as(unrolled_kernel_activation_lengths).unsqueeze(-1), unrolled_kernel_activation_lengths.float().unsqueeze(-1)],dim=-1) sat_influencer = self.sat_normer(sat_influencer) sat1 = self.saturation_linear(sat_influencer) sat2 = 1 / self.saturation_linear2(sat_influencer) sat3 = self.saturation_linear3(sat_influencer) sat_per_kernel_query = sat1 * (torch.clamp(per_kernel_query, min=1e-10) ** sat2) - sat3 elif self.use_linear_sat: sat_influencer = torch.cat([torch.relu(query_idfs.expand_as(unrolled_kernel_activation_lengths).unsqueeze(-1)), unrolled_kernel_activation_lengths.float().unsqueeze(-1)],dim=-1) sat1 = self.saturation_linear(sat_influencer) sat2 = self.saturation_linear2(sat_influencer) sat_per_kernel_query = sat1 * torch.clamp(per_kernel_query, min=1e-10) + sat2 elif self.use_log_sat: sat_per_kernel_query = torch.log(torch.clamp(per_kernel_query * self.kernel_mult[0], min=1e-10)) sat_per_kernel_query = sat_per_kernel_query * query_pad_oov_mask.unsqueeze(-1).unsqueeze(-1) * (unrolled_kernel_activation_lengths > 0).float().unsqueeze(-1) # make sure we mask out padding values per_kernel = torch.sum(sat_per_kernel_query, 1) dense_out = self.dense(per_kernel) score = dense_out.squeeze(-1) if score.shape[1] < self.top_k_chunks: score = nn.functional.pad(score,(0, self.top_k_chunks - score.shape[1])) score[score == 0] = -9900 orig_score = score # # argmax top-n hills # top_non_overlapping_idx = 
torch.zeros((orig_score.shape[0],self.top_k_chunks), dtype=torch.long, device=orig_score.device) max_per_region_score = orig_score.clone() r = torch.arange(max_per_region_score.shape[1],device=max_per_region_score.device) for c in range(0,self.top_k_chunks): best_index = torch.argmax(max_per_region_score,dim=1) top_non_overlapping_idx[:,c] = best_index region_pool = torch.abs(r - best_index.unsqueeze(-1)) < self.sliding_window_size / 2 max_per_region_score[region_pool] = -10001 - c top_non_overlapping_idx_neighbors = torch.cat([top_non_overlapping_idx,top_non_overlapping_idx - 1,top_non_overlapping_idx + 1,top_non_overlapping_idx - 2,top_non_overlapping_idx + 2],dim=1) top_non_overlapping_idx_neighbors[top_non_overlapping_idx_neighbors < 0] = 0 top_non_overlapping_idx_neighbors[top_non_overlapping_idx_neighbors >= orig_score.shape[1]] = orig_score.shape[1] - 1 topk_indices_flat = (top_non_overlapping_idx_neighbors + torch.arange(0,orig_score.shape[0]*orig_score.shape[1],orig_score.shape[1],device=orig_score.device).unsqueeze(-1)).view(-1) top_k_non_overlapping = orig_score.view(-1).index_select(0,topk_indices_flat).view(top_non_overlapping_idx.shape[0],-1) top_k_non_overlapping[top_k_non_overlapping <= -9900] = 0 orig_score[orig_score <= -9900] = 0 score = (top_k_non_overlapping * self.chunk_scoring).sum(dim=1) if output_secondary_output: query_mean_vector = query_embeddings.sum(dim=1) / query_pad_oov_mask.sum(dim=1).unsqueeze(-1) sat_influence_from_top_k = sat_influencer.transpose(1,2).reshape(-1,query_embeddings.shape[1],2).index_select(0,topk_indices_flat).view(top_non_overlapping_idx_neighbors.shape[0],top_non_overlapping_idx_neighbors.shape[1],query_embeddings.shape[1],2) return score, {"score":score,"orig_score":orig_score,"top_non_overlapping_idx":top_non_overlapping_idx,"orig_doc_len":document_pad_oov_mask.sum(dim=-1),"top_k_non_overlapping":top_k_non_overlapping,"sat_influence_from_top_k":sat_influence_from_top_k, "total_chunks":chunked_docs2.shape[0],"packed_chunks":documents_packed.shape[0]} else: return score def forward_representation(self, sequence_embeddings: torch.Tensor, sequence_mask: torch.Tensor, positional_features=None) -> torch.Tensor: pos_sequence = sequence_embeddings if self.use_pos_encoding: if positional_features is None: positional_features = self.positional_features_d[:,:sequence_embeddings.shape[1],:] pos_sequence = sequence_embeddings + positional_features sequence_embeddings_context = self.contextualizer((pos_sequence).transpose(1,0),src_key_padding_mask=~sequence_mask.bool()).transpose(1,0) sequence_embeddings = (self.mixer * sequence_embeddings + (1 - self.mixer) * sequence_embeddings_context) * sequence_mask.unsqueeze(-1) return sequence_embeddings,sequence_embeddings_context def get_positional_features(self,dimensions, max_length, min_timescale: float = 1.0, max_timescale: float = 1.0e4): # pylint: disable=line-too-long """ Implements the frequency-based positional encoding described in `Attention is all you Need <https://www.semanticscholar.org/paper/Attention-Is-All-You-Need-Vaswani-Shazeer/0737da0767d77606169cbf4187b83e1ab62f6077>`_ . Adds sinusoids of different frequencies to a ``Tensor``. A sinusoid of a different frequency and phase is added to each dimension of the input ``Tensor``. This allows the attention heads to use absolute and relative positions. The number of timescales is equal to hidden_dim / 2 within the range (min_timescale, max_timescale). 
For each timescale, the two sinusoidal signals sin(timestep / timescale) and cos(timestep / timescale) are generated and concatenated along the hidden_dim dimension. Parameters ---------- tensor : ``torch.Tensor`` a Tensor with shape (batch_size, timesteps, hidden_dim). min_timescale : ``float``, optional (default = 1.0) The smallest timescale to use. max_timescale : ``float``, optional (default = 1.0e4) The largest timescale to use. Returns ------- The input tensor augmented with the sinusoidal frequencies. """ timesteps=max_length hidden_dim = dimensions timestep_range = self.get_range_vector(timesteps, 0).data.float() # We're generating both cos and sin frequencies, # so half for each. num_timescales = hidden_dim // 2 timescale_range = self.get_range_vector(num_timescales, 0).data.float() log_timescale_increments = math.log(float(max_timescale) / float(min_timescale)) / float(num_timescales - 1) inverse_timescales = min_timescale * torch.exp(timescale_range * -log_timescale_increments) # Broadcasted multiplication - shape (timesteps, num_timescales) scaled_time = timestep_range.unsqueeze(1) * inverse_timescales.unsqueeze(0) # shape (timesteps, 2 * num_timescales) sinusoids = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 1) if hidden_dim % 2 != 0: # if the number of dimensions is odd, the cos and sin # timescales had size (hidden_dim - 1) / 2, so we need # to add a row of zeros to make up the difference. sinusoids = torch.cat([sinusoids, sinusoids.new_zeros(timesteps, 1)], 1) return sinusoids.unsqueeze(0) def get_range_vector(self, size: int, device: int) -> torch.Tensor: """ Returns a range vector with the desired size, starting at 0. The CUDA implementation is meant to avoid copy data from CPU to GPU. """ if device > -1: return torch.cuda.LongTensor(size, device=device).fill_(1).cumsum(0) - 1 else: return torch.arange(0, size, dtype=torch.long) def get_param_stats(self): #" b: "+str(self.dense.bias.data) +\ "b: "+str(self.dense_mean.bias.data) +#"scaler: "+str(self.nn_scaler.data) +\ # " bias: " +str(self.saturation_linear.bias.data) +\ return "TK: dense w: "+str(self.dense.weight.data) +\ " self.chunk_scoring: " +str(self.chunk_scoring.data) +\ " self.kernel_mult: " +str(self.kernel_mult.data) +\ " self.saturation_linear: " +str(self.saturation_linear.weight.data) + " bias: " +str(self.saturation_linear.bias.data) +\ " self.saturation_linear2: " +str(self.saturation_linear2.weight.data) + " bias: " +str(self.saturation_linear2.bias.data) +\ " self.saturation_linear3: " +str(self.saturation_linear3.weight.data) + " bias: " +str(self.saturation_linear3.bias.data) +\ "mixer: "+str(self.mixer.data) #+ "mixer_end: "+str(self.mixer_end.data) def get_param_secondary(self): return {"dense_weight":self.dense.weight, "saturation_linear_weight":self.saturation_linear.weight, "saturation_linear_bias":self.saturation_linear.bias, "saturation_linear2_weight":self.saturation_linear2.weight, "saturation_linear2_bias":self.saturation_linear2.bias, "saturation_linear3_weight":self.saturation_linear3.weight, "saturation_linear3_bias":self.saturation_linear3.bias, "chunk_scoring":self.chunk_scoring, "kernel_mult":self.kernel_mult, "mixer":self.mixer}
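# ---------------------------------------------------------------------------
# Illustration only (a standalone, module-level sketch, not part of the model):
# the sinusoidal positional encoding built by get_positional_features() above,
# assuming an even hidden dimension. Shapes and the sin/cos concatenation mirror
# the method; the function name and example values are made up.
# ---------------------------------------------------------------------------
import math
import torch

def sketch_positional_features(dimensions: int, max_length: int,
                               min_timescale: float = 1.0,
                               max_timescale: float = 1.0e4) -> torch.Tensor:
    num_timescales = dimensions // 2
    timesteps = torch.arange(max_length, dtype=torch.float)
    timescale_idx = torch.arange(num_timescales, dtype=torch.float)
    log_increment = math.log(max_timescale / min_timescale) / (num_timescales - 1)
    inverse_timescales = min_timescale * torch.exp(timescale_idx * -log_increment)
    # broadcasted outer product: (max_length, num_timescales)
    scaled_time = timesteps.unsqueeze(1) * inverse_timescales.unsqueeze(0)
    # concat sin and cos -> (max_length, dimensions), then add a batch dim
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).unsqueeze(0)

# e.g. sketch_positional_features(512, 30).shape == torch.Size([1, 30, 512])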
def __init__(self, _embsize:int, kernels_mu: List[float], kernels_sigma: List[float], att_heads: int, att_layer: int, att_proj_dim: int, att_ff_dim: int, max_length, use_pos_encoding, use_diff_posencoding, saturation_type, ): super(TKL_sigir20, self).__init__() n_kernels = len(kernels_mu) self.use_pos_encoding = use_pos_encoding self.use_diff_posencoding = use_diff_posencoding self.re_use_encoding = True self.chunk_size = 40 self.overlap = 5 self.extended_chunk_size = self.chunk_size + 2 * self.overlap self.sliding_window_size = 30 self.top_k_chunks = 3 self.use_idf_sat = saturation_type == "idf" self.use_embedding_sat = saturation_type == "embedding" self.use_linear_sat = saturation_type == "linear" self.use_log_sat = saturation_type == "log" if len(kernels_mu) != len(kernels_sigma): raise Exception("len(kernels_mu) != len(kernels_sigma)") # static - kernel size & magnitude variables self.mu = nn.Parameter(torch.cuda.FloatTensor(kernels_mu), requires_grad=False)#.view(1, 1, 1, n_kernels) self.sigma = nn.Parameter(torch.cuda.FloatTensor(kernels_sigma), requires_grad=False)#.view(1, 1, 1, n_kernels) #self.mu.data.requires_grad=True #self.sigma.data.requires_grad=True pos_f = self.get_positional_features(_embsize, 30) #max_timescale=100000 pos_f.requires_grad = True self.positional_features_q = nn.Parameter(pos_f) self.positional_features_q.requires_grad = True if self.use_diff_posencoding == True: pos_f = self.get_positional_features(_embsize,2000+500+self.extended_chunk_size)[:,500:,:].clone() #max_timescale=100000 pos_f.requires_grad = True self.positional_features_d = nn.Parameter(pos_f) self.positional_features_d.requires_grad = True else: self.positional_features_d = self.positional_features_q self.mixer = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True)) self.mixer_sat = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True)) #self.emb_reducer = nn.Linear(_embsize, 300, bias=True) encoder_layer = nn.TransformerEncoderLayer(_embsize, att_heads, dim_feedforward=att_ff_dim, dropout=0) self.contextualizer = nn.TransformerEncoder(encoder_layer, att_layer, norm=None) # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() self.saturation_linear = nn.Linear(2, 1, bias=True) torch.nn.init.constant_(self.saturation_linear.bias, 100) torch.nn.init.uniform_(self.saturation_linear.weight, -0.014, 0.014) self.saturation_linear2 = nn.Linear(2, 1, bias=True) torch.nn.init.constant_(self.saturation_linear2.bias, 100) torch.nn.init.uniform_(self.saturation_linear2.weight, -0.014, 0.014) self.saturation_linear3 = nn.Linear(2, 1, bias=True) torch.nn.init.constant_(self.saturation_linear3.bias, 100) torch.nn.init.uniform_(self.saturation_linear3.weight, -0.014, 0.014) self.sat_normer = nn.LayerNorm(2,elementwise_affine=True) #self.sat_emb_reduce1 = nn.Linear(_embsize,_embsize, bias=False) self.sat_emb_reduce1 = nn.Linear(_embsize, 1, bias=False) #torch.nn.init.constant_(self.sat_emb_reduce1.bias, 2) self.kernel_mult = nn.Parameter(torch.full([4,1,1,1,n_kernels], 1, dtype=torch.float32, requires_grad=True)) #self.length_normer = nn.Parameter(torch.full([1,1,1,1], 30, dtype=torch.float32, requires_grad=True)) #self.max_chunks = int(max_length / self.chunk_size + 1) self.chunk_scoring = nn.Parameter(torch.full([1,self.top_k_chunks*5], 1, dtype=torch.float32, requires_grad=True)) self.mixer_end = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, 
requires_grad=True)) self.dense = nn.Linear(n_kernels, 1, bias=False) torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo
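# ---------------------------------------------------------------------------
# Illustration only: a toy sketch of overlapping document chunking with the
# chunk_size / overlap hyperparameters set in the constructor above
# (chunk_size=40, overlap=5 -> extended_chunk_size=50). The model's own
# chunking code is not reproduced in this excerpt, so this is an assumption
# about the general technique, not the exact implementation.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F

def sketch_chunk_document(doc_embeddings: torch.Tensor, chunk_size: int = 40,
                          overlap: int = 5) -> torch.Tensor:
    # doc_embeddings: (batch, doc_len, emb_dim); pad doc_len to a multiple of
    # chunk_size, plus `overlap` extra positions on both sides so every chunk
    # can also see its neighbours.
    batch, doc_len, emb_dim = doc_embeddings.shape
    extended = chunk_size + 2 * overlap
    pad_to_multiple = (chunk_size - doc_len % chunk_size) % chunk_size
    padded = F.pad(doc_embeddings, (0, 0, overlap, pad_to_multiple + overlap))
    # unfold over the sequence dim -> (batch, n_chunks, extended, emb_dim)
    return padded.unfold(1, extended, chunk_size).transpose(-1, -2)

# e.g. a (2, 90, 300) document tensor yields chunks of shape (2, 3, 50, 300)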
class Conv_KNRM(nn.Module): ''' Paper: Convolutional Neural Networks for Soſt-Matching N-Grams in Ad-hoc Search, Dai et al. WSDM 18 ''' def __init__(self, word_embeddings: TextFieldEmbedder, n_grams: int, n_kernels: int, conv_out_dim: int): super(Conv_KNRM, self).__init__() self.word_embeddings = word_embeddings # static - kernel size & magnitude variables self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)), requires_grad = False).view(1, 1, 1, n_kernels) self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)), requires_grad = False).view(1, 1, 1, n_kernels) # Implement 1 Dimensional CNN layer for each n-gram type # Also, use RelU as Activation function self.convolutions = [] for i in range (1, n_grams + 1): self.convolutions.append(nn.Sequential( nn.ConstantPad1d((0 , i-1 ), 0), # the kernel size of the convolutional layer is the same as the current i-gram(uni, bi, tri...) in the loop nn.Conv1d(kernel_size = i, in_channels = word_embeddings.get_output_dim(), out_channels = conv_out_dim), nn.ReLU())) # register conv as part of the model self.convolutions = nn.ModuleList(self.convolutions) #Cosine similarity matrix self.cosine_module = CosineMatrixAttention() # Initialize the Linear transformer model: # size of the input: number of elements in the soft-TF feautes * number of kernel products ( # n_kernels * n_grams * n_grams = all combination of match matrix creation # (n-gram pairs from query and document embeddings) # the output will be 1 sample # also use bias based on the paper formula (by default it's true but just to make sure) self.transform = nn.Linear(in_features = n_kernels * n_grams * n_grams, out_features = 1, bias = True) def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor]) -> torch.Tensor: # # prepare embedding tensorsss # ------------------------------------------------------- # we assume 0 is padding - both need to be removed # shape: (batch, query_max) # query_pad_mask = (query["tokens"] > 0).float() # > 1 to also mask oov terms document_pad_mask = (document["tokens"] > 0).float() maskedEmbed = getMaskedEmbed(query_pad_mask, document_pad_mask) maskedEmbed = (maskedEmbed.unsqueeze(-1)).cuda() #Before the conv queryEmbeddings = (self.word_embeddings(query)).cuda() documentEmbeddings = (self.word_embeddings(document)).cuda() # Transpose the embeddings make it applicible to the convolution layer # after the conv feed an relu-layer, it will be transposed back query_embeddings_t = queryEmbeddings.transpose(1, 2) document_embeddings_t = documentEmbeddings.transpose(1, 2) #Initialize list to store each convolutioned n-gram document and query embeddings # Do we have to pre-define the sizes of list? can it make the process faster? 
convQueries = [] convDocs = [] #Loop through all n-gram convolution ty for conv in self.convolutions: # get the embeddings through the layers, and store them in the list in the original row-column format convQueries.append(conv(query_embeddings_t).transpose(1, 2)) convDocs.append(conv(document_embeddings_t).transpose(1, 2)) #Place sigma and mu into the gpu mu = self.mu mu = mu.cuda() sigma = self.sigma sigma = sigma.cuda() #Now we have the convolutiend n-gram embeddings for document and queries # Next step: # For each n-gram combination: create a match matrix: combine each n-gram document and word embeddings: # It will provide n*n match matrix #Concept: loop through each convolutioned document embedding and calculate the cosine similarity # then we have the cosine similarity, apply kernel pooling (where the padding will be masked), # then store the results in a list called kernelresult (or softTFFeatures?) softTFFeatures = [] #Initialize the document embedding loop for d in convDocs: #initialize the inner loop which will provide to loop through all query embeds for q in convQueries: # Calculate cosine similarity matchMatrix = self.cosine_module.forward(q, d) #Add a new dimension to resolve mismatch matchMatrix = matchMatrix.unsqueeze(-1).cuda() # Calculate kernel pooling on the match matrix, input parameters: match matrix and the mask - matrix kernelResult = calculateKernel(matchMatrix, maskedEmbed, query_pad_mask, mu = mu, sigma = sigma) # the results are the soft-tf features provided by the d-gram document with the q-gram query cosine similarity #Store the features in the list softTFFeatures.append(kernelResult) # Concatenate kernel pooling results/soft-tf features: basicallly it creates a new matrix, # where each row is a soft-tf feature (so our list can be considered now as Sequence of tensors), # which will be concatenated row-wise? pooling_sum = torch.cat(softTFFeatures, 1).cuda() # Then Linear transformation will be applied on the matrix # The learning - to - rank(LeToR) layer combines the soft-TF ranking features into a ranking score: # Steps: # apply linear transformation on the concatenated matrixes, # calculate hyperbolic tangent on it # Create Final Scoring, also Remove the 2nd tensor dimension if it's size is 1 output = torch.squeeze(torch.tanh(self.transform(pooling_sum)), 1).cuda() return output def kernel_mus(self, n_kernels: int): """ get the mu for each guassian kernel. Mu is the middle of each bin :param n_kernels: number of kernels (including exact match). first one is exact match :return: l_mu, a list of mu. """ l_mu = [1.0] if n_kernels == 1: return l_mu bin_size = 2.0 / (n_kernels - 1) # score range from [-1, 1] l_mu.append(1 - bin_size / 2) # mu: middle of the bin for i in range(1, n_kernels - 1): l_mu.append(l_mu[i] - bin_size) return l_mu def kernel_sigmas(self, n_kernels: int): """ get sigmas for each guassian kernel. :param n_kernels: number of kernels (including exactmath.) :param lamb: :param use_exact: :return: l_sigma, a list of simga """ bin_size = 2.0 / (n_kernels - 1) l_sigma = [0.001] # for exact match. small variance -> exact match if n_kernels == 1: return l_sigma l_sigma += [0.5 * bin_size] * (n_kernels - 1) return l_sigma
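# ---------------------------------------------------------------------------
# Illustration only (a throwaway sketch, not part of the model): the mu/sigma
# bins that kernel_mus() / kernel_sigmas() above produce for the commonly used
# setting of 11 kernels -- one exact-match kernel plus ten evenly spaced
# soft-match bins over the cosine range [-1, 1]. Assumes n_kernels >= 2.
# ---------------------------------------------------------------------------
def _sketch_kernel_bins(n_kernels: int = 11):
    bin_size = 2.0 / (n_kernels - 1)                    # 0.2 for 11 kernels
    l_mu = [1.0, 1 - bin_size / 2]                      # exact match + first bin centre
    for i in range(1, n_kernels - 1):
        l_mu.append(l_mu[i] - bin_size)                 # walk down in steps of bin_size
    l_sigma = [0.001] + [0.5 * bin_size] * (n_kernels - 1)
    return l_mu, l_sigma

# _sketch_kernel_bins() -> mus  ~ [1.0, 0.9, 0.7, 0.5, 0.3, 0.1, -0.1, -0.3, -0.5, -0.7, -0.9]
#                          sigmas = [0.001, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]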
class TK_v1(nn.Module): ''' TK is a neural IR model - a fusion between transformer contextualization & kernel-based scoring -> uses 1 transformer block to contextualize embeddings -> soft-histogram kernels to score interactions ''' @staticmethod def from_config(config, word_embeddings_out_dim): return TK_v1(word_embeddings_out_dim, kernels_mu=config["tk_kernels_mu"], kernels_sigma=config["tk_kernels_sigma"], att_heads=config["tk_att_heads"], att_layer=config["tk_att_layer"], att_proj_dim=config["tk_att_proj_dim"], att_ff_dim=config["tk_att_ff_dim"]) def __init__(self, _embsize: int, kernels_mu: List[float], kernels_sigma: List[float], att_heads: int, att_layer: int, att_proj_dim: int, att_ff_dim: int): super(TK_v1, self).__init__() n_kernels = len(kernels_mu) if len(kernels_mu) != len(kernels_sigma): raise Exception("len(kernels_mu) != len(kernels_sigma)") # static - kernel size & magnitude variables self.mu = Variable(torch.cuda.FloatTensor(kernels_mu), requires_grad=False).view(1, 1, 1, n_kernels) self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma), requires_grad=False).view(1, 1, 1, n_kernels) self.nn_scaler = nn.Parameter( torch.full([1], 0.01, dtype=torch.float32, requires_grad=True)) self.mixer = nn.Parameter( torch.full([1, 1, 1], 0.5, dtype=torch.float32, requires_grad=True)) self.stacked_att = StackedSelfAttentionEncoder( input_dim=_embsize, hidden_dim=_embsize, projection_dim=att_proj_dim, feedforward_hidden_dim=att_ff_dim, num_layers=att_layer, num_attention_heads=att_heads, dropout_prob=0, residual_dropout_prob=0, attention_dropout_prob=0) # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() # bias is set to True in original code (we found it to not help, how could it?) 
self.dense = nn.Linear(n_kernels, 1, bias=False) self.dense_mean = nn.Linear(n_kernels, 1, bias=False) self.dense_comb = nn.Linear(2, 1, bias=False) # init with small weights, otherwise the dense output is way to high for the tanh -> resulting in loss == 1 all the time torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo torch.nn.init.uniform_(self.dense_mean.weight, -0.014, 0.014) # inits taken from matchzoo # init with small weights, otherwise the dense output is way to high for the tanh -> resulting in loss == 1 all the time torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo #self.dense.bias.data.fill_(0.0) def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor, query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor, output_secondary_output: bool = False) -> torch.Tensor: # pylint: disable=arguments-differ query_embeddings = query_embeddings * query_pad_oov_mask.unsqueeze(-1) document_embeddings = document_embeddings * document_pad_oov_mask.unsqueeze( -1) query_embeddings_context = self.stacked_att(query_embeddings, query_pad_oov_mask) document_embeddings_context = self.stacked_att(document_embeddings, document_pad_oov_mask) #query_embeddings = torch.cat([query_embeddings,query_embeddings_context],dim=2) * query_pad_oov_mask.unsqueeze(-1) #document_embeddings = torch.cat([document_embeddings,document_embeddings_context],dim=2) * document_pad_oov_mask.unsqueeze(-1) query_embeddings = (self.mixer * query_embeddings + (1 - self.mixer) * query_embeddings_context ) * query_pad_oov_mask.unsqueeze(-1) document_embeddings = (self.mixer * document_embeddings + (1 - self.mixer) * document_embeddings_context ) * document_pad_oov_mask.unsqueeze(-1) # # prepare embedding tensors & paddings masks # ------------------------------------------------------- query_by_doc_mask = torch.bmm( query_pad_oov_mask.unsqueeze(-1), document_pad_oov_mask.unsqueeze(-1).transpose(-1, -2)) query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1) # # cosine matrix # ------------------------------------------------------- # shape: (batch, query_max, doc_max) cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) cosine_matrix_masked = cosine_matrix * query_by_doc_mask cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1) # # gaussian kernels & soft-TF # # first run through kernel, then sum on doc dim then sum on query dim # ------------------------------------------------------- raw_kernel_results = torch.exp( -torch.pow(cosine_matrix_extradim - self.mu, 2) / (2 * torch.pow(self.sigma, 2))) kernel_results_masked = raw_kernel_results * query_by_doc_mask_view # # mean kernels # #kernel_results_masked2 = kernel_results_masked.clone() doc_lengths = torch.sum(document_pad_oov_mask, 1) #kernel_results_masked2_mean = kernel_results_masked / doc_lengths.unsqueeze(-1) per_kernel_query = torch.sum(kernel_results_masked, 2) log_per_kernel_query = torch.log2( torch.clamp(per_kernel_query, min=1e-10)) * self.nn_scaler log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze( -1) # make sure we mask out padding values per_kernel = torch.sum(log_per_kernel_query_masked, 1) #per_kernel_query_mean = torch.sum(kernel_results_masked2_mean, 2) per_kernel_query_mean = per_kernel_query / ( doc_lengths.view(-1, 1, 1) + 1 ) # well, that +1 needs an explanation, sometimes training data is just broken ... (and nans all the things!) 
log_per_kernel_query_mean = per_kernel_query_mean * self.nn_scaler log_per_kernel_query_masked_mean = log_per_kernel_query_mean * query_pad_oov_mask.unsqueeze( -1) # make sure we mask out padding values per_kernel_mean = torch.sum(log_per_kernel_query_masked_mean, 1) ## ## "Learning to rank" layer - connects kernels with learned weights ## ------------------------------------------------------- dense_out = self.dense(per_kernel) dense_mean_out = self.dense_mean(per_kernel_mean) dense_comb_out = self.dense_comb( torch.cat([dense_out, dense_mean_out], dim=1)) score = torch.squeeze(dense_comb_out, 1) #torch.tanh(dense_out), 1) if output_secondary_output: query_mean_vector = query_embeddings.sum( dim=1) / query_pad_oov_mask.sum(dim=1).unsqueeze(-1) return score, { "score": score, "dense_out": dense_out, "dense_mean_out": dense_mean_out, "per_kernel": per_kernel, "per_kernel_mean": per_kernel_mean, "query_mean_vector": query_mean_vector, "cosine_matrix_masked": cosine_matrix_masked } else: return score def forward_representation(self, sequence_embeddings: torch.Tensor, sequence_mask: torch.Tensor) -> torch.Tensor: seq_embeddings = sequence_embeddings * sequence_mask.unsqueeze(-1) seq_embeddings_context = self.stacked_att(sequence_embeddings, sequence_mask) seq_embeddings = (self.mixer * sequence_embeddings + (1 - self.mixer) * seq_embeddings_context) * sequence_mask.unsqueeze(-1) return seq_embeddings def get_param_stats( self ): #" b: "+str(self.dense.bias.data) +\ "b: "+str(self.dense_mean.bias.data) + return "TK: dense w: "+str(self.dense.weight.data)+\ "dense_mean weight: "+str(self.dense_mean.weight.data)+\ "dense_comb weight: "+str(self.dense_comb.weight.data) + "scaler: "+str(self.nn_scaler.data) +"mixer: "+str(self.mixer.data) def get_param_secondary(self): return { "dense_weight": self.dense.weight, #"dense_bias":self.dense.bias, "dense_mean_weight": self.dense_mean.weight, #"dense_mean_bias":self.dense_mean.bias, "dense_comb_weight": self.dense_comb.weight, "scaler": self.nn_scaler, "mixer": self.mixer }
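# ---------------------------------------------------------------------------
# Illustration only: the kernel-pooling step of TK_v1's forward() on a toy,
# CPU-only cosine matrix (the model itself allocates mu/sigma on the GPU and
# takes them from the config). This reproduces the exp(-(cos - mu)^2 / (2*sigma^2))
# soft-TF computation with made-up shapes; it is a sketch, not the class's code path.
# ---------------------------------------------------------------------------
import torch

batch, q_len, d_len, n_kernels = 2, 4, 10, 11
mu = torch.linspace(-0.9, 1.0, n_kernels).view(1, 1, 1, n_kernels)   # toy kernel centres
sigma = torch.full((1, 1, 1, n_kernels), 0.1)                        # toy kernel widths

cosine_matrix = torch.rand(batch, q_len, d_len) * 2 - 1              # fake similarities in [-1, 1]
kernel_results = torch.exp(-(cosine_matrix.unsqueeze(-1) - mu) ** 2 / (2 * sigma ** 2))
per_kernel_query = kernel_results.sum(dim=2)                         # sum over document terms
log_per_kernel = torch.log2(torch.clamp(per_kernel_query, min=1e-10))
per_kernel = log_per_kernel.sum(dim=1)                               # sum over query terms
# per_kernel has shape (batch, n_kernels) and feeds the small dense scoring layers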
class TK_v2(nn.Module): ''' TK is a neural IR model - a fusion between transformer contextualization & kernel-based scoring -> uses 1 transformer block to contextualize embeddings -> soft-histogram kernels to score interactions ''' @staticmethod def from_config(config, word_embeddings_out_dim): ws = [20, 30, 50, 80, 100, 120, 150] max_windows = [ math.ceil(config["max_doc_length"] / float(w)) for w in ws ] return TK_v2(word_embeddings_out_dim, kernels_mu=config["tk_kernels_mu"], kernels_sigma=config["tk_kernels_sigma"], att_heads=config["tk_att_heads"], att_layer=config["tk_att_layer"], att_proj_dim=config["tk_att_proj_dim"], att_ff_dim=config["tk_att_ff_dim"], win_size=ws, max_windows=max_windows) def __init__(self, _embsize: int, kernels_mu: List[float], kernels_sigma: List[float], att_heads: int, att_layer: int, att_proj_dim: int, att_ff_dim: int, win_size: int, max_windows: int): super(TK_v2, self).__init__() n_kernels = len(kernels_mu) if len(kernels_mu) != len(kernels_sigma): raise Exception("len(kernels_mu) != len(kernels_sigma)") # static - kernel size & magnitude variables self.mu = Variable(torch.cuda.FloatTensor(kernels_mu), requires_grad=False).view(1, 1, 1, n_kernels) self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma), requires_grad=False).view(1, 1, 1, n_kernels) self.mixer = nn.Parameter( torch.full([1, 1, 1], 0.5, dtype=torch.float32, requires_grad=True)) self.stacked_att = StackedSelfAttentionEncoder( input_dim=_embsize, hidden_dim=_embsize, projection_dim=att_proj_dim, feedforward_hidden_dim=att_ff_dim, num_layers=att_layer, num_attention_heads=att_heads, dropout_prob=0, residual_dropout_prob=0, attention_dropout_prob=0) # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() self.nn_scaler = nn.ParameterList([ nn.Parameter( torch.full([1], 0.01, dtype=torch.float32, requires_grad=True)) for w in win_size ]) self.kernel_weights = nn.ModuleList( [nn.Linear(n_kernels, 1, bias=False) for w in win_size]) self.window_size = win_size self.window_scorer = [] for w in max_windows: l = nn.Linear(w, 1, bias=False) torch.nn.init.constant_(l.weight, 1 / w) self.window_scorer.append(l) self.window_scorer = nn.ModuleList(self.window_scorer) self.window_merger = nn.Linear(len(self.window_size), 1, bias=False) def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor, query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor, output_secondary_output: bool = False) -> torch.Tensor: # pylint: disable=arguments-differ query_embeddings = query_embeddings * query_pad_oov_mask.unsqueeze(-1) document_embeddings = document_embeddings * document_pad_oov_mask.unsqueeze( -1) query_embeddings_context = self.stacked_att(query_embeddings, query_pad_oov_mask) document_embeddings_context = self.stacked_att(document_embeddings, document_pad_oov_mask) #query_embeddings = torch.cat([query_embeddings,query_embeddings_context],dim=2) * query_pad_oov_mask.unsqueeze(-1) #document_embeddings = torch.cat([document_embeddings,document_embeddings_context],dim=2) * document_pad_oov_mask.unsqueeze(-1) query_embeddings = (self.mixer * query_embeddings + (1 - self.mixer) * query_embeddings_context ) * query_pad_oov_mask.unsqueeze(-1) document_embeddings = (self.mixer * document_embeddings + (1 - self.mixer) * document_embeddings_context ) * document_pad_oov_mask.unsqueeze(-1) # # prepare embedding tensors & paddings masks # 
------------------------------------------------------- query_by_doc_mask = torch.bmm( query_pad_oov_mask.unsqueeze(-1), document_pad_oov_mask.unsqueeze(-1).transpose(-1, -2)) query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1) # # cosine matrix # ------------------------------------------------------- # shape: (batch, query_max, doc_max) cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) cosine_matrix_masked = torch.tanh(cosine_matrix * query_by_doc_mask) cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1) # # gaussian kernels & soft-TF # # first run through kernel, then sum on doc dim then sum on query dim # ------------------------------------------------------- raw_kernel_results = torch.exp( -torch.pow(cosine_matrix_extradim - self.mu, 2) / (2 * torch.pow(self.sigma, 2))) kernel_results_masked = raw_kernel_results * query_by_doc_mask_view # # mean kernels # #kernel_results_masked2 = kernel_results_masked.clone() individual_window_scores = [] for i, window in enumerate(self.window_size): kernel_results_masked = nn.functional.pad( kernel_results_masked, (0, 0, 0, window - kernel_results_masked.shape[-2] % window)) scoring_windows = kernel_results_masked.unfold(dimension=-2, size=window, step=window) scoring_windows = scoring_windows.transpose(-1, -2) #kernel_results_masked2_mean = kernel_results_masked / doc_lengths.unsqueeze(-1) per_kernel_query = torch.sum(scoring_windows, -2) log_per_kernel_query = torch.log( torch.clamp(per_kernel_query, min=1e-10)) #* log_per_kernel_query_masked = log_per_kernel_query * ( per_kernel_query.sum(dim=-1) != 0).unsqueeze(-1).float() #log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze(-1).unsqueeze(-1) # make sure we mask out padding values per_kernel = torch.sum(log_per_kernel_query_masked, 1) window_scores = self.kernel_weights[i](per_kernel).squeeze(-1) window_scores_exp = torch.exp( window_scores * self.nn_scaler[i]) * (window_scores != 0).float() #window_scores_exp=window_scores if window_scores_exp.shape[-1] > self.window_scorer[i].in_features: window_scores_exp = window_scores_exp[:, :self.window_scorer[i] .in_features] if window_scores_exp.shape[-1] < self.window_scorer[i].in_features: window_scores_exp = nn.functional.pad( window_scores_exp, (0, self.window_scorer[i].in_features - window_scores_exp.shape[-1])) window_scores_exp = window_scores_exp.sort(dim=-1, descending=True)[0] individual_window_scores.append( self.window_scorer[i](window_scores_exp)) #final_score = window_scores.sum(dim=-1) / (window_scores != 0).sum(dim=-1).float() final_window_score = self.window_merger( torch.cat(individual_window_scores, dim=1)) score = torch.squeeze(final_window_score, 1) #torch.tanh(dense_out), 1) if output_secondary_output: return score, {} return score def get_param_stats(self): return "tk_v2: "+\ " ".join([" kernel_weight ("+str(self.window_size[i])+")"+str(w.weight.data) for i,w in enumerate(self.kernel_weights)])+"\n"+\ " ".join([" nn_scaler ("+str(self.window_size[i])+")"+str(w.data) for i,w in enumerate(self.nn_scaler)])+"\n"+\ " ".join([" window_scorer ("+str(self.window_size[i])+")"+str(w.weight.data) for i,w in enumerate(self.window_scorer)])+"\n"+\ "mixer: "+str(self.mixer.data) + "window_merger: "+str(self.window_merger.weight.data) def get_param_secondary(self): return { #"dense_weight":self.dense.weight,"dense_bias":self.dense.bias, #"dense_mean_weight":self.dense_mean.weight,"dense_mean_bias":self.dense_mean.bias, "window_merger": self.window_merger.weight, 
#"scaler: ":self.nn_scaler , "mixer: ": self.mixer }
class MatchPyramid(nn.Module): ''' Paper: Text Matching as Image Recognition, Pang et al., AAAI'16 Reference code (but in tensorflow): * first-hand: https://github.com/pl8787/MatchPyramid-TensorFlow/blob/master/model/model_mp.py * somewhat-third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/matchpyramid.py ''' @staticmethod def from_config(config, word_embeddings_out_dim): return MatchPyramid( conv_output_size=config["match_pyramid_conv_output_size"], conv_kernel_size=config["match_pyramid_conv_kernel_size"], adaptive_pooling_size=config["match_pyramid_adaptive_pooling_size"] ) def __init__(self, conv_output_size: List[int], conv_kernel_size: List[Tuple[int, int]], adaptive_pooling_size: List[Tuple[int, int]]): super(MatchPyramid, self).__init__() self.cosine_module = CosineMatrixAttention() if len(conv_output_size) != len(conv_kernel_size) or len( conv_output_size) != len(adaptive_pooling_size): raise Exception( "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length" ) conv_layer_dict = OrderedDict() last_channel_out = 1 for i in range(len(conv_output_size)): conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d( (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1), 0) conv_layer_dict["conv " + str(i)] = nn.Conv2d( kernel_size=conv_kernel_size[i], in_channels=last_channel_out, out_channels=conv_output_size[i]) conv_layer_dict["relu " + str(i)] = nn.ReLU() conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d( adaptive_pooling_size[i]) last_channel_out = conv_output_size[i] self.conv_layers = nn.Sequential(conv_layer_dict) self.dense = nn.Linear(conv_output_size[-1] * adaptive_pooling_size[-1][0] * adaptive_pooling_size[-1][1], out_features=100, bias=True) self.dense2 = nn.Linear(100, out_features=10, bias=True) self.dense3 = nn.Linear(10, out_features=1, bias=False) # init with small weights, otherwise the dense output is way to high for the tanh -> resulting in loss == 1 all the time #torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo #self.dense.bias.data.fill_(0.0) def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor, query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor, output_secondary_output: bool = False) -> torch.Tensor: # # similarity matrix # ------------------------------------------------------- cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) # shape: (batch, 1, query_max, doc_max) for the input of conv_2d cosine_matrix = cosine_matrix[:, None, :, :] # # convolution # ------------------------------------------------------- # shape: (batch, conv_output_size, query_max, doc_max) conv_result = self.conv_layers(cosine_matrix) # # dynamic pooling # ------------------------------------------------------- # flatten the output of dynamic pooling # shape: (batch, conv_output_size * pool_h * pool_w) conv_result_flat = conv_result.view(conv_result.size(0), -1) #conv_result_flat = self.dropout(conv_result_flat) # # Learning to rank layer # ------------------------------------------------------- dense_out = F.relu(self.dense(conv_result_flat)) dense_out = F.relu(self.dense2(dense_out)) dense_out = self.dense3(dense_out) #tanh_out = torch.tanh(dense_out) output = torch.squeeze(dense_out, 1) if output_secondary_output: return output, {} return output def get_param_stats(self): return "MP: / " def get_param_secondary(self): return {}
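# ---------------------------------------------------------------------------
# Illustration only: a minimal, CPU-only forward pass through MatchPyramid with
# random embeddings, assuming the class above and its AllenNLP dependency
# (CosineMatrixAttention) are importable. The hyperparameters below are made up;
# the real values come from the config keys used in from_config().
# ---------------------------------------------------------------------------
import torch

mp = MatchPyramid(conv_output_size=[16, 16],
                  conv_kernel_size=[(3, 3), (3, 3)],
                  adaptive_pooling_size=[(8, 40), (4, 10)])

query_emb = torch.rand(2, 12, 300)      # (batch, query_len, emb_dim)
doc_emb = torch.rand(2, 120, 300)       # (batch, doc_len, emb_dim)
query_mask = torch.ones(2, 12)
doc_mask = torch.ones(2, 120)

scores = mp(query_emb, doc_emb, query_mask, doc_mask)   # shape: (2,)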
class KNRM(nn.Module):
    '''
    Paper: End-to-End Neural Ad-hoc Ranking with Kernel Pooling, Xiong et al., SIGIR'17
    '''

    def __init__(self, word_embeddings: TextFieldEmbedder, n_kernels: int):
        super(KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)), requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)), requires_grad=False).view(1, 1, 1, n_kernels)

        # Cosine matrix
        self.cosine_module = CosineMatrixAttention()

        # Initialize the linear learning-to-rank layer:
        self.transform = nn.Linear(n_kernels, out_features=1, bias=True)

    def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor]) -> torch.Tensor:  # pylint: disable=arguments-differ
        #
        # prepare embedding tensors & padding masks
        # -------------------------------------------------------

        # shape: (batch, query_max)
        query_pad_oov_mask = (query["tokens"] > 0).float().cuda()  # > 0 masks padding only; > 1 would also mask oov terms
        # shape: (batch, doc_max)
        document_pad_oov_mask = (document["tokens"] > 0).float().cuda()

        # shape: (batch, query_max, emb_dim)
        query_embeddings = self.word_embeddings(query)
        # shape: (batch, document_max, emb_dim)
        document_embeddings = self.word_embeddings(document)

        # Create a mask matrix
        maskedEmbed = getMaskedEmbed(query_pad_oov_mask, document_pad_oov_mask)
        maskedEmbed = maskedEmbed.unsqueeze(-1).cuda()

        #
        # cosine matrix
        # -------------------------------------------------------
        matchMatrix = self.cosine_module.forward(query_embeddings, document_embeddings)
        # Add an extra dimension to resolve the dimensionality mismatch
        matchMatrix = matchMatrix.unsqueeze(-1).cuda()

        mu = self.mu.cuda()
        sigma = self.sigma.cuda()

        # Calculate the soft-TF features from the match matrix
        sofTFFeatures = calculateKernel(matchMatrix=matchMatrix, maskedMatrix=maskedEmbed, queryMask=query_pad_oov_mask, mu=mu, sigma=sigma)

        # apply the linear transformation on the soft-TF features,
        # calculate the hyperbolic tangent on it,
        # and remove the 2nd tensor dimension if its size is 1
        output = torch.squeeze(torch.tanh(self.transform(sofTFFeatures)), 1).cuda()
        return output

    def kernel_mus(self, n_kernels: int):
        """
        get the mu for each gaussian kernel. Mu is the middle of each bin
        :param n_kernels: number of kernels (including exact match). first one is exact match
        :return: l_mu, a list of mu.
        """
        l_mu = [1.0]
        if n_kernels == 1:
            return l_mu

        bin_size = 2.0 / (n_kernels - 1)  # score range from [-1, 1]
        l_mu.append(1 - bin_size / 2)     # mu: middle of the bin
        for i in range(1, n_kernels - 1):
            l_mu.append(l_mu[i] - bin_size)
        return l_mu

    def kernel_sigmas(self, n_kernels: int):
        """
        get the sigma for each gaussian kernel.
        :param n_kernels: number of kernels (including exact match)
        :return: l_sigma, a list of sigma
        """
        bin_size = 2.0 / (n_kernels - 1)
        l_sigma = [0.0001]  # for exact match. small variance -> exact match
        if n_kernels == 1:
            return l_sigma

        l_sigma += [0.5 * bin_size] * (n_kernels - 1)
        return l_sigma
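# ---------------------------------------------------------------------------
# getMaskedEmbed() and calculateKernel() are helper functions that are not part
# of this excerpt. As an assumption (not the authors' code), a plausible
# calculateKernel consistent with the inline kernel pooling of the original
# KNRM implementation further below could look like this sketch:
# ---------------------------------------------------------------------------
import torch

def calculateKernel_sketch(matchMatrix: torch.Tensor,   # (batch, q_len, d_len, 1)
                           maskedMatrix: torch.Tensor,  # (batch, q_len, d_len, 1) query-by-doc mask
                           queryMask: torch.Tensor,     # (batch, q_len)
                           mu: torch.Tensor,            # (1, 1, 1, n_kernels)
                           sigma: torch.Tensor) -> torch.Tensor:
    raw = torch.exp(-torch.pow(matchMatrix - mu, 2) / (2 * torch.pow(sigma, 2)))
    masked = raw * maskedMatrix                          # zero out padded term pairs
    per_kernel_query = torch.sum(masked, 2)              # soft-TF: sum over document terms
    log_pkq = torch.log(torch.clamp(per_kernel_query, min=1e-10)) * 0.01
    log_pkq = log_pkq * queryMask.unsqueeze(-1)          # mask padded query terms
    return torch.sum(log_pkq, 1)                         # (batch, n_kernels)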
class Conv_KNRM(nn.Module): ''' Paper: Convolutional Neural Networks for Soſt-Matching N-Grams in Ad-hoc Search, Dai et al. WSDM 18 third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/conv_knrm.py (tensorflow) https://github.com/thunlp/EntityDuetNeuralRanking/blob/master/baselines/CKNRM.py (pytorch) ''' def __init__(self, word_embeddings: TextFieldEmbedder, n_grams: int, n_kernels: int, conv_out_dim: int): super(Conv_KNRM, self).__init__() self.word_embeddings = word_embeddings # static - kernel size & magnitude variables self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)), requires_grad=False).view(1, 1, 1, n_kernels) self.sigma = Variable(torch.cuda.FloatTensor( self.kernel_sigmas(n_kernels)), requires_grad=False).view(1, 1, 1, n_kernels) self.convolutions = [] for i in range(1, n_grams + 1): self.convolutions.append( nn.Sequential( nn.ConstantPad1d((0, i - 1), 0), nn.Conv1d(kernel_size=i, in_channels=word_embeddings.get_output_dim(), out_channels=conv_out_dim), nn.ReLU())) self.convolutions = nn.ModuleList( self.convolutions) # register conv as part of the model # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() # *9 because we concat the 3x3 conv match sums together before the dense layer self.dense = nn.Linear(n_kernels * n_grams * n_grams, 1, bias=False) # init with small weights, otherwise the dense output is way to high fot torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor], query_length: torch.Tensor, document_length: torch.Tensor) -> torch.Tensor: # # prepare embedding tensors # ------------------------------------------------------- # we assume 1 is the unknown token, 0 is padding - both need to be removed if len(query["tokens"].shape) == 2: # (embedding lookup matrix) # shape: (batch, query_max) query_pad_oov_mask = (query["tokens"] > 1).float() # shape: (batch, doc_max) document_pad_oov_mask = (document["tokens"] > 1).float() # shape: (batch, query_max) query_pad_mask = (query["tokens"] > 0).float() # shape: (batch, doc_max) document_pad_mask = (document["tokens"] > 0).float() else: # == 3 (elmo characters per word) # shape: (batch, query_max) query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float() query_pad_mask = query_pad_oov_mask # shape: (batch, doc_max) document_pad_oov_mask = (torch.sum(document["tokens"], 2) > 0).float() document_pad_mask = document_pad_oov_mask query_by_doc_mask = torch.bmm( query_pad_mask.unsqueeze(-1), document_pad_mask.unsqueeze(-1).transpose(-1, -2)) #query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1) # shape: (batch, query_max,emb_dim) query_embeddings = self.word_embeddings( query) * query_pad_oov_mask.unsqueeze(-1) # shape: (batch, document_max,emb_dim) document_embeddings = self.word_embeddings( document) * document_pad_oov_mask.unsqueeze(-1) # !! 
conv1d requires tensor in shape: [batch, emb_dim, sequence_length ] # so we transpose embedding tensors from : [batch, sequence_length,emb_dim] to [batch, emb_dim, sequence_length ] # feed that into the conv1d and reshape output from [batch, conv1d_out_channels, sequence_length ] # to [batch, sequence_length, conv1d_out_channels] query_embeddings_t = query_embeddings.transpose(1, 2) document_embeddings_t = document_embeddings.transpose(1, 2) query_results = [] document_results = [] for i, conv in enumerate(self.convolutions): query_conv = conv(query_embeddings_t).transpose(1, 2) document_conv = conv(document_embeddings_t).transpose(1, 2) query_results.append(query_conv) document_results.append(document_conv) matched_results = [] for i in range(len(query_results)): for t in range(len(query_results)): matched_results.append( self.forward_matrix_kernel_pooling(query_results[i], document_results[t], query_by_doc_mask, query_pad_mask)) # # "Learning to rank" layer # ------------------------------------------------------- all_grams = torch.cat(matched_results, 1) dense_out = self.dense(all_grams) #tanh_out = torch.tanh(dense_out) output = torch.squeeze(dense_out, 1) return output # # create a match matrix between query & document terms # def forward_matrix_kernel_pooling(self, query_tensor, document_tensor, query_by_doc_mask, query_pad_oov_mask): # # cosine matrix # ------------------------------------------------------- # shape: (batch, query_max, doc_max) cosine_matrix = self.cosine_module.forward(query_tensor, document_tensor) cosine_matrix_masked = cosine_matrix * query_by_doc_mask cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1) # # gaussian kernels & soft-TF # # first run through kernel, then sum on doc dim then sum on query dim # ------------------------------------------------------- raw_kernel_results = torch.exp( -torch.pow(cosine_matrix_extradim - self.mu, 2) / (2 * torch.pow(self.sigma, 2))) kernel_results_masked = raw_kernel_results * query_by_doc_mask.unsqueeze( -1) per_kernel_query = torch.sum(kernel_results_masked, 2) log_per_kernel_query = torch.log( torch.clamp(per_kernel_query, min=1e-10)) * 0.01 log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze( -1) # make sure we mask out padding values per_kernel = torch.sum(log_per_kernel_query_masked, 1) return per_kernel def kernel_mus(self, n_kernels: int): """ get the mu for each guassian kernel. Mu is the middle of each bin :param n_kernels: number of kernels (including exact match). first one is exact match :return: l_mu, a list of mu. """ l_mu = [1.0] if n_kernels == 1: return l_mu bin_size = 2.0 / (n_kernels - 1) # score range from [-1, 1] l_mu.append(1 - bin_size / 2) # mu: middle of the bin for i in range(1, n_kernels - 1): l_mu.append(l_mu[i] - bin_size) return l_mu def kernel_sigmas(self, n_kernels: int): """ get sigmas for each guassian kernel. :param n_kernels: number of kernels (including exactmath.) :param lamb: :param use_exact: :return: l_sigma, a list of simga """ bin_size = 2.0 / (n_kernels - 1) l_sigma = [0.001] # for exact match. small variance -> exact match if n_kernels == 1: return l_sigma l_sigma += [0.5 * bin_size] * (n_kernels - 1) return l_sigma
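# ---------------------------------------------------------------------------
# Illustration only: why ConstantPad1d((0, i - 1), 0) + Conv1d(kernel_size=i)
# keeps the sequence length unchanged, checked on a toy tensor (shapes made up,
# not tied to the model above).
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

emb_dim, conv_out_dim, seq_len = 300, 128, 50
x = torch.rand(2, emb_dim, seq_len)                 # conv1d wants (batch, emb_dim, seq_len)

for i in (1, 2, 3):                                 # uni-, bi-, tri-grams
    ngram_conv = nn.Sequential(
        nn.ConstantPad1d((0, i - 1), 0),            # pad i-1 zeros on the right
        nn.Conv1d(kernel_size=i, in_channels=emb_dim, out_channels=conv_out_dim),
        nn.ReLU())
    assert ngram_conv(x).shape == (2, conv_out_dim, seq_len)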
class KNRM(nn.Module): ''' Paper: End-to-End Neural Ad-hoc Ranking with Kernel Pooling, Xiong et al., SIGIR'17 Reference code (paper author): https://github.com/AdeDZY/K-NRM/blob/master/knrm/model/model_knrm.py (but in tensorflow) third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/knrm.py ''' def __init__(self, word_embeddings: TextFieldEmbedder, n_kernels: int): super(KNRM, self).__init__() self.word_embeddings = word_embeddings # static - kernel size & magnitude variables self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)), requires_grad=False).view(1, 1, 1, n_kernels) self.sigma = Variable(torch.cuda.FloatTensor( self.kernel_sigmas(n_kernels)), requires_grad=False).view(1, 1, 1, n_kernels) # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) self.cosine_module = CosineMatrixAttention() # bias is set to True in original code (we found it to not help, how could it?) self.dense = nn.Linear(n_kernels, 1, bias=False) # init with small weights, otherwise the dense output is way to high for the tanh -> resulting in loss == 1 all the time torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo #self.dense.bias.data.fill_(0.0) def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor], query_length: torch.Tensor, document_length: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ # # prepare embedding tensors & paddings masks # ------------------------------------------------------- # shape: (batch, query_max,emb_dim) query_embeddings = self.word_embeddings(query) # shape: (batch, document_max,emb_dim) document_embeddings = self.word_embeddings(document) # we assume 1 is the unknown token, 0 is padding - both need to be removed if len(query["tokens"].shape) == 2: # (embedding lookup matrix) # shape: (batch, query_max) query_pad_oov_mask = (query["tokens"] > 1).float() # shape: (batch, doc_max) document_pad_oov_mask = (document["tokens"] > 1).float() else: # == 3 (elmo characters per word) # shape: (batch, query_max) query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float() # shape: (batch, doc_max) document_pad_oov_mask = (torch.sum(document["tokens"], 2) > 0).float() query_by_doc_mask = torch.bmm( query_pad_oov_mask.unsqueeze(-1), document_pad_oov_mask.unsqueeze(-1).transpose(-1, -2)) query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1) # # cosine matrix # ------------------------------------------------------- # shape: (batch, query_max, doc_max) cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings) cosine_matrix_masked = cosine_matrix * query_by_doc_mask cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1) # # gaussian kernels & soft-TF # # first run through kernel, then sum on doc dim then sum on query dim # ------------------------------------------------------- raw_kernel_results = torch.exp( -torch.pow(cosine_matrix_extradim - self.mu, 2) / (2 * torch.pow(self.sigma, 2))) kernel_results_masked = raw_kernel_results * query_by_doc_mask_view per_kernel_query = torch.sum(kernel_results_masked, 2) log_per_kernel_query = torch.log( torch.clamp(per_kernel_query, min=1e-10)) * 0.01 log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze( -1) # make sure we mask out padding values per_kernel = torch.sum(log_per_kernel_query_masked, 1) ## ## "Learning to rank" layer - connects kernels with learned weights ## 
------------------------------------------------------- dense_out = self.dense(per_kernel) score = torch.squeeze(dense_out, 1) #torch.tanh(dense_out), 1) return score def kernel_mus(self, n_kernels: int): """ get the mu for each gaussian kernel. Mu is the middle of each bin :param n_kernels: number of kernels (including exact match). first one is exact match :return: l_mu, a list of mu. """ l_mu = [1.0] if n_kernels == 1: return l_mu bin_size = 2.0 / (n_kernels - 1) # score range from [-1, 1] l_mu.append(1 - bin_size / 2) # mu: middle of the bin for i in range(1, n_kernels - 1): l_mu.append(l_mu[i] - bin_size) return l_mu def kernel_sigmas(self, n_kernels: int): """ get the sigma for each gaussian kernel. :param n_kernels: number of kernels (including exact match) :return: l_sigma, a list of sigma """ bin_size = 2.0 / (n_kernels - 1) l_sigma = [0.0001] # for exact match. small variance -> exact match if n_kernels == 1: return l_sigma l_sigma += [0.5 * bin_size] * (n_kernels - 1) return l_sigma
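# ---------------------------------------------------------------------------
# Illustration only: the two masks used in the forward passes above, shown on a
# toy id tensor. Under the assumption stated in the code (id 0 = padding,
# id 1 = OOV/unknown), "> 0" keeps OOV terms while "> 1" drops them as well.
# ---------------------------------------------------------------------------
import torch

token_ids = torch.tensor([[12, 7, 1, 0, 0],
                          [ 3, 1, 1, 9, 0]])

pad_mask = (token_ids > 0).float()        # masks padding only
# tensor([[1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 0.]])

pad_oov_mask = (token_ids > 1).float()    # masks padding and OOV terms
# tensor([[1., 1., 0., 0., 0.],
#         [1., 0., 0., 1., 0.]])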