def build_pair_sentence_module(task, d_inp, model, params):
    """Assemble the classification head for a sentence-pair task.

    Args:
        task: task object; ``n_classes`` is read when present (1 is used otherwise), and
            ``WiCTask`` instances get a wider classifier input.
        d_inp: dimension of the incoming representations.
        model: object providing ``use_bert`` and ``vocab``. When attention is shared, the
            attention encoder is stored on, or reused from, ``model.pair_attn``.
        params: task parameters; reads "attn", "shared_pair_attn", "d_hid_attn",
            "d_proj" and "pool_type", plus "dropout" when an attention encoder is built.

    Returns:
        A ``SingleClassifier`` when ``model.use_bert`` is set, otherwise a
        ``PairClassifier`` (which also receives the attention encoder, or None).
    """

    def make_attn_encoder(d_in, d_hid_attn):
        """Create an AttnPairEncoder on top of a one-layer bidirectional LSTM."""
        lstm_config = Params(
            {
                "input_size": 2 * d_in,
                "hidden_size": d_hid_attn,
                "num_layers": 1,
                "bidirectional": True,
            }
        )
        lstm = s2s_e.by_name("lstm").from_params(lstm_config)
        return AttnPairEncoder(model.vocab, lstm, dropout=params["dropout"])

    # Attention is only ever used when it is requested and BERT is not in use.
    use_attn = bool(params["attn"]) and not model.use_bert

    # The pooler reduces a variable-length sequence, optionally after a projection layer.
    if use_attn:
        attn_dim = params["d_hid_attn"]
        pooler = Pooler(project=False, d_inp=attn_dim, d_proj=attn_dim)
        d_out = attn_dim * 2
    else:
        pooler = Pooler(
            project=not model.use_bert,
            d_inp=d_inp,
            d_proj=params["d_proj"],
            pool_type=params["pool_type"],
        )
        if model.use_bert:
            d_out = d_inp
        else:
            d_out = params["d_proj"]

    # Pick the attention encoder: none, a private one, or the one shared via the model.
    share_attn = params["shared_pair_attn"]
    if not use_attn:
        pair_attn = None
    elif not share_attn:
        pair_attn = make_attn_encoder(d_inp, params["d_hid_attn"])
    elif hasattr(model, "pair_attn"):
        pair_attn = model.pair_attn
    else:
        pair_attn = make_attn_encoder(d_inp, params["d_hid_attn"])
        model.pair_attn = pair_attn

    # Finally the classifier, wrapped together with the pooler.
    n_classes = getattr(task, "n_classes", 1)
    is_wic = isinstance(task, WiCTask)

    if not model.use_bert:
        if is_wic:
            d_out = d_out + d_inp
        classifier = Classifier.from_params(4 * d_out, n_classes, params)
        return PairClassifier(pooler, classifier, pair_attn)

    # BERT handles pair tasks by concatenating the inputs and classifying the joined
    # sequence, so a single-sentence classifier is used.
    if is_wic:
        # also pass the two contextual word representations
        d_out = d_out * 3
    classifier = Classifier.from_params(d_out, n_classes, params)
    return SingleClassifier(pooler, classifier)
def build_multiple_choice_module(task, d_sent, use_bert, params):
    """Build the head for a multiple-choice task: one scalar score per choice vector.

    Args:
        task: accepted for a uniform builder signature; not used here.
        d_sent: dimension of the incoming representation.
        use_bert: when true, the pooler does not project and the classifier consumes
            ``d_sent`` directly; otherwise it consumes ``params["d_proj"]``.
        params: task parameters; reads "d_proj", "pool_type" and "cls_type".

    Returns:
        A ``SingleClassifier`` combining the pooler with a one-class ``Classifier``.
    """
    pooler = Pooler(
        project=not use_bert,
        d_inp=d_sent,
        d_proj=params["d_proj"],
        pool_type=params["pool_type"],
    )

    if use_bert:
        scorer_dim = d_sent
    else:
        scorer_dim = params["d_proj"]

    scorer = Classifier(scorer_dim, n_classes=1, cls_type=params["cls_type"])
    return SingleClassifier(pooler, scorer)
def build_qa_module(task, d_inp, use_bert, params):
    """Build a simple QA head: pool the representation, then classify into two classes.

    Each question-answer pair is modelled _individually_ by this module.

    Args:
        task: accepted for a uniform builder signature; not used here.
        d_inp: dimension of the incoming representation.
        use_bert: when true, the pooler does not project and the classifier consumes
            ``d_inp`` directly; otherwise it consumes ``params["d_proj"]``.
        params: task parameters; reads "d_proj" and "pool_type", and is forwarded to
            ``Classifier.from_params``.

    Returns:
        A ``SingleClassifier`` combining the pooler with a two-class classifier.
    """
    pooler = Pooler(
        project=not use_bert,
        d_inp=d_inp,
        d_proj=params["d_proj"],
        pool_type=params["pool_type"],
    )

    if use_bert:
        clf_dim = d_inp
    else:
        clf_dim = params["d_proj"]

    return SingleClassifier(pooler, Classifier.from_params(clf_dim, 2, params))
def build_single_sentence_module(task, d_inp: int, use_bert: bool, params: Params):
    """Build a single-sentence classifier.

    Args:
        task (Task): task object, used to get the number of output classes.
        d_inp (int): input dimension to the module, needed for the optional projection.
        use_bert (bool): if using BERT, skip the projection before pooling.
        params (Params): task-specific parameters; reads "d_proj" and "pool_type", and
            is forwarded to ``Classifier.from_params``.

    Returns:
        SingleClassifier: a pooler (with optional projection) followed by a classifier
        with ``task.n_classes`` outputs.
    """
    pooler = Pooler(
        project=not use_bert,
        d_inp=d_inp,
        d_proj=params["d_proj"],
        pool_type=params["pool_type"],
    )

    # Without a projection the classifier sees the raw input width.
    if use_bert:
        clf_dim = d_inp
    else:
        clf_dim = params["d_proj"]

    classifier = Classifier.from_params(clf_dim, task.n_classes, params)
    return SingleClassifier(pooler, classifier)
def __init__(
    self,
    vocab: Vocabulary,
    input_dim: int,
    decoder_hidden_size: int,
    max_decoding_steps: int,
    output_proj_input_dim: int,
    target_namespace: str = "targets",
    target_embedding_dim: int = None,
    attention: str = "none",
    dropout: float = 0.0,
    scheduled_sampling_ratio: float = 0.0,
) -> None:
    """Set up the embedding, attention, recurrent cell and output layers of the decoder.

    Args:
        vocab: vocabulary used to look up the start/end/unknown token indices and the
            size of the target namespace.
        input_dim: dimension of the encoder outputs fed to this decoder.
        decoder_hidden_size: hidden size of the decoder LSTM cell.
        max_decoding_steps: stored as ``_max_decoding_steps``.
        output_proj_input_dim: input width of the final vocabulary projection. When it
            differs from the decoder hidden size, a linear bottleneck is inserted.
        target_namespace: vocabulary namespace of the target tokens.
        target_embedding_dim: width of the target token embeddings. Required in
            practice: the ``None`` default is rejected.
        attention: one of "none", "Bahdanau" or "bilinear".
        dropout: probability for the dropout layer.
        scheduled_sampling_ratio: accepted but not used by this constructor.

    Raises:
        ValueError: if ``target_embedding_dim`` is None or ``attention`` is not one of
            the supported names.
    """
    # Fail fast, before any submodule is built, with messages that name the argument.
    # A None embedding width would otherwise surface as an opaque error inside a
    # submodule constructor or an integer addition.
    if target_embedding_dim is None:
        raise ValueError(
            "target_embedding_dim must be set to an integer; it sizes the target "
            "embedding and the decoder cell input"
        )
    if attention not in ("none", "Bahdanau", "bilinear"):
        raise ValueError("attention not implemented {}".format(attention))

    super(Seq2SeqDecoder, self).__init__(vocab)

    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace

    # The start symbol is the input at the first decoding timestep; the end symbol
    # marks the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._unk_index = self.vocab.get_token_index("@@UNKNOWN@@", self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)

    # The decoder hidden state is initialised from the encoder states, so the encoder
    # output must match the decoder hidden size; project it when the sizes differ.
    self._encoder_output_dim = input_dim
    self._decoder_hidden_dim = decoder_hidden_size
    if self._encoder_output_dim != self._decoder_hidden_dim:
        self._projection_encoder_out = Linear(
            self._encoder_output_dim, self._decoder_hidden_dim
        )
    else:
        # Sizes already agree: identity pass-through.
        self._projection_encoder_out = lambda x: x
    self._decoder_output_dim = self._decoder_hidden_dim
    self._output_proj_input_dim = output_proj_input_dim

    self._target_embedding_dim = target_embedding_dim
    self._target_embedder = Embedding(num_classes, self._target_embedding_dim)

    # Used to get an initial hidden state from the encoder states.
    self._sent_pooler = Pooler(project=True, d_inp=input_dim, d_proj=decoder_hidden_size)

    if attention == "none":
        self._decoder_attention = None
        self._decoder_input_dim = target_embedding_dim
    else:
        # Both attention variants are constructed with the same two sizes.
        attention_cls = BahdanauAttention if attention == "Bahdanau" else BilinearAttention
        self._decoder_attention = attention_cls(
            decoder_hidden_size + target_embedding_dim, input_dim
        )
        # The output of attention, a weighted average over encoder outputs, is
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = input_dim + target_embedding_dim

    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)

    # Optional bottleneck between the decoder output and the distribution over the
    # vocabulary: a linear transform that helps reduce the number of parameters.
    if self._output_proj_input_dim != self._decoder_output_dim:
        self._projection_bottleneck = Linear(
            self._decoder_output_dim, self._output_proj_input_dim
        )
    else:
        self._projection_bottleneck = lambda x: x

    self._output_projection_layer = Linear(self._output_proj_input_dim, num_classes)
    self._dropout = torch.nn.Dropout(p=dropout)
def build_image_sent_module(task, d_inp, params):
    """Build the module for an image-sentence task: a projecting pooler.

    Args:
        task: accepted for a uniform builder signature; not used here.
        d_inp: dimension of the incoming representation.
        params: task parameters; reads "d_proj" as the projection size.

    Returns:
        A ``Pooler`` that projects from ``d_inp`` to ``params["d_proj"]``.
    """
    return Pooler(project=True, d_inp=d_inp, d_proj=params["d_proj"])