Example #1
 def test_get_dimension_is_correct(self):
     encoder = StackedSelfAttentionEncoder(input_dim=9,
                                           hidden_dim=12,
                                           projection_dim=7,
                                           feedforward_hidden_dim=5,
                                           num_layers=3,
                                           num_attention_heads=3)
     assert encoder.get_input_dim() == 9
     # the output dim is hidden_dim (projection_dim does not affect it)
     assert encoder.get_output_dim() == 12
Example #2
 def test_get_dimension_is_correct(self):
     encoder = StackedSelfAttentionEncoder(input_dim=9,
                                           hidden_dim=12,
                                           projection_dim=6,
                                           feedforward_hidden_dim=5,
                                           num_layers=3,
                                           num_attention_heads=3)
     assert encoder.get_input_dim() == 9
     # the output dim is hidden_dim (projection_dim does not affect it)
     assert encoder.get_output_dim() == 12
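Note on the two tests above: the encoder's output dimension is hidden_dim; projection_dim only sizes the per-layer attention projections, which is why both asserts expect 12. A minimal, self-contained sketch of the same check (assuming AllenNLP 0.9, where StackedSelfAttentionEncoder lives in allennlp.modules.seq2seq_encoders):

import torch
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder

encoder = StackedSelfAttentionEncoder(input_dim=9,
                                      hidden_dim=12,
                                      projection_dim=6,   # kept divisible by num_attention_heads
                                      feedforward_hidden_dim=5,
                                      num_layers=2,
                                      num_attention_heads=3)
# (batch, seq_len, input_dim) -> (batch, seq_len, hidden_dim)
outputs = encoder(torch.randn(3, 5, 9), None)
assert list(outputs.size()) == [3, 5, 12]
assert encoder.get_output_dim() == 12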
Example #3
def get_encoder(input_dim, output_dim, encoder_type, args):
    if encoder_type == "pass":
        return PassThroughEncoder(input_dim)
    if encoder_type == "bilstm":
        return PytorchSeq2SeqWrapper(
            AllenNLPSequential(torch.nn.ModuleList(
                [get_encoder(input_dim, output_dim, "bilstm-unwrapped",
                             args)]),
                               input_dim,
                               output_dim,
                               bidirectional=True,
                               residual_connection=args.residual_connection,
                               dropout=args.dropout))
    if encoder_type == "bilstm-unwrapped":
        return torch.nn.LSTM(
            input_dim,
            output_dim,
            batch_first=True,
            bidirectional=True,
            dropout=args.dropout,
        )
    if encoder_type == "self_attention":
        return IntraSentenceAttentionEncoder(input_dim=input_dim,
                                             projection_dim=output_dim)
    if encoder_type == "stacked_self_attention":
        return StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=output_dim,
            projection_dim=output_dim,
            feedforward_hidden_dim=output_dim,
            num_attention_heads=5,
            num_layers=3,
            dropout_prob=args.dropout,
        )
    raise RuntimeError(f"Unknown encoder type={encoder_type}")
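A hedged usage sketch for the factory above, assuming an args object that exposes at least dropout and residual_connection (the attribute names referenced inside get_encoder); AllenNLPSequential is the project's own wrapper, so only branches that do not need it are exercised:

from argparse import Namespace

# Hypothetical args; attribute names mirror those used in get_encoder above.
args = Namespace(dropout=0.1, residual_connection=False)

# "stacked_self_attention": output dim equals hidden_dim (= output_dim here);
# 300 is divisible by the 5 attention heads hard-coded above.
encoder = get_encoder(input_dim=300, output_dim=300,
                      encoder_type="stacked_self_attention", args=args)
assert encoder.get_output_dim() == 300

# "bilstm-unwrapped" returns a plain torch.nn.LSTM.
lstm = get_encoder(input_dim=300, output_dim=300,
                   encoder_type="bilstm-unwrapped", args=args)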
Example #4
def transformer_seq2seq(input_dim: int,
                        model_dim: int,
                        feedforward_hidden_dim: int = 2048,
                        num_layers: int = 6,
                        projection_dim: int = 64,
                        num_attention_heads: int = 8,
                        ttype: str = 'custom',
                        dropout: float = 0.1) -> Seq2SeqEncoder:
    if ttype == 'custom':
        return TransformerEncoder(
            input_dim=input_dim,
            model_dim=model_dim,
            feedforward_hidden_dim=feedforward_hidden_dim,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads,
            dropout_prob=dropout
        )
    elif ttype == 'allen':
        return StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=model_dim,
            feedforward_hidden_dim=feedforward_hidden_dim,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads,
            projection_dim=model_dim,
            dropout_prob=dropout
        )
    else:
        raise ValueError(f'Invalid transformer type {ttype}')
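An illustrative call of transformer_seq2seq: the 'allen' branch only needs StackedSelfAttentionEncoder, while 'custom' assumes the project's own TransformerEncoder; model_dim should stay divisible by num_attention_heads (8 by default):

encoder = transformer_seq2seq(input_dim=300, model_dim=256, ttype='allen')
assert encoder.get_input_dim() == 300
assert encoder.get_output_dim() == 256   # output dim is the hidden/model dim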
Example #5
    def __init__(self,
                 input_dim: int,
                 hidden_dim: int,
                 projection_dim: int,
                 feedforward_hidden_dim: int,
                 num_layers: int,
                 num_attention_heads: int,
                 use_positional_encoding: bool = True,
                 dropout_prob: float = 0.1,
                 residual_dropout_prob: float = 0.2,
                 attention_dropout_prob: float = 0.1) -> None:

        super().__init__()

        self.seq2seq = StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            projection_dim=projection_dim,
            feedforward_hidden_dim=feedforward_hidden_dim,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads,
            use_positional_encoding=use_positional_encoding,
            dropout_prob=dropout_prob,
            residual_dropout_prob=residual_dropout_prob,
            attention_dropout_prob=attention_dropout_prob)

        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
Example #6
    def __init__(self, args, input_dim, hidden_dim, word_embedder):
        super(DefinitionSentenceEncoder, self).__init__()
        self.config = args
        self.args = args
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.projection_dim = input_dim
        self.feedforward_hidden_dim = input_dim
        self.num_layers = self.args.num_layers_for_stackatt
        self.num_attention_heads = self.args.num_atthead_for_stackatt

        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(
            self.args.word_embedding_dropout)

        self.mentiontransformer = StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            projection_dim=self.projection_dim,
            feedforward_hidden_dim=self.feedforward_hidden_dim,
            num_layers=self.num_layers,
            num_attention_heads=self.num_attention_heads)

        self.senttransformer = StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            projection_dim=self.projection_dim,
            feedforward_hidden_dim=self.feedforward_hidden_dim,
            num_layers=self.num_layers,
            num_attention_heads=self.num_attention_heads)

        self.ff_seq2vecs = nn.Linear(input_dim, input_dim)

        self.rnn = PytorchSeq2VecWrapper(
            nn.LSTM(bidirectional=True,
                    num_layers=2,
                    input_size=input_dim,
                    hidden_size=hidden_dim // 2,
                    batch_first=True,
                    dropout=self.args.lstmdropout))

        self.bow = BagOfEmbeddingsEncoder(input_dim, self.args.bow_avg)
Example #7
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
Example #8
    def __init__(self, _embsize: int, kernels_mu: List[float],
                 kernels_sigma: List[float], att_heads: int, att_layer: int,
                 att_proj_dim: int, att_ff_dim: int, win_size: List[int],
                 max_windows: List[int]):

        super(TK_v2, self).__init__()

        n_kernels = len(kernels_mu)

        if len(kernels_mu) != len(kernels_sigma):
            raise Exception("len(kernels_mu) != len(kernels_sigma)")

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                              requires_grad=False).view(1, 1, 1, n_kernels)
        self.mixer = nn.Parameter(
            torch.full([1, 1, 1], 0.5, dtype=torch.float32,
                       requires_grad=True))

        self.stacked_att = StackedSelfAttentionEncoder(
            input_dim=_embsize,
            hidden_dim=_embsize,
            projection_dim=att_proj_dim,
            feedforward_hidden_dim=att_ff_dim,
            num_layers=att_layer,
            num_attention_heads=att_heads,
            dropout_prob=0,
            residual_dropout_prob=0,
            attention_dropout_prob=0)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        self.nn_scaler = nn.ParameterList([
            nn.Parameter(
                torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
            for w in win_size
        ])

        self.kernel_weights = nn.ModuleList(
            [nn.Linear(n_kernels, 1, bias=False) for w in win_size])

        self.window_size = win_size
        self.window_scorer = []
        for w in max_windows:
            l = nn.Linear(w, 1, bias=False)
            torch.nn.init.constant_(l.weight, 1 / w)
            self.window_scorer.append(l)

        self.window_scorer = nn.ModuleList(self.window_scorer)

        self.window_merger = nn.Linear(len(self.window_size), 1, bias=False)
Example #9
 def test_stacked_self_attention_can_run_foward_on_multiple_gpus(self):
     encoder = StackedSelfAttentionEncoder(input_dim=9,
                                           hidden_dim=12,
                                           projection_dim=9,
                                           feedforward_hidden_dim=5,
                                           num_layers=3,
                                           num_attention_heads=3).to(0)
     parallel_encoder = DataParallel(encoder, device_ids=[0, 1])
     inputs = torch.randn([3, 5, 9]).to(0)
     encoder_output = parallel_encoder(inputs, None)
     assert list(encoder_output.size()) == [3, 5, 12]
Example #10
 def test_stacked_self_attention_can_run_foward(self):
     # Correctness checks are elsewhere - this is just stacking
     # blocks which are already well tested, so we just check shapes.
     encoder = StackedSelfAttentionEncoder(input_dim=9,
                                           hidden_dim=12,
                                           projection_dim=9,
                                           feedforward_hidden_dim=5,
                                           num_layers=3,
                                           num_attention_heads=3)
     inputs = Variable(torch.randn([3, 5, 9]))
     encoder_output = encoder(inputs, None)
     assert list(encoder_output.size()) == [3, 5, 12]
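The None passed as the second argument in the two tests above is an optional padding mask. A sketch with an explicit (batch, seq_len) mask of ones and zeros (AllenNLP 0.9 API assumed):

import torch
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder

encoder = StackedSelfAttentionEncoder(input_dim=9,
                                      hidden_dim=12,
                                      projection_dim=9,
                                      feedforward_hidden_dim=5,
                                      num_layers=3,
                                      num_attention_heads=3)
inputs = torch.randn(3, 5, 9)
mask = torch.ones(3, 5)
mask[:, -1] = 0                      # treat the last position as padding
outputs = encoder(inputs, mask)
assert list(outputs.size()) == [3, 5, 12]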
Example #11
    def __init__(self, _embsize: int, kernels_mu: List[float],
                 kernels_sigma: List[float], att_heads: int, att_layer: int,
                 att_proj_dim: int, att_ff_dim: int):

        super(TK_v1, self).__init__()

        n_kernels = len(kernels_mu)

        if len(kernels_mu) != len(kernels_sigma):
            raise Exception("len(kernels_mu) != len(kernels_sigma)")

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                              requires_grad=False).view(1, 1, 1, n_kernels)
        self.nn_scaler = nn.Parameter(
            torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
        self.mixer = nn.Parameter(
            torch.full([1, 1, 1], 0.5, dtype=torch.float32,
                       requires_grad=True))

        self.stacked_att = StackedSelfAttentionEncoder(
            input_dim=_embsize,
            hidden_dim=_embsize,
            projection_dim=att_proj_dim,
            feedforward_hidden_dim=att_ff_dim,
            num_layers=att_layer,
            num_attention_heads=att_heads,
            dropout_prob=0,
            residual_dropout_prob=0,
            attention_dropout_prob=0)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        # bias is set to True in original code (we found it to not help, how could it?)
        self.dense = nn.Linear(n_kernels, 1, bias=False)
        self.dense_mean = nn.Linear(n_kernels, 1, bias=False)
        self.dense_comb = nn.Linear(2, 1, bias=False)

        # init with small weights, otherwise the dense output is way to high for the tanh -> resulting in loss == 1 all the time
        torch.nn.init.uniform_(self.dense.weight, -0.014,
                               0.014)  # inits taken from matchzoo
        torch.nn.init.uniform_(self.dense_mean.weight, -0.014,
                               0.014)  # inits taken from matchzoo

Example #12
    def __init__(self, input_dim: int, hidden_dim: int, projection_dim: int,
                 feedforward_hidden_dim: int, num_layers: int,
                 num_attention_heads: int):

        super(AttentionSeq2Veq, self).__init__(stateful=False)

        self._seq2seq = StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            projection_dim=projection_dim,
            feedforward_hidden_dim=feedforward_hidden_dim,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads)

        self._hidden_dim = hidden_dim
        self._input_dim = input_dim
Example #13
 def __init__(self,
              input_dim,
              hidden_dim,
              projection_dim,
              feedforward_hidden_dim,
              num_layers,
              num_attention_heads,
              stateful: bool = False) -> None:
     super().__init__(stateful)
     self.input_dim = input_dim
     self.hidden_dim = hidden_dim
     self.seq_2_seq = StackedSelfAttentionEncoder(input_dim=input_dim,
                                                  hidden_dim=hidden_dim,
                                                  projection_dim=projection_dim,
                                                  feedforward_hidden_dim=feedforward_hidden_dim,
                                                  num_layers=num_layers,
                                                  num_attention_heads=num_attention_heads)
Example #14
    def __init__(self, args, dictionary, source_embedder: TextFieldEmbedder, left_pad=False):
        super().__init__(dictionary)

        self._seq2seq_encoder = StackedSelfAttentionEncoder(
            input_dim=int(source_embedder.get_output_dim()),
            hidden_dim=int(args.encoder_embed_dim),
            projection_dim=int(args.encoder_embed_dim / args.encoder_attention_heads),
            feedforward_hidden_dim=int(args.encoder_ffn_embed_dim),
            num_layers=int(args.encoder_layers),
            num_attention_heads=int(args.encoder_attention_heads),
            use_positional_encoding=True,
            # dropout probabilities are floats; casting to int would silently disable dropout
            dropout_prob=float(args.dropout),
            residual_dropout_prob=float(args.relu_dropout),
            attention_dropout_prob=float(args.attention_dropout))

        self._source_embedder = source_embedder
        embed_dim = source_embedder.get_output_dim()
        self.embed_scale = math.sqrt(embed_dim)
        self._max_source_positions = args.max_source_positions
Example #15
 def __init__(self,
              input_size: int,
              hidden_size: int,
              projection_dim: int,
              feedforward_hidden_dim: int,
              num_layers: int,
              num_attention_heads: int,
              use_positional_encoding: bool = True,
              dropout_prob: float = 0.1,
              residual_dropout_prob: float = 0.2,
              attention_dropout_prob: float = 0.1) -> None:
     super(TransformerSeq2VecEncoder, self).__init__()
     self.stacked_attention = StackedSelfAttentionEncoder(
         input_size, hidden_size, projection_dim, feedforward_hidden_dim,
         num_layers, num_attention_heads, use_positional_encoding,
         dropout_prob, residual_dropout_prob, attention_dropout_prob)
     self.input_dim = input_size
     self.output_dim = self.stacked_attention._attention_layers[
         -1].get_output_dim()
Example #16
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        upsample: torch.nn.Module = None, 
        net: Seq2SeqEncoder = None,
        target_namespace: str = "target_tokens",
        target_embedding_dim: int = None,
        use_bleu: bool = True,
        loss_type: str = "ctc",
        label_smoothing: float = None,
    ) -> None:
        super(LatentAignmentCTC, self).__init__(vocab)
        self._target_namespace = target_namespace
        
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, 
                                                    self._target_namespace)
        self._blank_index = self.vocab.get_token_index(SPECIAL_BLANK_TOKEN, 
                                                       self._target_namespace)

        if use_bleu:
            self._bleu = BLEU(exclude_indices={self._pad_index, self._blank_index})
        else:
            self._bleu = None

        self._source_embedder = source_embedder
        source_embedding_dim = source_embedder.get_output_dim()

        self._upsample = upsample or LinearUpsample(source_embedding_dim, s = 3)
        self._net = net or StackedSelfAttentionEncoder(input_dim = source_embedding_dim,
                                                       hidden_dim = 128,
                                                       projection_dim = 128,
                                                       feedforward_hidden_dim = 512,
                                                       num_layers = 4,
                                                       num_attention_heads = 4)
        
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        target_embedding_dim = self._net.get_output_dim()

        self._output_projection = torch.nn.Linear(target_embedding_dim, num_classes)
        self.loss_type = loss_type
        self.label_smoothing = label_smoothing
Example #17
 def __init__(self,
              input_dim: int,
              num_head: int = 3,
              bert_self_attn_layers: BertSelfAttnLayers = None) -> None:
     super().__init__()
     self._global_attention = TimeDistributed(torch.nn.Linear(input_dim, 1))
     self._num_heads = num_head
     self._span_token_emb = nn.Parameter(torch.Tensor(input_dim))
     nn.init.normal_(self._span_token_emb)
     if bert_self_attn_layers is not None:
         self._input_dim = self._output_dim = input_dim
         self._stacked_self_attention = bert_self_attn_layers
     else:
         self._input_dim = input_dim
         self._output_dim = input_dim // 4
         self._stacked_self_attention = StackedSelfAttentionEncoder(
             input_dim=input_dim,
             hidden_dim=self._output_dim,
             projection_dim=self._output_dim,
             feedforward_hidden_dim=4 * self._output_dim,
             num_layers=2,
             num_attention_heads=1,
             use_positional_encoding=True)
Example #18
    def __init__(self,
                 idiom_vector_path: str,
                 idiom_graph_path: str,
                 dropout: float,
                 vocab: Vocabulary,
                 content_embedder: TextFieldEmbedder,
                 neighbor_num: int = 7,
                 mode: List[str] = None) -> None:
        super().__init__(vocab)
        self.content_embedder = content_embedder

        idiom_list, idiom_vectors = [], []
        with open(idiom_vector_path) as fh:
            for line in fh:
                idiom_list.append(line.strip().split()[0])
                idiom_vectors.append(list(map(float,
                                              line.strip().split()[1:])))

        self.graph_embedder = GraphEmbedder(idiom_graph_path,
                                            neighbor_num=neighbor_num,
                                            drop_neighbor=False)

        embedding_dim = self.content_embedder.get_output_dim()
        self.option_embedder = modules.Embedding(
            num_embeddings=len(idiom_list),
            embedding_dim=embedding_dim,
            # use the pretrained idiom vectors
            # weight=torch.FloatTensor(idiom_vectors)
        )

        self.dropout = nn.Dropout(dropout)
        self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

        embedding_size = self.content_embedder.get_output_dim()

        self.neighbour_reasoner = StackedSelfAttentionEncoder(
            input_dim=embedding_size,
            hidden_dim=embedding_size,
            projection_dim=embedding_size,
            feedforward_hidden_dim=embedding_size,
            num_layers=1,
            num_attention_heads=2,
            use_positional_encoding=False)
        self.option_encoder = FirstVecEncoder(embedding_dim=embedding_size)

        self.option_reasoner = StackedSelfAttentionEncoder(
            input_dim=embedding_size,
            hidden_dim=embedding_size,
            projection_dim=embedding_size,
            feedforward_hidden_dim=embedding_size,
            num_layers=1,
            num_attention_heads=2,
            use_positional_encoding=False)

        if mode is None:
            mode = ['raw', 'ocn', 'nn']
        else:
            for item in mode:
                assert item in ['raw', 'ocn', 'nn'], f"{item} is invalid"
        self.mode = mode

        self.data_merger = FeedForward(
            input_dim=embedding_size * len(mode),
            num_layers=1,
            hidden_dims=embedding_size,
            activations=Activation.by_name('linear')(),
            dropout=0.1)

        self.loss = nn.CrossEntropyLoss()
        self.acc = CategoricalAccuracy()
Example #19
def build_model(args, vocab, pretrained_embs, tasks):
    '''Build model according to args '''

    # Build embeddings.
    if args.openai_transformer:
        # Note: incompatible with other embedders, but logic in preprocess.py
        # should prevent these from being enabled anyway.
        from .openai_transformer_lm.utils import OpenAIEmbedderModule
        log.info("Using OpenAI transformer model; skipping other embedders.")
        cove_layer = None
        embedder = OpenAIEmbedderModule(args)
        d_emb = embedder.get_output_dim()
    else:
        # Default case, used for ELMo, CoVe, word embeddings, etc.
        d_emb, embedder, cove_layer = build_embeddings(args, vocab, tasks,
                                                       pretrained_embs)
    d_sent = args.d_hid

    # Build single sentence encoder: the main component of interest
    # Need special handling for language modeling

    # Note: sent_enc is expected to apply dropout to its input _and_ output if needed.
    # So, embedding modules and classifier modules should not apply dropout there.
    tfm_params = Params({
        'input_dim': d_emb,
        'hidden_dim': args.d_hid,
        'projection_dim': args.d_tproj,
        'feedforward_hidden_dim': args.d_ff,
        'num_layers': args.n_layers_enc,
        'num_attention_heads': args.n_heads
    })
    rnn_params = Params({
        'input_size': d_emb,
        'bidirectional': True,
        'hidden_size': args.d_hid,
        'num_layers': args.n_layers_enc
    })

    if any(isinstance(task, LanguageModelingTask) for task in tasks) or \
            args.sent_enc == 'bilm':
        assert_for_log(args.sent_enc in ['rnn', 'bilm'],
                       "Only RNNLM supported!")
        if args.elmo:
            assert_for_log(args.elmo_chars_only,
                           "LM with full ELMo not supported")
        bilm = BiLMEncoder(d_emb, args.d_hid, args.d_hid, args.n_layers_enc)
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            bilm,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer)
        d_sent = 2 * args.d_hid
        log.info("Using BiLM architecture for shared encoder!")
    elif args.sent_enc == 'bow':
        sent_encoder = BoWSentEncoder(vocab, embedder)
        log.info("Using BoW architecture for shared encoder!")
        assert_for_log(
            not args.skip_embs,
            "Skip connection not currently supported with `bow` encoder.")
        d_sent = d_emb
    elif args.sent_enc == 'rnn':
        sent_rnn = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            sent_rnn,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer)
        d_sent = 2 * args.d_hid
        log.info("Using BiLSTM architecture for shared encoder!")
    elif args.sent_enc == 'transformer':
        transformer = StackedSelfAttentionEncoder.from_params(
            copy.deepcopy(tfm_params))
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            transformer,
            dropout=args.dropout,
            skip_embs=args.skip_embs,
            cove_layer=cove_layer,
            sep_embs_for_skip=args.sep_embs_for_skip)
        log.info("Using Transformer architecture for shared encoder!")
    elif args.sent_enc == 'null':
        # Expose word representation layer (GloVe, ELMo, etc.) directly.
        assert_for_log(
            args.skip_embs, f"skip_embs must be set for "
            "'{args.sent_enc}' encoder")
        phrase_layer = NullPhraseLayer(rnn_params['input_size'])
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            phrase_layer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer)
        d_sent = 0  # skip connection added below
        log.info("No shared encoder (just using word embeddings)!")
    else:
        assert_for_log(False, "No valid sentence encoder specified.")

    d_sent += args.skip_embs * d_emb

    # Build model and classifiers
    model = MultiTaskModel(args, sent_encoder, vocab)

    if args.is_probing_task:
        # TODO: move this logic to preprocess.py;
        # current implementation reloads MNLI data, which is slow.
        train_task_whitelist, eval_task_whitelist = get_task_whitelist(args)
        tasks_to_build, _, _ = get_tasks(train_task_whitelist,
                                         eval_task_whitelist,
                                         args.max_seq_len,
                                         path=args.data_dir,
                                         scratch_path=args.exp_dir)
    else:
        tasks_to_build = tasks

    # Attach task-specific params.
    for task in set(tasks + tasks_to_build):
        task_params = get_task_specific_params(args, task.name)
        log.info("\tTask '%s' params: %s", task.name,
                 json.dumps(task_params.as_dict(), indent=2))
        # Store task-specific params in case we want to access later
        setattr(model, '%s_task_params' % task.name, task_params)

    # Actually construct modules.
    for task in tasks_to_build:
        # If the name of the task is different than the classifier it should use
        # then skip the module creation.
        if task.name != model._get_task_params(task.name).get(
                'use_classifier', task.name):
            continue
        build_module(task, model, d_sent, d_emb, vocab, embedder, args)
    model = model.cuda() if args.cuda >= 0 else model
    log.info(model)
    param_count = 0
    trainable_param_count = 0
    for name, param in model.named_parameters():
        param_count += np.prod(param.size())
        if param.requires_grad:
            trainable_param_count += np.prod(param.size())
            log.info(">> Trainable param %s: %s = %d", name, str(param.size()),
                     np.prod(param.size()))
    log.info(
        "Total number of parameters: {ct:d} ({ct:g})".format(ct=param_count))
    log.info("Number of trainable parameters: {ct:d} ({ct:g})".format(
        ct=trainable_param_count))
    return model
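The builder above constructs the encoder through AllenNLP's Params / from_params machinery; a minimal, self-contained sketch with placeholder hyperparameters standing in for the args.* values (AllenNLP 0.9 assumed):

from allennlp.common import Params
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder

tfm_params = Params({
    'input_dim': 300,                 # d_emb
    'hidden_dim': 512,                # args.d_hid
    'projection_dim': 64,             # args.d_tproj
    'feedforward_hidden_dim': 2048,   # args.d_ff
    'num_layers': 6,                  # args.n_layers_enc
    'num_attention_heads': 8,         # args.n_heads
})
transformer = StackedSelfAttentionEncoder.from_params(tfm_params)
assert transformer.get_output_dim() == 512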
Example #20
train_dataset = reader.read('/.../en_el_train.txt')
validation_dataset = reader.read('/.../en_el_dev.txt')

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  min_count={
                                      'tokens': 3,
                                      'target_tokens': 3
                                  })

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
# encoder = PytorchSeq2SeqWrapper(
#     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                      hidden_dim=HIDDEN_DIM,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)

source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

# attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
# attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
attention = DotProductAttention()

max_decoding_steps = 800
model = SimpleSeq2Seq(vocab,
                      source_embedder,
                      encoder,
                      max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
Example #21
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 num_highway_layers: int,
                 phrase_layer: Seq2SeqEncoder,
                 similarity_function: SimilarityFunction,
                 self_attention_layer: StackedSelfAttentionEncoder,
                 modeling_layer: Seq2SeqEncoder,
                 span_end_encoder: Seq2SeqEncoder,
                 dropout: float = 0.2,
                 mask_lstms: bool = True,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._highway_layer = TimeDistributed(
            Highway(text_field_embedder.get_output_dim(), num_highway_layers))
        self._phrase_layer = phrase_layer
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._modeling_layer = modeling_layer
        self._span_end_encoder = span_end_encoder

        #New Self Attention layer
        self._self_attention_layer = self_attention_layer
        self._sa_matrix_attention = LegacyMatrixAttention(similarity_function)
        selfattent_dim = self_attention_layer.get_output_dim()  # it's 200
        #print("Self Attention Output Dim:",selfattent_dim,"\n")

        encoding_dim = phrase_layer.get_output_dim()
        modeling_dim = modeling_layer.get_output_dim()
        #span_start_input_dim = encoding_dim * 4 + modeling_dim

        span_start_input_dim = encoding_dim * 4 + modeling_dim + 2 * selfattent_dim

        self._span_start_predictor = TimeDistributed(
            torch.nn.Linear(span_start_input_dim, 1))

        span_end_encoding_dim = span_end_encoder.get_output_dim()
        #span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim
        span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim + 2 * selfattent_dim

        self._span_end_predictor = TimeDistributed(
            torch.nn.Linear(span_end_input_dim, 1))

        # Bidaf has lots of layer dimensions which need to match up - these aren't necessarily
        # obvious from the configuration files, so we check here.
        check_dimensions_match(modeling_layer.get_input_dim(),
                               4 * encoding_dim + 2 * selfattent_dim,
                               "modeling layer input dim", "4 * encoding dim")
        check_dimensions_match(text_field_embedder.get_output_dim(),
                               phrase_layer.get_input_dim(),
                               "text field embedder output dim",
                               "phrase layer input dim")
        check_dimensions_match(
            span_end_encoder.get_input_dim(),
            4 * encoding_dim + 3 * modeling_dim + 2 * selfattent_dim,
            "span end encoder input dim",
            "4 * encoding dim + 3 * modeling dim")

        self._na_accuracy = CategoricalAccuracy()
        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._squad_metrics = SquadEmAndF1()

        self._na_dense = lambda in_dim: torch.nn.Linear(in_dim, 2).cuda()

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._mask_lstms = mask_lstms

        initializer(self)
Example #22
    def __init__(self,
                 idiom_vector_path: str,
                 dropout: float,
                 vocab: Vocabulary,
                 content_embedder: TextFieldEmbedder,
                 use_pretrained: bool = False,
                 use_reasoner: bool = False,
                 idiom_vector_size: int = None,
                 reasoner_mode: str = None) -> None:
        super().__init__(vocab)
        self.content_embedder = content_embedder

        if idiom_vector_size is not None and use_pretrained:
            raise ValueError(
                "When `use_pretrained` is True, `idiom_vector_size` must be None."
            )

        idiom_list, idiom_vectors = [], []
        with open(idiom_vector_path) as fh:
            for line in fh:
                idiom_list.append(line.strip().split()[0])
                idiom_vectors.append(list(map(float,
                                              line.strip().split()[1:])))

        self.use_pretrained = use_pretrained

        if self.use_pretrained:
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=len(idiom_vectors[0]),
                projection_dim=self.content_embedder.get_output_dim(),
                # use the pretrained idiom vectors
                weight=torch.FloatTensor(idiom_vectors))
        else:
            embedding_dim = idiom_vector_size or len(idiom_vectors[0])
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=embedding_dim,
                projection_dim=self.content_embedder.get_output_dim(),
                # use the pretrained idiom vectors
                # weight=torch.FloatTensor(idiom_vectors)
            )

        self.dropout = nn.Dropout(dropout)
        self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

        self.use_reasoner = use_reasoner
        if use_reasoner:
            embedding_size = self.content_embedder.get_output_dim()
            if reasoner_mode is None:
                reasoner_mode = 'self_attention'
            else:
                reasoner_mode = reasoner_mode.lower()
                assert reasoner_mode in ('self_attention',
                                         'gated_self_attention')
            self.reasoner_mode = reasoner_mode
            if reasoner_mode == 'self_attention':
                self.option_reasoner = StackedSelfAttentionEncoder(
                    input_dim=embedding_size,
                    hidden_dim=embedding_size,
                    projection_dim=embedding_size,
                    feedforward_hidden_dim=embedding_size,
                    num_layers=1,
                    num_attention_heads=2,
                    use_positional_encoding=False)
            elif reasoner_mode == "gated_self_attention":
                self.option_reasoner = GatedSelfAttention(
                    input_dim=embedding_size,
                    hidden_dim=embedding_size,
                    projection_dim=embedding_size,
                    feedforward_hidden_dim=embedding_size,
                    num_layers=1,
                    num_attention_heads=2)

        self.loss = nn.CrossEntropyLoss()
        self.acc = CategoricalAccuracy()
Example #23
    def __init__(self,
                 idiom_vector_path: str,
                 dropout: float,
                 vocab: Vocabulary,
                 content_embedder: TextFieldEmbedder,
                 use_pretrained: bool = False,
                 use_reasoner: bool = False,
                 idiom_vector_size: int = None,
                 denoise_mode: str = 'soft',
                 denoise_lambda: float = None,
                 teacher_model_path: str = None,
                 teacher_mode: str = None) -> None:
        super().__init__(vocab)
        self.content_embedder = content_embedder

        if idiom_vector_size is not None and use_pretrained:
            raise ValueError(
                "When `use_pretrained` is True, `idiom_vector_size` must be None."
            )

        if teacher_mode is not None:
            teacher_mode = teacher_mode.lower()
            assert teacher_mode in ('initialization', 'teacher'), (
                f'teacher_mode ({teacher_mode}) '
                'not in ("initialization", "teacher").')
        if teacher_mode is not None and teacher_model_path is None:
            raise ValueError(
                "Please set teacher_model_path when teacher_mode is not None.")
        self.teacher_mode = teacher_mode
        self.teacher_model_path = teacher_model_path
        self.teacher = self.load_teacher()

        idiom_list, idiom_vectors = [], []
        with open(idiom_vector_path) as fh:
            for line in fh:
                idiom_list.append(line.strip().split()[0])
                idiom_vectors.append(list(map(float,
                                              line.strip().split()[1:])))

        self.use_pretrained = use_pretrained

        if self.use_pretrained:
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=len(idiom_vectors[0]),
                projection_dim=self.content_embedder.get_output_dim(),
                # use the pretrained idiom vectors
                weight=torch.FloatTensor(idiom_vectors))
        else:
            embedding_dim = idiom_vector_size or len(idiom_vectors[0])
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=embedding_dim,
                projection_dim=self.content_embedder.get_output_dim(),
                # use the pretrained idiom vectors
                # weight=torch.FloatTensor(idiom_vectors)
            )

        self.dropout = nn.Dropout(dropout)
        self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

        if use_reasoner:
            logger.info(f"{type(self)} always uses the reasoner.")

        self.use_reasoner = True

        embedding_size = self.content_embedder.get_output_dim()
        self.option_reasoner = StackedSelfAttentionEncoder(
            input_dim=embedding_size,
            hidden_dim=embedding_size,
            projection_dim=embedding_size,
            feedforward_hidden_dim=embedding_size,
            num_layers=1,
            num_attention_heads=2,
            use_positional_encoding=False)

        if self.teacher_mode == 'initialization':
            self.option_embedder.weight = self.teacher.option_embedder.weight
            self.option_embedder.weight.requires_grad = True

            self.scorer.weight = self.teacher.scorer.weight
            self.scorer.weight.requires_grad = True

        denoise_mode = denoise_mode.lower()
        assert denoise_mode in ('soft', 'hard', 'both', 'lambda'), (
            f'denoise_mode ({denoise_mode}) '
            'not in ("soft", "hard", "both", "lambda").')
        self.denoise_mode = denoise_mode
        self.denoise_lambda = denoise_lambda

        self.loss = nn.CrossEntropyLoss()
        self.acc = CategoricalAccuracy()
Example #24
    def __init__(self, args, input_dim, hidden_dim, word_embedder):
        super(RelationAttendedDefinitionSentenceEncoder, self).__init__()
        self.config = args
        self.args = args
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.projection_dim = input_dim
        self.feedforward_hidden_dim = input_dim
        self.num_layers = self.args.num_layers_for_stackatt
        self.num_attention_heads = self.args.num_atthead_for_stackatt

        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(
            self.args.word_embedding_dropout)

        # from allennlp.modules.seq2seq_encoders import , , \
        #     , ,
        #     BidirectionalLanguageModelTransformer, FeedForwardEncoder

        if self.args.definition_seq2seq == 'passthrough':
            self.seq2seq = PassThroughEncoder(input_dim=input_dim)
        elif self.args.definition_seq2seq == 'multiheadstackatt':
            self.seq2seq = StackedSelfAttentionEncoder(
                input_dim=input_dim,
                hidden_dim=input_dim,
                projection_dim=input_dim,
                feedforward_hidden_dim=input_dim,
                num_layers=2,
                num_attention_heads=2)
        elif self.args.definition_seq2seq == 'qanet':
            self.seq2seq = QaNetEncoder(input_dim=input_dim,
                                        hidden_dim=input_dim,
                                        attention_projection_dim=input_dim,
                                        feedforward_hidden_dim=input_dim,
                                        num_blocks=2,
                                        num_convs_per_block=2,
                                        conv_kernel_size=3,
                                        num_attention_heads=2)
        elif self.args.definition_seq2seq == 'intrasentenceatt':
            self.seq2seq = IntraSentenceAttentionEncoder(
                input_dim=input_dim,
                projection_dim=input_dim,
                output_dim=input_dim)
        elif self.args.definition_seq2seq == 'gatedcnn':
            self.seq2seq = GatedCnnEncoder(input_dim=512,
                                           layers=[[[4, 512]],
                                                   [[4, 512], [4, 512]],
                                                   [[4, 512], [4, 512]],
                                                   [[4, 512], [4, 512]]],
                                           dropout=0.05)
        elif self.args.definition_seq2seq == 'bilmtransformer':
            self.seq2seq = BidirectionalLanguageModelTransformer(
                input_dim=input_dim, hidden_dim=input_dim, num_layers=2)
        # elif self.args.definition_seq2seq == 'feedfoward':
        #     feedforward = FeedForward(input_dim=input_dim, num_layers=1, hidden_dims=input_dim, activations=self.args.activation_for_sentence_ff)
        #     self.seq2seq = FeedForwardEncoder(feedforward)

        # '''
        # *"linear"
        # *`"relu" < https: // pytorch.org / docs / master / nn.html  # torch.nn.ReLU>`_
        # *`"relu6" < https: // pytorch.org / docs / master / nn.html  # torch.nn.ReLU6>`_
        # *`"elu" < https: // pytorch.org / docs / master / nn.html  # torch.nn.ELU>`_
        # *`"prelu" < https: // pytorch.org / docs / master / nn.html  # torch.nn.PReLU>`_
        # *`"leaky_relu" < https: // pytorch.org / docs / master / nn.html  # torch.nn.LeakyReLU>`_
        # *`"threshold" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Threshold>`_
        # *`"hardtanh" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Hardtanh>`_
        # *`"sigmoid" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Sigmoid>`_
        # *`"tanh" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Tanh>`_
        # *`"log_sigmoid" < https: // pytorch.org / docs / master / nn.html  # torch.nn.LogSigmoid>`_
        # *`"softplus" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Softplus>`_
        # *`"softshrink" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Softshrink>`_
        # *`"softsign" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Softsign>`_
        # *`"tanhshrink" < https: // pytorch.org / docs / master / nn.html  # torch.nn.Tanhshrink>`_
        # '''

        elif self.args.definition_seq2seq == 'multiheadselfatt':
            self.seq2seq = MultiHeadSelfAttention(
                num_heads=2,
                input_dim=input_dim,
                output_projection_dim=input_dim,
                attention_dim=input_dim,
                values_dim=input_dim)
        else:
            print('Encoder not defined:', self.args.definition_seq2seq)
            exit()
Example #25
train_dataset = reader.read(cached_path("data/train"))
validation_dataset = reader.read(cached_path("data/dev"))
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=100,
                            pretrained_file="https://s3-us-west-2.amazonaws.com/allennlp/"
                                            "datasets/glove/glove.6B.100d.txt.gz", trainable=True)
source_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
# encoder = PytorchSeq2SeqWrapper(StackedAlternatingLstm(input_size=200,
#                                                        hidden_size=300,
#                                                        num_layers=4,
#                                                        recurrent_dropout_probability=0.1,
#                                                        use_highway=True))
encoder = StackedSelfAttentionEncoder(
    input_dim=100,  # must match the 100-dim GloVe output of source_embedder above
    hidden_dim=300,
    projection_dim=128,
    feedforward_hidden_dim=128,
    num_layers=1,
    num_attention_heads=8)

# attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
# attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
attention = DotProductAttention()
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
max_decoding_steps = 50   # TODO: make this variable
ZH_EMBEDDING_DIM = 10
model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='labels',
                          attention=attention,
                          beam_size=8,
Example #26
def build_sent_encoder(args, vocab, d_emb, tasks, embedder, cove_layer):
    # Build single sentence encoder: the main component of interest
    # Need special handling for language modeling
    # Note: sent_enc is expected to apply dropout to its input _and_ output if needed.
    tfm_params = Params({
        'input_dim': d_emb,
        'hidden_dim': args.d_hid,
        'projection_dim': args.d_tproj,
        'feedforward_hidden_dim': args.d_ff,
        'num_layers': args.n_layers_enc,
        'num_attention_heads': args.n_heads
    })
    rnn_params = Params({
        'input_size': d_emb,
        'bidirectional': True,
        'hidden_size': args.d_hid,
        'num_layers': args.n_layers_enc
    })
    # Make sentence encoder
    if any(isinstance(task, LanguageModelingTask) for task in tasks) or \
            args.sent_enc == 'bilm':
        assert_for_log(args.sent_enc in ['rnn', 'bilm'],
                       "Only RNNLM supported!")
        if args.elmo:
            assert_for_log(args.elmo_chars_only,
                           "LM with full ELMo not supported")
        bilm = BiLMEncoder(d_emb, args.d_hid, args.d_hid, args.n_layers_enc)
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            bilm,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer)
        d_sent = 2 * args.d_hid
        log.info("Using BiLM architecture for shared encoder!")
    elif args.sent_enc == 'bow':
        sent_encoder = BoWSentEncoder(vocab, embedder)
        log.info("Using BoW architecture for shared encoder!")
        assert_for_log(
            not args.skip_embs,
            "Skip connection not currently supported with `bow` encoder.")
        d_sent = d_emb
    elif args.sent_enc == 'rnn':
        sent_rnn = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            sent_rnn,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer)
        d_sent = 2 * args.d_hid
        log.info("Using BiLSTM architecture for shared encoder!")
    elif args.sent_enc == 'transformer':
        transformer = StackedSelfAttentionEncoder.from_params(
            copy.deepcopy(tfm_params))
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            transformer,
            dropout=args.dropout,
            skip_embs=args.skip_embs,
            cove_layer=cove_layer,
            sep_embs_for_skip=args.sep_embs_for_skip)
        log.info("Using Transformer architecture for shared encoder!")
    elif args.sent_enc == 'null':
        # Expose word representation layer (GloVe, ELMo, etc.) directly.
        assert_for_log(
            args.skip_embs, f"skip_embs must be set for "
            "'{args.sent_enc}' encoder")
        phrase_layer = NullPhraseLayer(rnn_params['input_size'])
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            phrase_layer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer)
        d_sent = 0  # skip connection added below
        log.info("No shared encoder (just using word embeddings)!")
    else:
        assert_for_log(False, "No valid sentence encoder specified.")
    return sent_encoder, d_sent
Example #27
    def __init__(self,
                 idiom_vector_path: str,
                 dropout: float,
                 vocab: Vocabulary,
                 content_embedder: TextFieldEmbedder,
                 option_vector_encoder: Seq2VecEncoder,
                 use_pretrained: bool = False,
                 use_idiom_embedding: bool = True,
                 use_idiom_text: bool = False,
                 use_idiom_definition: bool = False,
                 use_reasoner: bool = False,
                 idiom_vector_size: int = None) -> None:
        super().__init__(vocab)
        if idiom_vector_size is not None and use_pretrained:
            raise ValueError(
                "When `use_pretrained` is True, `idiom_vector_size` must be None."
            )
        if not use_idiom_embedding and use_pretrained:
            raise ValueError(
                "use_pretrained=True but use_idiom_embedding=False.")

        # suppose to be BERT model
        self.content_embedder = content_embedder
        self.option_vector_encoder = option_vector_encoder

        self.use_idiom_embedding = use_idiom_embedding

        if self.use_idiom_embedding:
            idiom_list, idiom_vectors = [], []
            with open(idiom_vector_path) as fh:
                for line in fh:
                    idiom_list.append(line.strip().split()[0])
                    idiom_vectors.append(
                        list(map(float,
                                 line.strip().split()[1:])))

            self.use_pretrained = use_pretrained
            if self.use_pretrained:
                self.option_embedder = modules.Embedding(
                    num_embeddings=len(idiom_list),
                    embedding_dim=len(idiom_vectors[0]),
                    projection_dim=self.content_embedder.get_output_dim(),
                    # use the pretrained idiom vectors
                    weight=torch.FloatTensor(idiom_vectors))
            else:
                embedding_dim = idiom_vector_size or len(idiom_vectors[0])
                self.option_embedder = modules.Embedding(
                    num_embeddings=len(idiom_list),
                    embedding_dim=embedding_dim,
                    projection_dim=self.content_embedder.get_output_dim(),
                    # use the pretrained idiom vectors
                    # weight=torch.FloatTensor(idiom_vectors)
                )

        self.option_vector_encoder = option_vector_encoder

        if self.use_idiom_embedding:
            idiom_merger_in_features = self.option_embedder.get_output_dim()
        else:
            idiom_merger_in_features = 0

        self.use_idiom_text = use_idiom_text
        if self.use_idiom_text:
            idiom_merger_in_features += self.option_vector_encoder.get_output_dim(
            )

        self.use_idiom_definition = use_idiom_definition
        if self.use_idiom_definition:
            idiom_merger_in_features += self.option_vector_encoder.get_output_dim(
            )

        self.option_merger = nn.Linear(
            in_features=idiom_merger_in_features,
            out_features=self.content_embedder.get_output_dim(),
            bias=True)

        self.dropout = nn.Dropout(dropout)
        self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

        self.use_reasoner = use_reasoner
        if use_reasoner:
            embedding_size = self.content_embedder.get_output_dim()
            self.option_reasoner = StackedSelfAttentionEncoder(
                input_dim=embedding_size,
                hidden_dim=embedding_size,
                projection_dim=embedding_size,
                feedforward_hidden_dim=embedding_size,
                num_layers=1,
                num_attention_heads=2,
                use_positional_encoding=False)

        self.loss = nn.CrossEntropyLoss()
        self.acc = CategoricalAccuracy()
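
A minimal instantiation sketch for the constructor above, not taken from the original project: the class name IdiomSelectionModel, the idiom-vector file path, and the embedder/encoder choices are illustrative assumptions, kept to the AllenNLP 0.x API that the snippet itself uses.

from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()
content_embedder = BasicTextFieldEmbedder(
    {"tokens": Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                         embedding_dim=768)})
model = IdiomSelectionModel(                  # hypothetical name for the class shown above
    idiom_vector_path="idiom_vectors.txt",    # assumed format: idiom token followed by its vector, one per line
    dropout=0.1,
    vocab=vocab,
    content_embedder=content_embedder,
    option_vector_encoder=BagOfEmbeddingsEncoder(embedding_dim=768),
    use_pretrained=True,
    use_idiom_text=True)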
Exemplo n.º 28
0
def build_model(args, vocab, pretrained_embs, tasks):
    '''Build the model according to args.'''

    # Build embeddings.
    d_emb, embedder, cove_emb = build_embeddings(args, vocab, pretrained_embs)
    d_sent = args.d_hid

    # Build single sentence encoder: the main component of interest
    # Need special handling for language modeling
    tfm_params = Params({'input_dim': d_emb, 'hidden_dim': args.d_hid,
                         'projection_dim': args.d_tproj,
                         'feedforward_hidden_dim': args.d_ff,
                         'num_layers': args.n_layers_enc,
                         'num_attention_heads': args.n_heads})
    rnn_params = Params({'input_size': d_emb, 'bidirectional': args.bidirectional,
                         'hidden_size': args.d_hid, 'num_layers': args.n_layers_enc})

    if any(isinstance(task, LanguageModelingTask) for task in tasks):
        if args.bidirectional:
            rnn_params['bidirectional'] = False
            if args.sent_enc == 'rnn':
                fwd = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
                bwd = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
            elif args.sent_enc == 'transformer':
                fwd = MaskedStackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
                bwd = MaskedStackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
            sent_encoder = BiLMEncoder(vocab, embedder, args.n_layers_highway,
                                       fwd, bwd, dropout=args.dropout,
                                       skip_embs=args.skip_embs, cove_layer=cove_emb)
        else:  # not bidirectional
            if args.sent_enc == 'rnn':
                fwd = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
            elif args.sent_enc == 'transformer':
                fwd = MaskedStackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
            sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                           fwd, skip_embs=args.skip_embs,
                                           dropout=args.dropout, cove_layer=cove_emb)
    elif args.sent_enc == 'bow':
        sent_encoder = BoWSentEncoder(vocab, embedder)
        d_sent = d_emb
    elif args.sent_enc == 'rnn':
        sent_rnn = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       sent_rnn, skip_embs=args.skip_embs,
                                       dropout=args.dropout, cove_layer=cove_emb)
        d_sent = (1 + args.bidirectional) * args.d_hid
    elif args.sent_enc == 'transformer':
        transformer = StackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       transformer, dropout=args.dropout,
                                       skip_embs=args.skip_embs, cove_layer=cove_emb)
    else:
        assert_for_log(False, "No valid sentence encoder specified.")

    d_sent += args.skip_embs * d_emb

    # Build model and classifiers
    model = MultiTaskModel(args, sent_encoder, vocab)

    if args.is_probing_task:
        # TODO: move this logic to preprocess.py;
        # current implementation reloads MNLI data, which is slow.
        train_task_whitelist, eval_task_whitelist = get_task_whitelist(args)
        tasks_to_build, _, _ = get_tasks(train_task_whitelist,
                                         eval_task_whitelist,
                                         args.max_seq_len,
                                         path=args.data_dir,
                                         scratch_path=args.exp_dir)
    else:
        tasks_to_build = tasks

    # Attach task-specific params.
    for task in set(tasks + tasks_to_build):
        task_params = get_task_specific_params(args, task.name)
        log.info("\tTask '%s' params: %s", task.name,
                 json.dumps(task_params.as_dict(), indent=2))
        # Store task-specific params in case we want to access later
        setattr(model, '%s_task_params' % task.name, task_params)

    # Actually construct modules.
    for task in tasks_to_build:
        build_module(task, model, d_sent, vocab, embedder, args)
    model = model.cuda() if args.cuda >= 0 else model
    log.info(model)
    param_count = 0
    trainable_param_count = 0
    for name, param in model.named_parameters():
        param_count += np.prod(param.size())
        if param.requires_grad:
            trainable_param_count += np.prod(param.size())
    log.info("Total number of parameters: {}".format(param_count))
    log.info("Number of trainable parameters: {}".format(trainable_param_count))
    return model
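
The Params-plus-from_params pattern above is worth isolating; the sketch below uses illustrative dimensions rather than any particular experiment's config. The deepcopy mirrors the code above, since from_params pops keys out of the Params object it is given.

import copy
from allennlp.common import Params
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder

tfm_params = Params({'input_dim': 300, 'hidden_dim': 512,
                     'projection_dim': 64, 'feedforward_hidden_dim': 2048,
                     'num_layers': 6, 'num_attention_heads': 8})
encoder = StackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
assert encoder.get_input_dim() == 300
assert encoder.get_output_dim() == 512  # output dimension follows hidden_dim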
Exemplo n.º 29
0
File: full.py  Project: WrRan/SKER
    def __init__(self,
                 idiom_vector_path: str,
                 idiom_graph_path: str,
                 dropout: float,
                 vocab: Vocabulary,
                 content_embedder: TextFieldEmbedder,
                 use_pretrained: bool = False,
                 idiom_vector_size: int = None,
                 neighbor_num: int = 7,
                 num_neighbour_attention_heads: int = 2) -> None:
        super().__init__(vocab)
        self.content_embedder = content_embedder

        if idiom_vector_size is not None and use_pretrained:
            raise ValueError(
                "When `use_pretrained` is True, `idiom_vector_size` must be None."
            )

        idiom_list, idiom_vectors = [], []
        with open(idiom_vector_path) as fh:
            for line in fh:
                idiom_list.append(line.strip().split()[0])
                idiom_vectors.append(list(map(float,
                                              line.strip().split()[1:])))

        self.graph_embedder = GraphEmbedder(idiom_graph_path,
                                            neighbor_num=neighbor_num,
                                            drop_neighbor=False)

        self.use_pretrained = use_pretrained

        if self.use_pretrained:
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=len(idiom_vectors[0]),
                projection_dim=self.content_embedder.get_output_dim(),
                # use the pretrained idiom vectors as initial weights
                weight=torch.FloatTensor(idiom_vectors))
        else:
            embedding_dim = idiom_vector_size or len(idiom_vectors[0])
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=embedding_dim,
                projection_dim=self.content_embedder.get_output_dim(),
                # pretrained idiom vectors deliberately not loaded here
                # weight=torch.FloatTensor(idiom_vectors)
            )

        self.dropout = nn.Dropout(dropout)
        self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

        embedding_size = self.content_embedder.get_output_dim()

        self.neighbour_reasoner = StackedSelfAttentionEncoder(
            input_dim=embedding_size,
            hidden_dim=embedding_size,
            projection_dim=embedding_size,
            feedforward_hidden_dim=embedding_size,
            num_layers=1,
            num_attention_heads=num_neighbour_attention_heads,
            use_positional_encoding=False)
        self.option_encoder = FirstVecEncoder(embedding_dim=embedding_size)

        self.option_reasoner = StackedSelfAttentionEncoder(
            input_dim=embedding_size,
            hidden_dim=embedding_size,
            projection_dim=embedding_size,
            feedforward_hidden_dim=embedding_size,
            num_layers=1,
            num_attention_heads=2,
            use_positional_encoding=False)

        self.data_merger = FeedForward(
            input_dim=3 * embedding_size,
            num_layers=1,
            hidden_dims=embedding_size,
            activations=Activation.by_name('linear')(),
            dropout=0.1)

        self.loss = nn.CrossEntropyLoss()
        self.acc = CategoricalAccuracy()
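
The forward pass of this model is not shown above; the sketch below is one plausible way the modules built in __init__ could combine to score candidate idioms. The argument names, shapes, the all-ones mask, and the mean pooling over neighbours are stand-in assumptions, and the real model's graph_embedder, option_encoder, and option_reasoner are left out.

import torch

def score_options_sketch(self, blank_vector, option_ids, neighbour_ids):
    # blank_vector:  (batch, embed)               contextual vector at the blank position
    # option_ids:    (batch, options)             candidate idiom indices
    # neighbour_ids: (batch, options, neighbours) graph neighbours of each candidate
    option_embs = self.option_embedder(option_ids)        # (batch, options, embed)
    neigh_embs = self.option_embedder(neighbour_ids)      # (batch, options, neighbours, embed)

    b, o, n, e = neigh_embs.shape
    neigh_ctx = self.neighbour_reasoner(                  # self-attention over each candidate's neighbours
        neigh_embs.view(b * o, n, e),
        mask=torch.ones(b * o, n, dtype=torch.long))
    neigh_vec = neigh_ctx.mean(dim=1).view(b, o, e)       # simple mean pooling (stand-in choice)

    blank = blank_vector.unsqueeze(1).expand_as(option_embs)
    merged = self.data_merger(torch.cat([blank, option_embs, neigh_vec], dim=-1))
    return self.scorer(self.dropout(merged)).squeeze(-1)  # (batch, options) scores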