Example #1
 def test_get_dimensions_is_correct(self):
     lstm = LSTM(bidirectional=True, num_layers=3, input_size=2, hidden_size=7, batch_first=True)
     encoder = PytorchSeq2VecWrapper(lstm)
     assert encoder.get_output_dim() == 14
     assert encoder.get_input_dim() == 2
     lstm = LSTM(bidirectional=False, num_layers=3, input_size=2, hidden_size=7, batch_first=True)
     encoder = PytorchSeq2VecWrapper(lstm)
     assert encoder.get_output_dim() == 7
     assert encoder.get_input_dim() == 2
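The assertions above follow from how the wrapper delegates to the wrapped module: get_input_dim() returns the RNN's input_size, and get_output_dim() returns hidden_size multiplied by the number of directions. A minimal sketch of the same rule with a GRU (the choice of module here is illustrative):

from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from torch.nn import GRU

# Bidirectional: output dim is hidden_size * 2 = 14; input dim is input_size = 2.
encoder = PytorchSeq2VecWrapper(GRU(input_size=2, hidden_size=7,
                                    batch_first=True, bidirectional=True))
assert encoder.get_output_dim() == 14
assert encoder.get_input_dim() == 2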
Example #2
    def __init__(self,
                 vocab_size: int,
                 embedding_size: int,
                 hidden_size: int,
                 label_size: int = 9,
                 dropout: float = 0.2,
                 user_feats_dim: int = 20):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.label_size = label_size
        self.dropout = dropout
        self.user_feats_dim = user_feats_dim

        self.embedding = nn.Embedding(vocab_size, embedding_size)

        self.sentence_rnn = PytorchSeq2VecWrapper(nn.GRU(embedding_size,
                                                         hidden_size,
                                                         batch_first=True,
                                                         bidirectional=True))

        self.review_rnn = PytorchSeq2VecWrapper(nn.GRU(hidden_size * 2,
                                                       hidden_size,
                                                       batch_first=True,
                                                       bidirectional=True))

        self.product_rnn = nn.GRU(hidden_size * 2 + self.user_feats_dim,
                                  hidden_size,
                                  batch_first=True,
                                  bidirectional=True)

        self.review_feedforward = nn.Sequential(
            nn.Linear(hidden_size * 2 + self.user_feats_dim, hidden_size // 2),
            nn.ELU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_size // 2, 1)
        )

        self.product_feedforward = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size // 2),
            nn.ELU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_size // 2, self.label_size)
        )

        if self.user_feats_dim > 0:
            self.user_feats_weights = nn.Parameter(torch.ones(self.user_feats_dim), requires_grad=True)
Example #3
    def test_forward_pulls_out_correct_tensor_with_sequence_lengths(self):
        lstm = LSTM(bidirectional=True,
                    num_layers=3,
                    input_size=3,
                    hidden_size=7,
                    batch_first=True)
        encoder = PytorchSeq2VecWrapper(lstm)

        input_tensor = torch.rand([5, 7, 3])
        input_tensor[1, 6:, :] = 0
        input_tensor[2, 4:, :] = 0
        input_tensor[3, 2:, :] = 0
        input_tensor[4, 1:, :] = 0
        mask = torch.ones(5, 7).bool()
        mask[1, 6:] = False
        mask[2, 4:] = False
        mask[3, 2:] = False
        mask[4, 1:] = False

        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        packed_sequence = pack_padded_sequence(input_tensor,
                                               sequence_lengths.tolist(),
                                               batch_first=True)
        _, state = lstm(packed_sequence)
        # Transpose output state, extract the last forward and backward states and
        # reshape to be of dimension (batch_size, 2 * hidden_size).
        reshaped_state = state[0].transpose(0, 1)[:, -2:, :].contiguous()
        explicitly_concatenated_state = torch.cat(
            [reshaped_state[:, 0, :].squeeze(1),
             reshaped_state[:, 1, :].squeeze(1)], -1)
        encoder_output = encoder(input_tensor, mask)
        assert_almost_equal(encoder_output.data.numpy(),
                            explicitly_concatenated_state.data.numpy())
Example #4
    def test_forward_works_even_with_empty_sequences(self):
        lstm = LSTM(bidirectional=True,
                    num_layers=3,
                    input_size=3,
                    hidden_size=11,
                    batch_first=True)
        encoder = PytorchSeq2VecWrapper(lstm)

        tensor = torch.rand([5, 7, 3])
        tensor[1, 6:, :] = 0
        tensor[2, :, :] = 0
        tensor[3, 2:, :] = 0
        tensor[4, :, :] = 0
        mask = torch.ones(5, 7).bool()
        mask[1, 6:] = False
        mask[2, :] = False
        mask[3, 2:] = False
        mask[4, :] = False

        results = encoder(tensor, mask)

        for i in (0, 1, 3):
            assert not (results[i] == 0.0).data.all()
        for i in (2, 4):
            assert (results[i] == 0.0).data.all()
Example #5
def get_encoder(input_dim, output_dim, encoder_type, args):
    if encoder_type == "bag":
        return BagOfEmbeddingsEncoder(input_dim)
    if encoder_type == "bilstm":
        return PytorchSeq2VecWrapper(
            AllenNLPSequential(torch.nn.ModuleList(
                [get_encoder(input_dim, output_dim, "bilstm-unwrapped",
                             args)]),
                               input_dim,
                               output_dim,
                               bidirectional=True,
                               residual_connection=args.residual_connection,
                               dropout=args.dropout))
    if encoder_type == "bilstm-unwrapped":
        return torch.nn.LSTM(
            input_dim,
            output_dim,
            batch_first=True,
            bidirectional=True,
            dropout=args.dropout,
        )
    if encoder_type == "cnn":
        return CnnEncoder(embedding_dim=input_dim, num_filters=output_dim)
    if encoder_type == "cnn_highway":
        filter_size: int = output_dim // 4
        return CnnHighwayEncoder(
            embedding_dim=input_dim,
            filters=[(2, filter_size), (3, filter_size), (4, filter_size),
                     (5, filter_size)],
            projection_dim=output_dim,
            num_highway=3,
            do_layer_norm=True,
        )
    raise RuntimeError(f"Unknown encoder type={encoder_type}")
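For reference, a hypothetical call to the factory above using the "bag" branch (which ignores output_dim and args); BagOfEmbeddingsEncoder pools the token vectors, so its output dimension equals input_dim:

# Hypothetical usage of get_encoder; dimensions are illustrative.
encoder = get_encoder(input_dim=300, output_dim=128, encoder_type="bag", args=None)
assert encoder.get_output_dim() == 300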
Example #6
 def test_wrapper_raises_if_batch_first_is_false(self):
     with pytest.raises(ConfigurationError):
         lstm = LSTM(bidirectional=True,
                     num_layers=3,
                     input_size=3,
                     hidden_size=7)
         _ = PytorchSeq2VecWrapper(lstm)
Example #7
 def __init__(self, e_dim, h_dim, num_layers=1,
              dropout=0.0, base_rnn=nn.LSTM,
              dropout_p=0.1, bidirectional=False,
              batch_first=True, memory_embed=None,
              use_memory=False, mem_size=None, mem_context_size=None,
              inv_temp=None, use_binary=False):
     super(HashedMemoryRNN, self).__init__()
     self.acc_slots = 10
     self.memory_embeddings = memory_embed
     self.e_dim = e_dim
     self.hidden_size = h_dim
     #self.hh = [Hash(self.memory_embeddings.get_output_dim(), self.mem_size) for _ in range(self.acc_slots)]
     self.memory = KeyValueMemory(use_memory=use_memory,
                                  emb_dim=self.e_dim,
                                  mem_size=mem_size,
                                  mem_context_size=mem_context_size,
                                  inv_temp=inv_temp,
                                  use_binary=use_binary)
     self.lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(self.memory.get_input_size(), h_dim,
                                           num_layers=num_layers,
                                           dropout=dropout,
                                           bidirectional=bidirectional,
                                           batch_first=batch_first))
     self.softmax = torch.nn.Softmax()
     if USE_CUDA:
         self.lstm = self.lstm.cuda()
         self.memory = self.memory.cuda()
Example #8
    def test_forward_pulls_out_correct_tensor_with_sequence_lengths(self):
        lstm = LSTM(bidirectional=True,
                    num_layers=3,
                    input_size=3,
                    hidden_size=7,
                    batch_first=True)
        encoder = PytorchSeq2VecWrapper(lstm)

        tensor = torch.rand([5, 7, 3])
        tensor[1, 6:, :] = 0
        tensor[2, 4:, :] = 0
        tensor[3, 2:, :] = 0
        tensor[4, 1:, :] = 0

        input_tensor = Variable(tensor)
        sequence_lengths = Variable(torch.LongTensor([7, 6, 4, 2, 1]))
        packed_sequence = pack_padded_sequence(input_tensor,
                                               list(sequence_lengths.data),
                                               batch_first=True)
        _, state = lstm(packed_sequence)
        # Transpose output state, extract the last forward and backward states and
        # reshape to be of dimension (batch_size, 2 * hidden_size).
        reshaped_state = state[0].transpose(0, 1)[:, -2:, :].contiguous()
        explicitly_concatenated_state = torch.cat(
            [reshaped_state[:, 0, :].squeeze(1),
             reshaped_state[:, 1, :].squeeze(1)], -1)
        encoder_output = encoder(input_tensor, sequence_lengths)
        assert_almost_equal(encoder_output.data.numpy(),
                            explicitly_concatenated_state.data.numpy())
Example #9
 def test_forward_pulls_out_correct_tensor_without_sequence_lengths(self):
     lstm = LSTM(bidirectional=True, num_layers=3, input_size=2, hidden_size=7, batch_first=True)
     encoder = PytorchSeq2VecWrapper(lstm)
     input_tensor = Variable(torch.FloatTensor([[[.7, .8], [.1, 1.5]]]))
     lstm_output = lstm(input_tensor)
     encoder_output = encoder(input_tensor, None)
     assert_almost_equal(encoder_output.data.numpy(), lstm_output[0].data.numpy()[:, -1, :])
Example #10
    def test_forward_pulls_out_correct_tensor_with_unsorted_batches(self):
        lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
        encoder = PytorchSeq2VecWrapper(lstm)

        tensor = torch.rand([5, 7, 3])
        tensor[0, 3:, :] = 0
        tensor[1, 4:, :] = 0
        tensor[2, 2:, :] = 0
        tensor[3, 6:, :] = 0
        mask = torch.ones(5, 7)
        mask[0, 3:] = 0
        mask[1, 4:] = 0
        mask[2, 2:] = 0
        mask[3, 6:] = 0

        input_tensor = Variable(tensor)
        mask = Variable(mask)
        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        sorted_inputs, sorted_sequence_lengths, restoration_indices = sort_batch_by_length(input_tensor,
                                                                                           sequence_lengths)
        packed_sequence = pack_padded_sequence(sorted_inputs,
                                               sorted_sequence_lengths.data.tolist(),
                                               batch_first=True)
        _, state = lstm(packed_sequence)
        # Transpose output state, extract the last forward and backward states and
        # reshape to be of dimension (batch_size, 2 * hidden_size).
        sorted_transposed_state = state[0].transpose(0, 1).index_select(0, restoration_indices)
        reshaped_state = sorted_transposed_state[:, -2:, :].contiguous()
        explicitly_concatenated_state = torch.cat([reshaped_state[:, 0, :].squeeze(1),
                                                   reshaped_state[:, 1, :].squeeze(1)], -1)
        encoder_output = encoder(input_tensor, mask)
        assert_almost_equal(encoder_output.data.numpy(), explicitly_concatenated_state.data.numpy())
Example #11
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/mt/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/mt/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set,
                                      min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=3)

    trainer.train()
Example #12
 def get_wrapped_encoder(encoder_list):
     return PytorchSeq2VecWrapper(
         AllenNLPSequential(torch.nn.ModuleList(encoder_list),
                            elmo_embedding_dim,
                            hidden_dim,
                            bidirectional=True,
                            residual_connection=residual_connection,
                            dropout=dropout))
Example #13
    def test_wrapper_works_with_alternating_lstm(self):
        model = PytorchSeq2VecWrapper(
            StackedAlternatingLstm(input_size=4, hidden_size=5, num_layers=3))

        input_tensor = torch.randn(2, 3, 4)
        mask = torch.ones(2, 3).bool()
        output = model(input_tensor, mask)
        assert tuple(output.size()) == (2, 5)
Example #14
def multitask_learning():
    # load datasetreader 
    # Save logging to a local file
    # Multitasking
    log.getLogger().addHandler(log.FileHandler(directory+"/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 10 
    max_seq_len = 512
    max_span_width = 30
    #token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True,)
    token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    conll_reader = ConllCorefBertReader(max_span_width = max_span_width, token_indexers = {"tokens": token_indexer})
    swag_reader = SWAGDatasetReader(tokenizer=token_indexer.wordpiece_tokenizer,lazy=True, token_indexers=token_indexer)
    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200
    conll_datasets, swag_datasets = load_datasets(conll_reader, swag_reader, directory)
    conll_vocab = Vocabulary()
    swag_vocab = Vocabulary()
    conll_iterator = BasicIterator(batch_size=batch_size)
    conll_iterator.index_with(conll_vocab)

    swag_vocab = Vocabulary()
    swag_iterator = BasicIterator(batch_size=batch_size)
    swag_iterator.index_with(swag_vocab)


    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

    bert_embedder = PretrainedBertEmbedder(pretrained_model="bert-base-cased",top_layer_only=True, requires_grad=True)

    word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
    BERT_DIM = word_embedding.get_output_dim()

    seq2seq = PytorchSeq2SeqWrapper(torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    seq2vec = PytorchSeq2VecWrapper(torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    mention_feedforward = FeedForward(input_dim=2336, num_layers=2,
                                      hidden_dims=150, activations=torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim=7776, num_layers=2,
                                         hidden_dims=150, activations=torch.nn.ReLU())
    model1 = CoreferenceResolver(vocab=conll_vocab,
                                 text_field_embedder=word_embedding,
                                 context_layer=seq2seq,
                                 mention_feedforward=mention_feedforward,
                                 antecedent_feedforward=antecedent_feedforward,
                                 feature_size=768,
                                 max_span_width=max_span_width,
                                 spans_per_word=0.4,
                                 max_antecedents=250,
                                 lexical_dropout=0.2)

    model2 = SWAGExampleModel(vocab=swag_vocab, text_field_embedder=word_embedding, phrase_encoder=seq2vec)
    optimizer1 = optim.Adam(model1.parameters(), lr=lr)
    optimizer2 = optim.Adam(model2.parameters(), lr=lr)

    swag_train_iterator = swag_iterator(swag_datasets[0], num_epochs=1, shuffle=True)
    conll_train_iterator = conll_iterator(conll_datasets[0], num_epochs=1, shuffle=True)
    swag_val_iterator = swag_iterator(swag_datasets[1], num_epochs=1, shuffle=True)
    conll_val_iterator = conll_iterator(conll_datasets[1], num_epochs=1, shuffle=True)
    task_infos = {"swag": {"model": model2, "optimizer": optimizer2, "loss": 0.0, "iterator": swag_iterator, "train_data": swag_datasets[0], "val_data": swag_datasets[1], "num_train": len(swag_datasets[0]), "num_val": len(swag_datasets[1]), "lr": lr, "score": {"accuracy":0.0}}, \
                    "conll": {"model": model1, "iterator": conll_iterator, "loss": 0.0, "val_data": conll_datasets[1], "train_data": conll_datasets[0], "optimizer": optimizer1, "num_train": len(conll_datasets[0]), "num_val": len(conll_datasets[1]),"lr": lr, "score": {"coref_prediction": 0.0, "coref_recall": 0.0, "coref_f1": 0.0,"mention_recall": 0.0}}}
    USE_GPU = 1
    trainer = MultiTaskTrainer(
        task_infos=task_infos, 
        num_epochs=epochs,
        serialization_dir=directory + "saved_models/multitask/"
    ) 
    metrics = trainer.train()
Example #15
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
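A sketch of a drop-in replacement for the encoder construction above that avoids the hard-coded elmo_embedding_dim by asking the embedder for its output size (the same pattern appears in other examples on this page):

    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(), HIDDEN_DIM, batch_first=True))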
Example #16
    def __init__(self, config, device, vocab_size, pad_idx=0):
        super().__init__()

        self.emb_dim = config.pop("embedding_dim")
        self.hidden_size = config.pop("hidden_size")
        self.d = numpy.sqrt(self.hidden_size)
        self.vocab_size = vocab_size
        self.pad_idx = pad_idx

        self.embedding = Embedding(self.vocab_size,
                                   self.emb_dim,
                                   padding_idx=self.pad_idx)

        self.state_embedder = PytorchSeq2SeqWrapper(
            LSTM(batch_first=True,
                 input_size=self.emb_dim,
                 hidden_size=self.hidden_size))

        self.state_recurrence = PytorchSeq2VecWrapper(
            GRU(
                batch_first=True,
                input_size=self.hidden_size,
                hidden_size=self.hidden_size,
            ))

        self.action_embedder = PytorchSeq2VecWrapper(
            GRU(batch_first=True,
                input_size=self.emb_dim,
                hidden_size=self.hidden_size))

        self.recipe_embedder = PytorchSeq2VecWrapper(
            LSTM(batch_first=True,
                 input_size=self.emb_dim,
                 hidden_size=self.hidden_size))

        self.state_to_hidden = Linear(self.hidden_size, self.hidden_size)
        self.state_to_hidden2 = Linear(self.hidden_size, self.hidden_size // 2)

        self.action_to_hidden = Linear(self.hidden_size, self.hidden_size)
        self.action_to_hidden2 = Linear(self.hidden_size,
                                        self.hidden_size // 2)

        self.elu = ELU()
        self.device = device
Example #17
def init_gru(vocab, d_embedding, hidden_rnn_sz, rnn_num_layers,
             rnn_dropout, all_code_types, feedforward_num_layers, feedforward_hidden_dims, feedforward_activations,
             feedforward_dropout, leadlag, add_time, t_max, t_scale, use_timestamps, split_paths):
    """Construct and train GRU"""

    # Init feedforward params
    feedforward_hidden_dims = [feedforward_hidden_dims] * feedforward_num_layers
    feedforward_activations = [Activation.by_name(feedforward_activations)()] * feedforward_num_layers
    feedforward_dropout = [feedforward_dropout] * feedforward_num_layers

    # Needed for final layer
    feedforward_num_layers += 1
    feedforward_hidden_dims.append(1)
    feedforward_activations.append(Activation.by_name('linear')())
    feedforward_dropout.append(0)

    # Handle Augmentations
    augmentations = []
    if add_time:
        augmentations.append('add_time')
    if leadlag:
        augmentations.append('leadlag')

    d_embedding_updated = update_dims(augmentations, d_embedding)
    i_augmentations = init_augmentations(augmentations, use_timestamps=use_timestamps, t_max=t_max, t_scale=t_scale)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size(),
                                embedding_dim=d_embedding)

    # Embedder maps the input tokens to the appropriate embedding matrix
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Encoder takes path of (N, L, C) and encodes into state vector
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=d_embedding)
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.GRU(d_embedding_updated, hidden_rnn_sz, num_layers=rnn_num_layers, batch_first=True, dropout=rnn_dropout))

    classifier_feedforward: FeedForward = FeedForward(
        input_dim=encoder.get_output_dim() * 3 if (all_code_types and split_paths) else encoder.get_output_dim(),
        num_layers=feedforward_num_layers,
        hidden_dims=feedforward_hidden_dims,
        activations=feedforward_activations,
        dropout=feedforward_dropout
    )

    model = BaseModel(
        vocab,
        word_embeddings,
        encoder,
        classifier_feedforward,
        augmentations=i_augmentations
    )
    return model
Example #18
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader,
                                               train_path,
                                               batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader,
                                             dev_path,
                                             batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(),
                                            dev_data_loader.iter_instances()),
                                      min_count={'tokens': 3})
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(model=model,
                                     optimizer=optimizer,
                                     data_loader=train_data_loader,
                                     validation_data_loader=dev_data_loader,
                                     patience=10,
                                     num_epochs=20,
                                     cuda_device=-1)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
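As the comment above notes, any Seq2VecEncoder can stand in for the wrapped LSTM; a sketch of two drop-in alternatives shipped with AllenNLP (filter counts and sizes are illustrative, EMBEDDING_DIM is the constant used in this example):

from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder, CnnEncoder

# Averaging over token vectors: output dim equals EMBEDDING_DIM.
boe_encoder = BagOfEmbeddingsEncoder(embedding_dim=EMBEDDING_DIM, averaged=True)

# CNN over n-grams: output dim is num_filters * len(ngram_filter_sizes) unless projected.
cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=64,
                         ngram_filter_sizes=(2, 3))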
Example #19
def gru_encoder(input_dim: int, output_dim: int, num_layers: int = 1,
                bidirectional: bool = False, dropout: float = 0.0
                ) -> Seq2VecEncoder:
    """
    Our encoder is going to be a GRU. We have to wrap it for AllenNLP,
    though.
    """
    return PytorchSeq2VecWrapper(torch.nn.GRU(
        input_dim, output_dim, batch_first=True, num_layers=num_layers,
        bidirectional=bidirectional, dropout=dropout))
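A hypothetical call to this helper (dimensions are illustrative); with bidirectional=True the wrapper reports twice the per-direction hidden size:

encoder = gru_encoder(input_dim=300, output_dim=128, num_layers=2,
                      bidirectional=True, dropout=0.1)
assert encoder.get_output_dim() == 256  # 128 per direction, concatenated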
Example #20
def train_only_swag():
    # load datasetreader 
    # Save logging to a local file
    # Multitasking
    log.getLogger().addHandler(log.FileHandler(directory+"/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 100
    max_seq_len = 512
    max_span_width = 30
    #token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True,)
    token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    swag_reader = SWAGDatasetReader(tokenizer=token_indexer.wordpiece_tokenizer,lazy=True, token_indexers=token_indexer)
    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200
    swag_datasets = load_swag(swag_reader, directory)
    swag_vocab = Vocabulary()

    swag_vocab = Vocabulary()
    swag_iterator = BasicIterator(batch_size=batch_size)
    swag_iterator.index_with(swag_vocab)

    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

    bert_embedder = PretrainedBertEmbedder(pretrained_model="bert-base-cased",top_layer_only=True, requires_grad=True)

    word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
    BERT_DIM = word_embedding.get_output_dim()
    seq2vec = PytorchSeq2VecWrapper(torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    mention_feedforward = FeedForward(input_dim=2336, num_layers=2,
                                      hidden_dims=150, activations=torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim=7776, num_layers=2,
                                         hidden_dims=150, activations=torch.nn.ReLU())

    model = SWAGExampleModel(vocab=swag_vocab, text_field_embedder=word_embedding, phrase_encoder=seq2vec)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    USE_GPU = 1
    val_iterator = swag_iterator(swag_datasets[1], num_epochs=1, shuffle=True)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=swag_iterator,
        validation_iterator=swag_iterator,
        train_dataset=swag_datasets[0],
        validation_dataset=swag_datasets[1],
        validation_metric="+accuracy",
        cuda_device=0 if USE_GPU else -1,
        serialization_dir=directory + "saved_models/current_run_model_state_swag",
        num_epochs=epochs,
    )

    metrics = trainer.train()
    # save the model
    with open(directory + "saved_models/current_run_model_state", 'wb') as f:
        torch.save(model.state_dict(), f)
Example #21
def build_model(options_file, weight_file):
    vocab = Vocabulary()
    iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(), config.hidden_size, bidirectional=True, batch_first=True))
    model = BaselineModel(word_embeddings, encoder, vocab)

    return model, iterator, vocab
Example #22
    def __init__(self, config: Config):
        super().__init__()

        for k, v in vars(config).items():
            setattr(self, k, v)

        self.bert = pt.BertModel.from_pretrained(self.pretrained_data_dir)

        self.embedding_dim = self.bert.config.to_dict()['hidden_size']

        self.dropout = nn.Dropout(self.dropout)

        self.sentence_rnn = PytorchSeq2VecWrapper(
            nn.GRU(self.embedding_dim,
                   self.hidden_dim,
                   batch_first=True,
                   bidirectional=True))

        self.review_rnn = PytorchSeq2VecWrapper(
            nn.GRU(self.sentence_rnn.get_output_dim(),
                   self.hidden_dim,
                   batch_first=True,
                   bidirectional=True))

        self.product_rnn = nn.GRU(self.hidden_dim * 2 + self.user_feats_dim,
                                  self.hidden_dim,
                                  batch_first=True,
                                  bidirectional=True)

        self.review_feedforward = nn.Sequential(
            nn.Linear(self.hidden_dim * 2 + self.user_feats_dim,
                      self.hidden_dim // 2), self.dropout, nn.ELU(),
            nn.Linear(self.hidden_dim // 2, 1))

        self.product_feedforward = nn.Sequential(
            nn.Linear(self.hidden_dim * 2, self.hidden_dim // 2), self.dropout,
            nn.ELU(), nn.Linear(self.hidden_dim // 2, self.output_dim))

        if self.user_feats_dim > 0:
            self.user_feats_weights = nn.Parameter(
                torch.ones(self.user_feats_dim))
Example #23
def get_encoder(voc: Vocabulary,
                embed_out_dim: int,
                name: str = config.embedder):
    if name == 'bert':
        bert = BertSentencePooler(voc)
        bert.out_dim = embed_out_dim
        return bert
    else:
        return PytorchSeq2VecWrapper(
            module=nn.GRU(embed_out_dim,
                          dropout=config.dropout,
                          hidden_size=config.lstm_hid_size,
                          bidirectional=True,
                          batch_first=True))
Example #24
def main():
    cuda_device = -1

    torch.manual_seed(SEED)

    elmo_embedder = ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      HIDDEN_DIM,
                      bidirectional=True,
                      batch_first=True))

    train_dataset, dev_dataset = dataset_reader(train=True, elmo=True)
    vocab = Vocabulary()

    model = BaseModel(word_embeddings=word_embeddings,
                      encoder=lstm,
                      vocabulary=vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)

    iterator = data_iterator(vocab)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      cuda_device=cuda_device,
                      num_epochs=EPOCHS,
                      patience=5)

    trainer.train()

    print("*******Save Model*******\n")

    output_elmo_model_file = os.path.join(PRETRAINED_ELMO,
                                          "lstm_elmo_model.bin")
    torch.save(model.state_dict(), output_elmo_model_file)
Example #25
    def __init__(self, conf_a):
        super(Generator, self).__init__()
        self.cf_a = conf_a
        self.LSTM_mode = 0
        
        if self.LSTM_mode == 0:
            self.encLinear1 = nn.Linear(conf_a.Z_dim, conf_a.H_enc1)
        else:
            self.encLinear1 = PytorchSeq2VecWrapper(
                torch.nn.LSTM(conf_a.Z_dim, hidden_size=conf_a.H_enc1,
                              batch_first=True, bidirectional=False,
                              num_layers=1, dropout=0.0))
        # GENERATOR
        self.activation_func_enc1 = conf_a.activation_func_enc1
        self.hidden_to_signal = nn.Linear(conf_a.H_enc1, conf_a.D_in)

        ## Optimizer
        self.optimizer = pytut.get_optimizers(self, self.cf_a)
Example #26
    def __init__(self,
                 indexer: DocumentIndexer,
                 embedding_matrix: torch.Tensor,
                 dims=None):
        super(SampleEncoder, self).__init__()
        if dims is None:
            dims = default_dims
        self.dims = dims
        words_emb_size = embedding_matrix.size(1)
        self.word_embedder = nn.Embedding.from_pretrained(embedding_matrix)
        self.word_dropout = nn.Dropout(dims['dropout_input'])

        self.char_embedder = nn.Embedding(len(indexer.char_vocab),
                                          dims['char_emb_size'])
        self.case_embedder = nn.Embedding(len(indexer.case_vocab),
                                          dims['case_emb_size'])
        self.pos_embedder = nn.Embedding(len(indexer.pos_vocab),
                                         dims['pos_emb_size'])
        self.ner_embedder = nn.Embedding(len(indexer.ner_vocab),
                                         dims['ner_emb_size'])
        self.char_encoder = PytorchSeq2VecWrapper(
            nn.LSTM(dims['char_emb_size'],
                    dims['chars_hidden'],
                    batch_first=True,
                    bidirectional=True))

        total_emb_size = words_emb_size + dims['case_emb_size'] + 2 * dims['chars_hidden'] \
                         + dims['pos_emb_size'] + dims['ner_emb_size']

        self.encoder = PytorchSeq2SeqWrapper(
            nn.LSTM(total_emb_size,
                    dims['hidden'],
                    batch_first=True,
                    bidirectional=True,
                    num_layers=2))
        self.sent_dropout = nn.Dropout(dims['dropout_lstm'])

        self.feedforward = FeedForward(2 * dims['hidden'],
                                       1,
                                       dims['feedforward'],
                                       activations=nn.Tanh())
        self.attention = nn.Linear(2 * dims['hidden'], dims['attention_dim'])
        self.scores = nn.Linear(dims['attention_dim'], 1)
        self.hidden2tag = nn.Linear(2 * dims['hidden'],
                                    len(indexer.relation_type_vocab))
        self.out_dropout = nn.Dropout(dims['dropout_lstm'])
Example #27
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #28
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
Example #29
    def __init__(self, args, out_sz: int, vocab: Vocabulary):
        super().__init__(vocab)

        # prepare embeddings
        token_embedding = Embedding(num_embeddings=args.max_vocab_size + 2,
                                    embedding_dim=300,
                                    padding_index=0)
        self.word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        self.encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
            nn.LSTM(self.word_embeddings.get_output_dim(),
                    hidden_size=64,
                    bidirectional=True,
                    batch_first=True))

        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = nn.CrossEntropyLoss()
Example #30
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/tatoeba/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/tatoeba/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set, min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings,
                           encoder,
                           vocab,
                           positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=10)

    trainer.train()

    classify('Take your raincoat in case it rains.', model)
    classify('Tu me recuerdas a mi padre.', model)
    classify('Wie organisierst du das Essen am Mittag?', model)
    classify("Il est des cas où cette règle ne s'applique pas.", model)
    classify('Estou fazendo um passeio em um parque.', model)
    classify('Ve, postmorgaŭ jam estas la limdato.', model)
    classify('Credevo che sarebbe venuto.', model)
    classify('Nem tudja, hogy én egy macska vagyok.', model)
    classify('Nella ur nli qrib acemma deg tenwalt.', model)
    classify('Kurşun kalemin yok, değil mi?', model)