    def setUp(self):
        super(SemanticRoleLabelerTest, self).setUp()

        dataset = SrlReader().read('tests/fixtures/conll_2012/')
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset

        params = Params({
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "stacked_encoder": {
                "type": "lstm",
                "input_size": 6,
                "hidden_size": 7,
                "num_layers": 2
            }
        })

        self.model = SemanticRoleLabeler.from_params(self.vocab, params)
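
    # A minimal sketch (not part of the original test) of how the objects built in
    # setUp above are typically exercised. It mirrors the iterator/forward pattern
    # used in the SQuAD examples below and assumes the batch produced by the
    # iterator can be passed straight to the model's forward(); older API versions
    # may need an explicit array-to-tensor conversion first.
    def test_forward_pass_runs(self):
        iterator = BasicIterator(batch_size=2)
        for batch in iterator(self.dataset, num_epochs=1):
            output_dict = self.model(**batch)
            # With gold tags in the batch, the model should also return a loss.
            assert "loss" in output_dict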
Example #2
    def test_forward(self):
        lr = 0.5
        batch_size = 16
        embedding_dim = 50

        squad_reader = SquadReader()
        # Read SQuAD train set (use the test set, since it's smaller)
        train_dataset = squad_reader.read(self.squad_test)
        vocab = Vocabulary.from_dataset(train_dataset)
        train_dataset.index_instances(vocab)

        # Random embeddings for test
        test_embed_matrix = torch.rand(vocab.get_vocab_size(), embedding_dim)
        test_cbow = CBOW(test_embed_matrix)
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                          test_cbow.parameters()),
                                   lr=lr)

        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("passage", "num_tokens"),
                                                ("question", "num_tokens")])
        for batch in iterator(train_dataset, num_epochs=1):
            passage = batch["passage"]["tokens"]
            question = batch["question"]["tokens"]
            span_start = batch["span_start"]
            span_end = batch["span_end"]
            output_dict = test_cbow(passage, question)
            softmax_start_logits = output_dict["softmax_start_logits"]
            softmax_end_logits = output_dict["softmax_end_logits"]
            loss = nll_loss(softmax_start_logits, span_start.view(-1))
            loss += nll_loss(softmax_end_logits, span_end.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
Example #3
    def test_batch_predictions_are_consistent(self):
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_dataset = self.dataset
        saved_model = self.model

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.dataset = reader.read('tests/fixtures/data/squad.json')
        vocab = Vocabulary.from_dataset(self.dataset)
        self.dataset.index_instances(vocab)
        del params['model']['text_field_embedder']['token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab, params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.dataset = saved_dataset
Example #4
    def setUp(self):
        super(TestTrainer, self).setUp()
        dataset = SequenceTaggingDatasetReader().read(
            'tests/fixtures/data/sequence_tagging.tsv')
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset
        self.model_params = Params({
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "stacked_encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        })
        self.model = SimpleTagger.from_params(self.vocab, self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
        self.iterator = BasicIterator(batch_size=2)
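
    # A hedged sketch (not in the original snippet) of the kind of test this setUp
    # supports: wiring the model, optimizer, iterator, and dataset into a Trainer
    # and running a couple of epochs. The exact Trainer signature is assumed from
    # this older AllenNLP-style API.
    def test_trainer_can_train(self):
        trainer = Trainer(self.model, self.optimizer, self.iterator,
                          self.dataset, num_epochs=2)
        trainer.train()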
Example #5
    def setUp(self):
        super(SimpleTaggerTest, self).setUp()
        dataset = SequenceTaggingDatasetReader().read(
            'tests/fixtures/data/sequence_tagging.tsv')
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset

        params = Params({
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "stacked_encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        })

        self.model = SimpleTagger.from_params(self.vocab, params)
Example #6
File: data.py  Project: liuz37/NLP-HW4
def read_data(squad_train_path, squad_dev_path, max_passage_length,
              max_question_length, min_token_count):
    """
    Read SQuAD data, and filter by passage and question length.
    """
    squad_reader = SquadReader()
    # Read SQuAD train set
    train_dataset = squad_reader.read(squad_train_path)
    logger.info("Read {} training examples".format(len(
        train_dataset.instances)))

    # Filter out examples with passage length greater than max_passage_length
    # or question length greater than max_question_length
    logger.info("Filtering out examples in train set with passage length "
                "greater than {} or question length greater than {}".format(
                    max_passage_length, max_question_length))
    train_dataset.instances = [
        instance for instance in tqdm(train_dataset.instances)
        if len(instance.fields["passage"].tokens) <= max_passage_length
        and len(instance.fields["question"].tokens) <= max_question_length
    ]
    logger.info("{} training examples remain after filtering".format(
        len(train_dataset.instances)))

    # Make a vocabulary object from the train set
    train_vocab = Vocabulary.from_dataset(train_dataset,
                                          min_count=min_token_count)

    # Index the instances with the train vocabulary.
    # This converts string tokens to numerical indices.
    train_dataset.index_instances(train_vocab)

    # Read SQuAD validation set
    logger.info("Reading SQuAD validation set at {}".format(squad_dev_path))
    validation_dataset = squad_reader.read(squad_dev_path)
    logger.info("Read {} validation examples".format(
        len(validation_dataset.instances)))

    # Filter out examples with passage length greater than max_passage_length
    # or question length greater than max_question_length
    logger.info("Filtering out examples in validation set with passage length "
                "greater than {} or question length greater than {}".format(
                    max_passage_length, max_question_length))
    validation_dataset.instances = [
        instance for instance in tqdm(validation_dataset.instances)
        if len(instance.fields["passage"].tokens) <= max_passage_length
        and len(instance.fields["question"].tokens) <= max_question_length
    ]
    logger.info("{} validation examples remain after filtering".format(
        len(validation_dataset.instances)))

    # Index the instances with the train vocabulary.
    # This converts string tokens to numerical indices.
    validation_dataset.index_instances(train_vocab)
    return train_dataset, train_vocab, validation_dataset
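
# A brief usage sketch (assumed, not part of data.py): the datasets and vocabulary
# returned by read_data plug directly into the bucketed iteration pattern used in
# the test examples above. The file paths and size limits here are placeholders.
def example_read_and_iterate():
    train_dataset, train_vocab, validation_dataset = read_data(
        "train-v1.1.json", "dev-v1.1.json",
        max_passage_length=300, max_question_length=50, min_token_count=3)
    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("passage", "num_tokens"),
                                            ("question", "num_tokens")])
    for batch in iterator(train_dataset, num_epochs=1):
        passage = batch["passage"]["tokens"]      # indexed passage tokens
        question = batch["question"]["tokens"]    # indexed question tokens
        # ... feed passage/question to a model here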
Example #7
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        dataset = reader.read(dataset_file)
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset
        self.model = Model.from_params(self.vocab, params['model'])
Example #8
    def setUp(self):
        super(TestDecomposableAttention, self).setUp()

        constants.GLOVE_PATH = 'tests/fixtures/glove.6B.300d.sample.txt.gz'
        dataset = SnliReader().read('tests/fixtures/data/snli.jsonl')
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

        self.model = DecomposableAttention.from_params(self.vocab, Params({}))
        initializer = InitializerApplicator()
        initializer(self.model)
Example #9
    def test_forward(self):
        lr = 0.5
        batch_size = 16
        embedding_dim = 50
        hidden_size = 15
        dropout = 0.2

        squad_reader = SquadReader()
        # Read SQuAD train set (use the test set, since it's smaller)
        train_dataset = squad_reader.read(self.squad_test)
        vocab = Vocabulary.from_dataset(train_dataset)
        train_dataset.index_instances(vocab)

        # Random embeddings for test
        test_embed_matrix = torch.rand(vocab.get_vocab_size(), embedding_dim)
        test_attention_rnn = AttentionRNN(test_embed_matrix, hidden_size,
                                          dropout)
        try:
            optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                              test_attention_rnn.parameters()),
                                       lr=lr)
        except ValueError:
            # Likely there are no parameters to optimize, because
            # the code is not complete.
            pass

        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("passage", "num_tokens"),
                                                ("question", "num_tokens")])
        for batch in iterator(train_dataset, num_epochs=1):
            passage = batch["passage"]["tokens"]
            question = batch["question"]["tokens"]
            span_start = batch["span_start"]
            span_end = batch["span_end"]
            try:
                output_dict = test_attention_rnn(passage, question)
                softmax_start_logits = output_dict["softmax_start_logits"]
                softmax_end_logits = output_dict["softmax_end_logits"]
                loss = nll_loss(softmax_start_logits, span_start.view(-1))
                loss += nll_loss(softmax_end_logits, span_end.view(-1))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            except NotImplementedError:
                # AttentionRNN.forward() not implemented yet, don't fail tests.
                pass
Example #10
    def setUp(self):
        super(SimpleTaggerTest, self).setUp()
        self.write_sequence_tagging_data()

        dataset = SequenceTaggingDatasetReader().read(self.TRAIN_FILE)
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset

        params = Params({
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "hidden_size": 7,
            "num_layers": 2
        })

        self.model = SimpleTagger.from_params(self.vocab, params)
Example #11
    def setUp(self):
        super(BidirectionalAttentionFlowTest, self).setUp()

        constants.GLOVE_PATH = 'tests/fixtures/glove.6B.100d.sample.txt.gz'
        reader_params = Params({
            'token_indexers': {
                'tokens': {
                    'type': 'single_id'
                },
                'token_characters': {
                    'type': 'characters'
                }
            }
        })
        dataset = SquadReader.from_params(reader_params).read(
            'tests/fixtures/data/squad.json')
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset
        self.token_indexers = {
            'tokens': SingleIdTokenIndexer(),
            'token_characters': TokenCharactersIndexer()
        }

        self.model = BidirectionalAttentionFlow.from_params(
            self.vocab, Params({}))

        small_params = Params({
            'text_field_embedder': {
                'tokens': {
                    'type': 'embedding',
                    'pretrained_file': constants.GLOVE_PATH,
                    'trainable': False,
                    'projection_dim': 4
                },
                'token_characters': {
                    'type': 'character_encoding',
                    'embedding': {
                        'embedding_dim': 8
                    },
                    'encoder': {
                        'type': 'cnn',
                        'embedding_dim': 8,
                        'num_filters': 4,
                        'ngram_filter_sizes': [5]
                    }
                }
            },
            'phrase_layer': {
                'type': 'lstm',
                'bidirectional': True,
                'input_size': 8,
                'hidden_size': 4,
                'num_layers': 1,
            },
            'similarity_function': {
                'type': 'linear',
                'combination': 'x,y,x*y',
                'tensor_1_dim': 8,
                'tensor_2_dim': 8
            },
            'modeling_layer': {
                'type': 'lstm',
                'bidirectional': True,
                'input_size': 32,
                'hidden_size': 4,
                'num_layers': 1,
            },
            'span_end_encoder': {
                'type': 'lstm',
                'bidirectional': True,
                'input_size': 56,
                'hidden_size': 4,
                'num_layers': 1,
            },
        })
        self.small_model = BidirectionalAttentionFlow.from_params(
            self.vocab, small_params)
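
        # Where the hard-coded sizes in small_params come from, assuming standard
        # BiDAF wiring (inferred, not stated in the original test):
        #   phrase_layer input_size:     4 (token projection) + 4 (char CNN filters) = 8
        #   modeling_layer input_size:   4 * phrase encoding dim (2 * hidden of 4 = 8) = 32
        #   span_end_encoder input_size: 4 * 8 (merged passage) + 3 * 8 (modeling dim) = 56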