Example #1
 def setUp(self):
     super().setUp()
     self.instances = SequenceTaggingDatasetReader().read(
         self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params({
         "text_field_embedder": {
             "token_embedders": {
                 "tokens": {
                     "type": "embedding",
                     "embedding_dim": 5
                 }
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         },
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab,
                                           params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(),
                                      0.01,
                                      momentum=0.9)
     self.iterator = BasicIterator(batch_size=2)
     self.iterator.index_with(vocab)
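These attributes feed the legacy pre-1.0 training loop. A minimal sketch of how they would be wired together, assuming the old allennlp.training.Trainer keyword names (self refers to the fixture above):

from allennlp.training import Trainer

# Train the tagger for a couple of epochs from the fixture's pieces.
trainer = Trainer(model=self.model,
                  optimizer=self.optimizer,
                  iterator=self.iterator,
                  train_dataset=self.instances,
                  num_epochs=2)
metrics = trainer.train()  # returns a dict of training metrics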
Example #2
 def setUp(self):
     super(TestTrainer, self).setUp()
     dataset = SequenceTaggingDatasetReader().read(
         'tests/fixtures/data/sequence_tagging.tsv')
     vocab = Vocabulary.from_dataset(dataset)
     self.vocab = vocab
     dataset.index_instances(vocab)
     self.dataset = dataset
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5
             }
         },
         "stacked_encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(self.vocab, self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
     self.iterator = BasicIterator(batch_size=2)
Example #3
    def setUp(self):
        super().setUp()

        # A lot of the tests want access to the metric tracker
        # so we add a property that gets it by grabbing it from
        # the relevant callback.
        def metric_tracker(self: CallbackTrainer):
            for callback in self.handler.callbacks():
                if isinstance(callback, TrackMetrics):
                    return callback.metric_tracker
            return None

        setattr(CallbackTrainer, "metric_tracker", property(metric_tracker))

        self.instances = SequenceTaggingDatasetReader().read(
            self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
        )
        vocab = Vocabulary.from_instances(self.instances)
        self.vocab = vocab
        self.model_params = Params(
            {
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            }
        )
        self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
        self.iterator = BasicIterator(batch_size=2)
        self.iterator.index_with(vocab)
Example #4
 def init_crf_model(self) -> Model:
     """Initialize the CRF tagger model."""
     # 1. import related modules
     from allennlp.models import SimpleTagger
     from allennlp.modules.seq2seq_encoders import PassThroughEncoder
     from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
     from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

     bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
     tagger = SimpleTagger(
         vocab=self.vocab,
         text_field_embedder=BasicTextFieldEmbedder(
             token_embedders={
                 'tokens': bert_text_field_embedder
             }
         ),
         encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
         verbose_metrics=True,
         calculate_span_f1=True,
         label_encoding="BMES",
     )
     
     tagger.to(device=self.config.device)
     return tagger
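A note on the encoder choice: SimpleTagger expects a Seq2SeqEncoder, but the transformer embedder already returns contextualized vectors, so the example wraps it in PassThroughEncoder, which forwards its input unchanged. Sizing it with bert_text_field_embedder.get_output_dim() keeps the tagger's tag-projection layer consistent with the embedder's output dimension.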
Example #5
 def setup_method(self):
     super().setup_method()
     self.instances = SequenceTaggingDatasetReader().read(
         self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
     )
     vocab = Vocabulary.from_instances(self.instances)
     self.model_params = Params(
         {
             "text_field_embedder": {
                 "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
             },
             "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
         }
     )
     self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)
Example #6
def simple_tagger_model():
    """Create a simple tagger model and return a function that runs it on JSON input."""
    # this is a bad hack to get the same data as the test case
    # TODO(joelgrus): replace this
    test_case = AllenNlpTestCase()
    test_case.setUp()
    test_case.write_sequence_tagging_data()
    dataset = SequenceTaggingDatasetReader().read(test_case.TRAIN_FILE)

    vocab = Vocabulary.from_dataset(dataset)
    dataset.index_instances(vocab)

    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "hidden_size": 7,
        "num_layers": 2
    })
    model = SimpleTagger.from_params(vocab, params)
    tokenizer = WordTokenizer()

    def run(blob: JSON):
        sentence = blob.get("input", "")
        tokens = tokenizer.tokenize(sentence)
        text = TextField(tokens,
                         token_indexers={"tokens": SingleIdTokenIndexer()})
        output = model.tag(text)

        # convert np array to serializable list
        output['class_probabilities'] = output['class_probabilities'].tolist()

        possible_tags = list(
            vocab.get_index_to_token_vocabulary("tags").values())
        return {
            'model_name': 'simple_tagger',
            'input': sentence,
            'output': output,
            'tokens': [str(token) for token in tokens],  # Token objects are not JSON-serializable
            'possible_tags': possible_tags
        }

    return run
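A hypothetical invocation of the returned closure (the input sentence is made up for illustration):

tag = simple_tagger_model()
result = tag({"input": "cats are animals ."})
print(result["possible_tags"])  # the tag namespace from the fixture vocabulary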
Example #7
    def setUp(self):
        super().setUp()
        # TODO make this a set of dataset readers
        # Classification may be easier in this case. Same dataset reader but with different paths
        self.instances_list = []
        self.instances_list.append(SequenceTaggingDatasetReader().read(
            self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging.tsv'))
        self.instances_list.append(SequenceTaggingDatasetReader().read(
            self.FIXTURES_ROOT / 'data' / 'meta_seq' /
            'sequence_tagging1.tsv'))
        self.instances_list.append(SequenceTaggingDatasetReader().read(
            self.FIXTURES_ROOT / 'data' / 'meta_seq' /
            'sequence_tagging2.tsv'))
        # loop through dataset readers and extend vocab
        combined_vocab = Vocabulary.from_instances(self.instances_list[0])

        for instances in self.instances_list:
            combined_vocab.extend_from_instances(Params({}),
                                                 instances=instances)
        self.vocab = combined_vocab
        # Figure out params TODO
        self.model_params = Params({
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        })
        self.model = SimpleTagger.from_params(vocab=self.vocab,
                                              params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         0.01,
                                         momentum=0.9)
        self.iterator = BasicIterator(batch_size=2)
        self.iterator.index_with(combined_vocab)
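Note that extend_from_instances grows the existing vocabulary namespaces in place, so combined_vocab ends up covering the tokens and tags of all three fixture files (the first file is counted twice, once at construction and once in the loop, which is harmless for vocabulary building).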
Example #8
 def setUp(self):
     super(TestOptimizer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.model_params = Params({
         u"text_field_embedder": {
             u"tokens": {
                 u"type": u"embedding",
                 u"embedding_dim": 5
             }
         },
         u"encoder": {
             u"type": u"lstm",
             u"input_size": 5,
             u"hidden_size": 7,
             u"num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)
Example #9
 def setUp(self):
     super(TestOptimizer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)
Example #10
 def setup_method(self):
     super().setup_method()
     self.data_path = str(self.FIXTURES_ROOT / "data" /
                          "sequence_tagging.tsv")
     self.reader = SequenceTaggingDatasetReader(max_instances=4)
     self.data_loader = MultiProcessDataLoader(self.reader,
                                               self.data_path,
                                               batch_size=2)
     self.data_loader_lazy = MultiProcessDataLoader(
         self.reader,
         self.data_path,
         batch_size=2,
         max_instances_in_memory=10)
     self.instances = list(self.data_loader.iter_instances())
     self.vocab = Vocabulary.from_instances(self.instances)
     self.data_loader.index_with(self.vocab)
     self.data_loader_lazy.index_with(self.vocab)
     self.model_params = Params({
         "text_field_embedder": {
             "token_embedders": {
                 "tokens": {
                     "type": "embedding",
                     "embedding_dim": 5
                 }
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         },
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab,
                                           params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(),
                                      0.01,
                                      momentum=0.9)
     self.validation_data_loader = MultiProcessDataLoader(self.reader,
                                                          self.data_path,
                                                          batch_size=2)
     self.validation_data_loader.index_with(self.vocab)
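As a quick sanity check of what this fixture yields (a hedged sketch; the batch keys follow from the "tokens" indexer and the tag field in the fixture data):

# Each batch from the loader is a dict of tensors that can be passed
# straight to the model's forward method.
batch = next(iter(self.data_loader))
output = self.model(**batch)  # SimpleTagger returns "logits" and, given tags, a "loss"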
Example #11
 def setUp(self):
     super(TestOptimizer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(
         'tests/fixtures/data/sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5
             }
         },
         "stacked_encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab, self.model_params)
Example #12
def test_sequence_tagging_reader():
    model_name = 'bert-base-chinese'

    bert_token_indexers = PretrainedTransformerIndexer(model_name=model_name)
    reader = SequenceTaggingDatasetReader(
        token_indexers={"tokens": bert_token_indexers})

    train_file = './data/weibo/train.corpus'
    dev_file = './data/weibo/dev.corpus'
    test_file = './data/weibo/dev.corpus'
    train_instances = list(reader.read(train_file))
    dev_instances = list(reader.read(dev_file))
    test_instances = list(reader.read(test_file))

    vocab: Vocabulary = Vocabulary.from_instances(train_instances)
    assert vocab.get_namespaces() is not None

    bert_text_field_embedder = PretrainedTransformerEmbedder(
        model_name=model_name)
    tagger = SimpleTagger(
        vocab=vocab,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={'tokens': bert_text_field_embedder}),
        encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
        calculate_span_f1=True,
        label_encoding="BMES",
        # verbose_metrics=True
    )

    train_data_loader, dev_data_loader = build_data_loaders(
        train_instances, dev_instances)
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    trainer = build_trainer(model=tagger,
                            serialization_dir='./output',
                            train_loader=train_data_loader,
                            dev_loader=dev_data_loader)
    print("Starting training")
    trainer.train()
    print("Finished training")
Example #13
 def setup_method(self):
     super().setup_method()
     self.instances = SequenceTaggingDatasetReader().read(
         self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     self.instances_lazy = SequenceTaggingDatasetReader(lazy=True).read(
         self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params({
         "text_field_embedder": {
             "token_embedders": {
                 "tokens": {
                     "type": "embedding",
                     "embedding_dim": 5
                 }
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         },
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab,
                                           params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(),
                                      0.01,
                                      momentum=0.9)
     self.data_loader = DataLoader(self.instances,
                                   batch_size=2,
                                   collate_fn=allennlp_collate)
     self.data_loader_lazy = DataLoader(self.instances_lazy,
                                        batch_size=2,
                                        collate_fn=allennlp_collate)
     self.validation_data_loader = DataLoader(self.instances,
                                              batch_size=2,
                                              collate_fn=allennlp_collate)
     self.instances.index_with(vocab)
     self.instances_lazy.index_with(vocab)
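Here allennlp_collate turns a list of indexed Instances into a single tensor dict, which is why both datasets call index_with(vocab) before the loaders are iterated; without it the instances would have no token indices to tensorize.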
Example #14
    def setUp(self):
        super(SimpleTaggerTest, self).setUp()
        self.write_sequence_tagging_data()

        dataset = SequenceTaggingDatasetReader().read(self.TRAIN_FILE)
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset

        params = Params({
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "hidden_size": 7,
            "num_layers": 2
        })

        self.model = SimpleTagger.from_params(self.vocab, params)
Example #15
 def setUp(self):
     super(TestDenseSparseAdam, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(
         self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     self.vocab = Vocabulary.from_instances(self.instances)
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5,
                 "sparse": True
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab,
                                           params=self.model_params)
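The "sparse": True flag gives the embedding sparse gradient updates, which plain torch.optim.Adam rejects when mixed with dense ones; handling that mix is presumably what this test exercises. A hedged sketch of building the matching optimizer ("dense_sparse_adam" is a real registered AllenNLP optimizer, but the exact construction here is an assumption):

from allennlp.training.optimizers import Optimizer

# Build DenseSparseAdam over the model's named parameters via the registry.
self.optimizer = Optimizer.from_params(
    model_parameters=list(self.model.named_parameters()),
    params=Params({"type": "dense_sparse_adam", "lr": 0.01}),
)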
Example #16
 def setUp(self):
     super(TestTrainer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params({
         u"text_field_embedder": {
             u"tokens": {
                 u"type": u"embedding",
                 u"embedding_dim": 5
             }
         },
         u"encoder": {
             u"type": u"lstm",
             u"input_size": 5,
             u"hidden_size": 7,
             u"num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
     self.iterator = BasicIterator(batch_size=2)
     self.iterator.index_with(vocab)
Example #17
 def setUp(self):
     super(TestTrainer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
     self.iterator = BasicIterator(batch_size=2)
     self.iterator.index_with(vocab)