def setUp(self):
    super().setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)
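# A minimal usage sketch for the fixtures above, assuming the AllenNLP 0.x
# Trainer API (model/optimizer/iterator/train_dataset). This test body is
# hypothetical, not part of the original snippet.
from allennlp.training.trainer import Trainer

def test_trainer_can_run(self):
    trainer = Trainer(model=self.model,
                      optimizer=self.optimizer,
                      iterator=self.iterator,
                      train_dataset=self.instances,
                      num_epochs=2)
    metrics = trainer.train()
    assert "training_loss" in metrics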
def setUp(self):
    super(TestTrainer, self).setUp()
    dataset = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "stacked_encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(self.vocab, self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)
def setUp(self):
    super().setUp()

    # Many of the tests want access to the metric tracker,
    # so we add a property that grabs it from the relevant callback.
    def metric_tracker(self: CallbackTrainer):
        for callback in self.handler.callbacks():
            if isinstance(callback, TrackMetrics):
                return callback.metric_tracker
        return None

    setattr(CallbackTrainer, "metric_tracker", property(metric_tracker))

    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)
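# Sketch of how the patched property might be exercised, assuming AllenNLP
# 0.9's CallbackTrainer signature (training_data, iterator, optimizer passed
# directly) and its TrackMetrics callback. Hypothetical, not from the source.
def test_metric_tracker_property(self):
    trainer = CallbackTrainer(model=self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              callbacks=[TrackMetrics()])
    trainer.train()
    assert trainer.metric_tracker is not None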
def init_crf_model(self) -> Model:
    """Initialize the tagger model.

    Note: despite the name, this builds a SimpleTagger, which has no CRF layer.
    """
    # Build the BERT token embedder from the configured model name.
    bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
    tagger = SimpleTagger(
        vocab=self.vocab,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={'tokens': bert_text_field_embedder}
        ),
        encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
        verbose_metrics=True,
        calculate_span_f1=True,
        label_encoding="BMES",
    )
    tagger.to(device=self.config.device)
    return tagger
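# If an actual CRF layer is wanted, allennlp-models provides CrfTagger with a
# near-identical constructor. A sketch under the assumption that
# allennlp-models is installed; this variant is not part of the original code.
from allennlp_models.tagging import CrfTagger

def init_crf_model_with_crf(self) -> Model:
    bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
    tagger = CrfTagger(
        vocab=self.vocab,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={'tokens': bert_text_field_embedder}
        ),
        encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
        label_encoding="BMES",
        calculate_span_f1=True,
    )
    tagger.to(device=self.config.device)
    return tagger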
def setup_method(self):
    super().setup_method()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)
def simple_tagger_model():
    """Create a simple tagger model and return a JSON-in, JSON-out `run` callable."""
    # This is a bad hack to get the same data as the test case.
    # TODO(joelgrus): replace this
    test_case = AllenNlpTestCase()
    test_case.setUp()
    test_case.write_sequence_tagging_data()
    dataset = SequenceTaggingDatasetReader().read(test_case.TRAIN_FILE)
    vocab = Vocabulary.from_dataset(dataset)
    dataset.index_instances(vocab)
    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "hidden_size": 7,
        "num_layers": 2
    })
    model = SimpleTagger.from_params(vocab, params)
    tokenizer = WordTokenizer()

    def run(blob: JSON):
        sentence = blob.get("input", "")
        tokens = tokenizer.tokenize(sentence)
        text = TextField(tokens, token_indexers={"tokens": SingleIdTokenIndexer()})
        output = model.tag(text)
        # Convert the numpy array to a serializable list.
        output['class_probabilities'] = output['class_probabilities'].tolist()
        possible_tags = list(vocab.get_index_to_token_vocabulary("tags").values())
        return {
            'model_name': 'simple_tagger',
            'input': sentence,
            'output': output,
            'tokens': tokens,
            'possible_tags': possible_tags
        }

    return run
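# Example of calling the closure returned above (this snippet predates the
# Predictor API, so `run` takes and returns plain JSON-style dicts); the
# sample sentence is illustrative.
tag = simple_tagger_model()
result = tag({"input": "The dog ate the apple ."})
print(result["possible_tags"])
print(result["output"]["class_probabilities"])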
def setUp(self):
    super().setUp()
    # TODO: make this a set of dataset readers.
    # Classification may be easier in this case: same dataset reader, different paths.
    self.instances_list = []
    self.instances_list.append(SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging.tsv'))
    self.instances_list.append(SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging1.tsv'))
    self.instances_list.append(SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging2.tsv'))

    # Build a vocabulary from the first dataset, then extend it with the rest.
    combined_vocab = Vocabulary.from_instances(self.instances_list[0])
    for instances in self.instances_list[1:]:
        combined_vocab.extend_from_instances(Params({}), instances=instances)
    self.vocab = combined_vocab

    # TODO: figure out params.
    self.model_params = Params({
        "text_field_embedder": {
            "token_embedders": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(combined_vocab)
def setUp(self):
    super(TestOptimizer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)
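# What a TestOptimizer case might then exercise -- a sketch assuming the
# AllenNLP 0.x Optimizer.from_params(model_parameters, params) signature;
# this test body is illustrative, not from the source.
from allennlp.training.optimizers import Optimizer

def test_optimizer_basic(self):
    parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, Params({"type": "sgd", "lr": 1}))
    param_groups = optimizer.param_groups
    assert len(param_groups) == 1
    assert param_groups[0]["lr"] == 1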
def setup_method(self):
    super().setup_method()
    self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    self.reader = SequenceTaggingDatasetReader(max_instances=4)
    self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
    self.data_loader_lazy = MultiProcessDataLoader(
        self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
    )
    self.instances = list(self.data_loader.iter_instances())
    self.vocab = Vocabulary.from_instances(self.instances)
    self.data_loader.index_with(self.vocab)
    self.data_loader_lazy.index_with(self.vocab)
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.validation_data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
    self.validation_data_loader.index_with(self.vocab)
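# A sketch of the trainer these fixtures feed, assuming the AllenNLP 2.x
# GradientDescentTrainer API; the test body is hypothetical.
from allennlp.training import GradientDescentTrainer

def test_trainer_can_run(self):
    trainer = GradientDescentTrainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=2,
    )
    metrics = trainer.train()
    assert "best_validation_loss" in metrics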
def setUp(self):
    super(TestOptimizer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "stacked_encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab, self.model_params)
def test_sequence_tagging_reader():
    model_name = 'bert-base-chinese'
    bert_token_indexers = PretrainedTransformerIndexer(model_name=model_name)
    reader = SequenceTaggingDatasetReader(
        token_indexers={"tokens": bert_token_indexers})

    train_file = './data/weibo/train.corpus'
    dev_file = './data/weibo/dev.corpus'
    # NOTE: there is no separate test split here; the dev file is reused.
    test_file = './data/weibo/dev.corpus'

    train_instances = list(reader.read(train_file))
    dev_instances = list(reader.read(dev_file))
    test_instances = list(reader.read(test_file))

    vocab: Vocabulary = Vocabulary.from_instances(train_instances)
    assert vocab.get_namespaces() is not None

    bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=model_name)
    tagger = SimpleTagger(
        vocab=vocab,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={'tokens': bert_text_field_embedder}),
        encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
        calculate_span_f1=True,
        label_encoding="BMES",
        # verbose_metrics=True
    )

    train_data_loader, dev_data_loader = build_data_loaders(
        train_instances, dev_instances)
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    trainer = build_trainer(model=tagger,
                            serialization_dir='./output',
                            train_loader=train_data_loader,
                            dev_loader=dev_data_loader)
    print("Starting training")
    trainer.train()
    print("Finished training")
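# build_data_loaders and build_trainer are not defined in this snippet.
# Hypothetical implementations, assuming AllenNLP 2.x's SimpleDataLoader,
# GradientDescentTrainer, and AdamOptimizer; batch size and epoch counts
# are placeholders.
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.training import GradientDescentTrainer
from allennlp.training.optimizers import AdamOptimizer

def build_data_loaders(train_instances, dev_instances, batch_size: int = 8):
    train_loader = SimpleDataLoader(train_instances, batch_size, shuffle=True)
    dev_loader = SimpleDataLoader(dev_instances, batch_size, shuffle=False)
    return train_loader, dev_loader

def build_trainer(model, serialization_dir, train_loader, dev_loader):
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    return GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        optimizer=AdamOptimizer(parameters),
        num_epochs=5,
    )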
def setup_method(self):
    super().setup_method()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    self.instances_lazy = SequenceTaggingDatasetReader(lazy=True).read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params({
        "text_field_embedder": {
            "token_embedders": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        },
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate)
    self.data_loader_lazy = DataLoader(self.instances_lazy, batch_size=2,
                                       collate_fn=allennlp_collate)
    self.validation_data_loader = DataLoader(self.instances, batch_size=2,
                                             collate_fn=allennlp_collate)
    self.instances.index_with(vocab)
    self.instances_lazy.index_with(vocab)
def setUp(self):
    super(SimpleTaggerTest, self).setUp()
    self.write_sequence_tagging_data()
    dataset = SequenceTaggingDatasetReader().read(self.TRAIN_FILE)
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "hidden_size": 7,
        "num_layers": 2
    })
    self.model = SimpleTagger.from_params(self.vocab, params)
def setUp(self):
    super(TestDenseSparseAdam, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    self.vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5,
                "sparse": True
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
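# The sparse embedding above is the point of this fixture: vanilla Adam cannot
# handle sparse gradients, while DenseSparseAdam updates dense and sparse
# parameters together. A sketch of what the test might construct, assuming the
# "dense_sparse_adam" registered optimizer type; the test body is illustrative.
from allennlp.training.optimizers import Optimizer

def test_can_optimise_model_with_dense_and_sparse_params(self):
    parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, Params({"type": "dense_sparse_adam"}))
    assert optimizer is not None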
def setUp(self):
    super(TestTrainer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)