def setUp(self):
    super().setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    self.instances_lazy = SequenceTaggingDatasetReader(lazy=True).read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate)
    self.data_loader_lazy = DataLoader(
        self.instances_lazy, batch_size=2, collate_fn=allennlp_collate
    )
    self.validation_data_loader = DataLoader(
        self.instances, batch_size=2, collate_fn=allennlp_collate
    )
    self.instances.index_with(vocab)
    self.instances_lazy.index_with(vocab)

def setUp(self):
    super().setUp()
    # TODO: make this a set of dataset readers.
    # Classification may be easier in this case: same dataset reader, but with different paths.
    self.instances_list = []
    self.instances_list.append(SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging.tsv'))
    self.instances_list.append(SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging1.tsv'))
    self.instances_list.append(SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging2.tsv'))
    # Loop through the datasets and extend the vocab.
    combined_vocab = Vocabulary.from_instances(self.instances_list[0])
    for instances in self.instances_list:
        combined_vocab.extend_from_instances(Params({}), instances=instances)
    self.vocab = combined_vocab
    # TODO: figure out params.
    self.model_params = Params({
        "text_field_embedder": {
            "token_embedders": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(combined_vocab)

def setup_method(self) -> None:
    super().setup_method()
    # Use SequenceTaggingDatasetReader as the base reader.
    self.base_reader = SequenceTaggingDatasetReader(lazy=True)
    self.base_reader_multi_process = SequenceTaggingDatasetReader(lazy=True)
    base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"

    # Make 100 copies of the data.
    raw_data = open(base_file_path).read()
    for i in range(100):
        file_path = self.TEST_DIR / f"identical_{i}.tsv"
        with open(file_path, "w") as f:
            f.write(raw_data)
    self.identical_files_glob = str(self.TEST_DIR / "identical_*.tsv")

    # Also create an archive with all of these files to ensure that we can
    # pass the archive directly.
    current_dir = os.getcwd()
    os.chdir(self.TEST_DIR)
    self.archive_filename = self.TEST_DIR / "all_data.tar.gz"
    with tarfile.open(self.archive_filename, "w:gz") as archive:
        for file_path in glob.glob("identical_*.tsv"):
            archive.add(file_path)
    os.chdir(current_dir)

    self.reader = ShardedDatasetReader(base_reader=self.base_reader)
    self.reader_multi_process = ShardedDatasetReader(
        base_reader=self.base_reader_multi_process, multi_process=True
    )

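# Usage sketch (not part of the original fixture): with the setup above, reading the
# glob through the ShardedDatasetReader should yield the base reader's instances once
# per shard. The helper name and the 4-instances-per-file count are assumptions based
# on the standard sequence_tagging.tsv fixture.
def sketch_read_all_shards(self):
    all_instances = list(self.reader.read(self.identical_files_glob))
    assert len(all_instances) == 100 * 4  # 100 shard files, 4 fixture sentences each
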
def setUp(self):
    super().setUp()

    # A lot of the tests want access to the metric tracker,
    # so we add a property that gets it by grabbing it from
    # the relevant callback.
    def metric_tracker(self: CallbackTrainer):
        for callback in self.handler.callbacks():
            if isinstance(callback, TrackMetrics):
                return callback.metric_tracker
        return None

    setattr(CallbackTrainer, 'metric_tracker', property(metric_tracker))

    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params({
        "text_field_embedder": {
            "token_embedders": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)

def setUp(self) -> None:
    super().setUp()
    # Use SequenceTaggingDatasetReader as the base reader.
    self.base_reader = SequenceTaggingDatasetReader(lazy=True)
    base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

    # Make 100 copies of the data.
    raw_data = open(base_file_path).read()
    for i in range(100):
        file_path = self.TEST_DIR / f'identical_{i}.tsv'
        with open(file_path, 'w') as f:
            f.write(raw_data)

    self.all_distinct_path = str(self.TEST_DIR / 'all_distinct.tsv')
    with open(self.all_distinct_path, 'w') as all_distinct:
        for i in range(100):
            file_path = self.TEST_DIR / f'distinct_{i}.tsv'
            line = f"This###DT\tis###VBZ\tsentence###NN\t{i}###CD\t.###.\n"
            with open(file_path, 'w') as f:
                f.write(line)
            all_distinct.write(line)

    self.identical_files_glob = str(self.TEST_DIR / 'identical_*.tsv')
    self.distinct_files_glob = str(self.TEST_DIR / 'distinct_*.tsv')

    # For some of the tests we need a vocab; we'll just use the base_reader for that.
    self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

def setup_method(self):
    super().setup_method()
    self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    self.reader = SequenceTaggingDatasetReader()
    self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
    self.data_loader_lazy = MultiProcessDataLoader(
        self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
    )
    self.instances = list(self.data_loader.iter_instances())
    self.vocab = Vocabulary.from_instances(self.instances)
    self.data_loader.index_with(self.vocab)
    self.data_loader_lazy.index_with(self.vocab)
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.validation_data_loader = MultiProcessDataLoader(
        self.reader, self.data_path, batch_size=2
    )
    self.validation_data_loader.index_with(self.vocab)

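# Usage sketch (not part of the original fixture): one way these pieces could be wired
# together, assuming the AllenNLP 1.x/2.x GradientDescentTrainer API. The helper name
# and the num_epochs value are illustrative assumptions.
def sketch_train_one_epoch(self):
    from allennlp.training import GradientDescentTrainer

    trainer = GradientDescentTrainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=1,
    )
    # train() returns a dict of training (and validation) metrics.
    return trainer.train()
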
def setUp(self):
    super(SimpleTaggerTest, self).setUp()
    dataset = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "stacked_encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(self.vocab, params)

def setUp(self):
    super().setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model = ConstantModel(vocab)

def setUp(self):
    super(TestTrainer, self).setUp()
    dataset = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "stacked_encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(self.vocab, self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)

def setUp(self):
    super(TestTrainer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)

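# Usage sketch (not part of the original fixture): how a pre-1.0 AllenNLP Trainer
# would typically consume this fixture, since BasicIterator belongs to that legacy
# API. Hedged: the exact Trainer constructor varied across 0.x releases, and the
# helper name is hypothetical.
def sketch_train_legacy(self):
    from allennlp.training import Trainer

    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        num_epochs=2,
    )
    trainer.train()
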
def setUp(self) -> None:
    super().setUp()
    # Use SequenceTaggingDatasetReader as the base reader.
    self.base_reader = SequenceTaggingDatasetReader(lazy=True)
    base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"

    # Make 100 copies of the data.
    raw_data = open(base_file_path).read()
    for i in range(100):
        file_path = self.TEST_DIR / f"identical_{i}.tsv"
        with open(file_path, "w") as f:
            f.write(raw_data)

    self.identical_files_glob = str(self.TEST_DIR / "identical_*.tsv")

def setup_method(self):
    super().setup_method()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)

def test_default_format(self):
    reader = SequenceTaggingDatasetReader()
    dataset = reader.read('tests/fixtures/data/sequence_tagging.tsv')
    assert len(dataset.instances) == 4
    fields = dataset.instances[0].fields
    assert fields["tokens"].tokens == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[1].fields
    assert fields["tokens"].tokens == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[2].fields
    assert fields["tokens"].tokens == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[3].fields
    assert fields["tokens"].tokens == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]

def test_brown_corpus_format(self):
    reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
    dataset = reader.read('tests/fixtures/data/brown_corpus.txt')
    assert len(dataset.instances) == 4
    fields = dataset.instances[0].fields
    assert fields["tokens"].tokens == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[1].fields
    assert fields["tokens"].tokens == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[2].fields
    assert fields["tokens"].tokens == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[3].fields
    assert fields["tokens"].tokens == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]

def setUp(self):
    super().setUp()
    self.base_reader = SequenceTaggingDatasetReader(lazy=True)
    base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"

    # Make 100 copies of the data.
    raw_data = open(base_file_path).read()
    for i in range(100):
        file_path = self.TEST_DIR / f"sequence_tagging_{i}.tsv"
        with open(file_path, "w") as f:
            f.write(raw_data)

    self.glob = str(self.TEST_DIR / "sequence_tagging_*.tsv")

    # For some of the tests we need a vocab; we'll just use the base_reader for that.
    self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

def test_brown_corpus_format(self):
    reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
    instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'brown_corpus.txt')
    instances = ensure_list(instances)
    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]

@pytest.mark.parametrize("lazy", (True, False))
def test_default_format(self, lazy):
    reader = SequenceTaggingDatasetReader(lazy=lazy)
    instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    instances = ensure_list(instances)
    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]

def test_read_from_file(self):
    reader = SequenceTaggingDatasetReader()
    dataset = reader.read(self.TRAIN_FILE)
    assert len(dataset.instances) == 4
    fields = dataset.instances[0].fields()
    assert fields["tokens"].tokens() == ["cats", "are", "animals", "."]
    assert fields["tags"].tags() == ["N", "V", "N", "N"]
    fields = dataset.instances[1].fields()
    assert fields["tokens"].tokens() == ["dogs", "are", "animals", "."]
    assert fields["tags"].tags() == ["N", "V", "N", "N"]
    fields = dataset.instances[2].fields()
    assert fields["tokens"].tokens() == ["snakes", "are", "animals", "."]
    assert fields["tags"].tags() == ["N", "V", "N", "N"]
    fields = dataset.instances[3].fields()
    assert fields["tokens"].tokens() == ["birds", "are", "animals", "."]
    assert fields["tags"].tags() == ["N", "V", "N", "N"]

@pytest.mark.parametrize("lazy", (True, False))
def test_default_format(self, lazy):
    reader = SequenceTaggingDatasetReader(lazy=lazy)
    instances = reader.read('tests/fixtures/data/sequence_tagging.tsv')
    instances = ensure_list(instances)
    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]

def setUp(self):
    super(TestOptimizer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)

def setUp(self):
    super(TestOptimizer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab, self.model_params)

def setUp(self):
    super(SimpleTaggerTest, self).setUp()
    self.write_sequence_tagging_data()
    dataset = SequenceTaggingDatasetReader().read(self.TRAIN_FILE)
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "hidden_size": 7,
        "num_layers": 2
    })
    self.model = SimpleTagger.from_params(self.vocab, params)

def setUp(self):
    super(TestDenseSparseAdam, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    self.vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5,
                "sparse": True
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)

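# Usage sketch (not part of the original fixture): constructing the optimizer this
# test presumably exercises, via AllenNLP's registrable Optimizer and the
# "dense_sparse_adam" key, which pairs sparse embedding gradients with dense LSTM
# gradients. Hedged: the exact from_params signature varied across releases, and
# the helper name is hypothetical.
def sketch_build_dense_sparse_adam(self):
    from allennlp.training.optimizers import Optimizer

    return Optimizer.from_params(
        model_parameters=self.model.named_parameters(),
        params=Params({"type": "dense_sparse_adam"}),
    )
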
def test_default_format(self):
    reader = SequenceTaggingDatasetReader(max_instances=4)
    instances = list(
        reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    )
    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]

def test_brown_corpus_format(self): reader = SequenceTaggingDatasetReader(word_tag_delimiter="/") instances = list( reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt")) assert len(instances) == 4 fields = instances[0].fields assert [t.text for t in fields["tokens"].tokens ] == ["cats", "are", "animals", "."] assert fields["tags"].labels == ["N", "V", "N", "N"] fields = instances[1].fields assert [t.text for t in fields["tokens"].tokens ] == ["dogs", "are", "animals", "."] assert fields["tags"].labels == ["N", "V", "N", "N"] fields = instances[2].fields assert [t.text for t in fields["tokens"].tokens ] == ["snakes", "are", "animals", "."] assert fields["tags"].labels == ["N", "V", "N", "N"] fields = instances[3].fields assert [t.text for t in fields["tokens"].tokens ] == ["birds", "are", "animals", "."] assert fields["tags"].labels == ["N", "V", "N", "N"]
def test_run_steps_programmatically(step_cache_class):
    from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
    from allennlp.tango.dataset import DatasetReaderAdapterStep
    from allennlp.tango import TrainingStep, EvaluationStep
    from allennlp.common import Lazy
    from allennlp.training.optimizers import AdamOptimizer
    from allennlp.tango.dataloader import BatchSizeDataLoader
    from allennlp.models import SimpleTagger

    dataset_step = DatasetReaderAdapterStep(
        reader=SequenceTaggingDatasetReader(),
        splits={
            "train": "test_fixtures/data/sequence_tagging.tsv",
            "validation": "test_fixtures/data/sequence_tagging.tsv",
        },
    )
    training_step = TrainingStep(
        model=Lazy(
            SimpleTagger,
            Params({
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "projection_dim": 2,
                            "pretrained_file": "test_fixtures/embeddings/glove.6B.100d.sample.txt.gz",
                            "embedding_dim": 100,
                            "trainable": True,
                        }
                    }
                },
                "encoder": {"type": "lstm", "input_size": 2, "hidden_size": 4, "num_layers": 1},
            }),
        ),
        dataset=dataset_step,
        data_loader=Lazy(BatchSizeDataLoader, Params({"batch_size": 2})),
        optimizer=Lazy(AdamOptimizer),
    )
    evaluation_step = EvaluationStep(
        dataset=dataset_step, model=training_step, step_name="evaluation"
    )

    with TemporaryDirectory(prefix="test_run_steps_programmatically-") as d:
        if step_cache_class == DirectoryStepCache:
            cache = DirectoryStepCache(d)
        else:
            cache = step_cache_class()

        assert "random object" not in cache
        assert dataset_step not in cache
        assert training_step not in cache
        assert evaluation_step not in cache
        assert len(cache) == 0
        with pytest.raises(KeyError):
            _ = cache[evaluation_step]

        assert tango_dry_run(evaluation_step, cache) == [
            (dataset_step, False),
            (training_step, False),
            (evaluation_step, False),
        ]

        training_step.ensure_result(cache)

        assert tango_dry_run(evaluation_step, cache) == [
            (dataset_step, True),
            (training_step, True),
            (evaluation_step, False),
        ]

        assert "random object" not in cache
        assert dataset_step in cache
        assert training_step in cache
        assert evaluation_step not in cache
        assert len(cache) == 2
        with pytest.raises(KeyError):
            _ = cache[evaluation_step]