def test_language_model_data_collator():
    """
    Ensure `LanguageModelingDataCollator` works.
    """
    norm_loader = MultiProcessDataLoader(MockDatasetReader(), "some path", batch_size=16)
    vocab = Vocabulary.from_instances(norm_loader.iter_instances())
    norm_loader.index_with(vocab)
    batch0 = list(norm_loader)[0]

    model_name = "epwalsh/bert-xsmall-dummy"
    data_collate = LanguageModelingDataCollator(model_name)
    mlm_loader = MultiProcessDataLoader(
        MockDatasetReader(), "some path", batch_size=16, collate_fn=data_collate
    )
    vocab = Vocabulary.from_instances(mlm_loader.iter_instances())
    mlm_loader.index_with(vocab)
    batch1 = list(mlm_loader)[0]

    norm_inputs = batch0["source"]["tokens"]["token_ids"]
    mlm_inputs = batch1["source"]["tokens"]["token_ids"]
    mlm_labels = batch1["source"]["tokens"]["labels"]

    # If we replace the masked MLM inputs with their labels, we should recover the original inputs.
    assert torch.where(mlm_labels != -100, mlm_labels, mlm_inputs).tolist() == norm_inputs.tolist()
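# A small, self-contained illustration (with made-up tensors, not output from the real collator)
# of the invariant the assertion above checks: positions the collator masked carry the original
# token ids in `labels` (all other positions are -100), so substituting the labels back into the
# masked inputs recovers the original token ids.
def _mlm_label_invariant_sketch():
    original = torch.tensor([[5, 6, 7, 8]])
    masked = torch.tensor([[5, 103, 7, 103]])  # 103 standing in for a [MASK] id
    labels = torch.tensor([[-100, 6, -100, 8]])
    assert torch.where(labels != -100, labels, masked).tolist() == original.tolist()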
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear fewer than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(
        chain(train_data_loader.iter_instances(), dev_data_loader.iter_instances()),
        min_count={'tokens': 3})

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20,
        cuda_device=-1)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
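# The example above assumes an `LstmClassifier` model and a `SentenceClassifierPredictor` that
# are not defined in this snippet. Below is a minimal sketch of what such a classifier could
# look like; the field names ("tokens", "label") and the plain cross-entropy loss are
# assumptions, not the definitive implementation.
import torch
from allennlp.models import Model
from allennlp.nn.util import get_text_field_mask


class LstmClassifier(Model):
    def __init__(self, word_embeddings, encoder, vocab):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        # One logit per label in the "labels" namespace.
        self.linear = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size("labels"),
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, tokens, label=None):
        # Mask out padding before encoding.
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        encoding = self.encoder(embeddings, mask)
        logits = self.linear(encoding)

        output = {"logits": logits}
        if label is not None:
            output["loss"] = self.loss_fn(logits, label)
        return output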
class TrainerTestBase(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
        self.reader = SequenceTaggingDatasetReader()
        self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
        self.data_loader_lazy = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
        )
        self.instances = list(self.data_loader.iter_instances())
        self.vocab = Vocabulary.from_instances(self.instances)
        self.data_loader.index_with(self.vocab)
        self.data_loader_lazy.index_with(self.vocab)
        self.model_params = Params(
            {
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            }
        )
        self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
        self.validation_data_loader = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2
        )
        self.validation_data_loader.index_with(self.vocab)
def read_and_check_instances(self, filepath: str, num_workers: int = 0):
    data_loader = MultiProcessDataLoader(
        self.reader, filepath, num_workers=num_workers, batch_size=1, start_method="spawn"
    )
    all_instances = []
    for instance in data_loader.iter_instances():
        all_instances.append(instance)

    # 100 files * 4 sentences per file.
    assert len(all_instances) == 100 * 4

    counts = Counter(fingerprint(instance) for instance in all_instances)

    # There should be exactly four distinct sentences, each repeated 100 times.
    assert len(counts) == 4
    assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
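# The `fingerprint` helper used above isn't shown in this snippet. A plausible sketch (an
# assumption, not the canonical implementation) that would produce the tuples asserted on
# above, given the "tokens"/"tags" fields from SequenceTaggingDatasetReader:
def fingerprint(instance) -> tuple:
    # Combine the token texts and their tag labels into a single hashable tuple.
    text_tuple = tuple(t.text for t in instance.fields["tokens"].tokens)
    labels_tuple = tuple(instance.fields["tags"].labels)
    return text_tuple + labels_tuple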
def test_batches_per_epoch():
    loader = MultiProcessDataLoader(
        MockDatasetReader(), "some path", batch_size=4, batches_per_epoch=10
    )
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    assert len(loader) == 10
    assert len(list(loader)) == 10
def test_load_to_cuda(options):
    reader = MockDatasetReader()
    loader = MultiProcessDataLoader(
        reader=reader,
        data_path="this doesn't matter",
        cuda_device=0,
        **options,
    )
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)
    for batch in loader:
        assert batch["tensor"].device == torch.device("cuda:0")
def test_drop_last():
    """
    Ensure that the `drop_last` option is respected.
    """
    loader = MultiProcessDataLoader(
        MockDatasetReader(), "some path", batch_size=16, drop_last=True
    )
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    # All instances should still be loaded; `drop_last` only affects batching.
    assert len(list(loader.iter_instances())) == MockDatasetReader.NUM_INSTANCES

    # Just here because the assertions below depend on the exact value of NUM_INSTANCES.
    assert MockDatasetReader.NUM_INSTANCES == 100

    # With 100 instances and a batch size of 16, the final partial batch of 4 is dropped,
    # leaving 6 full batches of 16.
    batches = list(loader)
    for batch in batches:
        assert len(batch["index"]) == 16
    assert len(batches) == 6
def test_with_multi_process_loading(self, lazy):
    readers = {"a": PlainTextReader(), "b": PlainTextReader(), "c": PlainTextReader()}
    reader = InterleavingDatasetReader(readers)
    data_dir = self.FIXTURES_ROOT / "data"
    file_path = {
        "a": data_dir / "babi.txt",
        "b": data_dir / "conll2003.txt",
        "c": data_dir / "conll2003.txt",
    }

    vocab = Vocabulary.from_instances(reader.read(file_path))
    loader = MultiProcessDataLoader(
        reader,
        file_path,
        num_workers=1,
        batch_size=1,
        max_instances_in_memory=2 if lazy else None,
    )
    loader.index_with(vocab)

    list(loader.iter_instances())
    list(loader)
def test_error_raised_when_text_fields_contain_token_indexers(max_instances_in_memory):
    """
    This tests that the MultiProcessDataLoader raises an error when `num_workers > 0` but
    the dataset reader doesn't implement `apply_token_indexers()`.

    It also tests that errors raised within a worker process are propagated upwards to the
    main process, and that when that happens, all workers will be successfully killed.
    """
    with pytest.raises(
        WorkerError, match="Make sure your dataset reader's text_to_instance()"
    ):
        loader = MultiProcessDataLoader(
            MockOldDatasetReader(),
            "this-path-doesn't-matter",
            num_workers=2,
            max_instances_in_memory=max_instances_in_memory,
            batch_size=1,
        )
        list(loader.iter_instances())
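# For reference, a minimal sketch of the pattern the error message above points at: with
# `num_workers > 0`, `text_to_instance()` should build `TextField`s *without* token indexers,
# and `apply_token_indexers()` should attach them afterwards, so instances can be pickled to
# and from worker processes. The class name, field name, and example data here are
# illustrative assumptions, not fixtures from this test suite.
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


class MockNewDatasetReader(DatasetReader):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path):
        for sentence in [["cats", "are", "animals", "."]]:
            yield self.text_to_instance(sentence)

    def text_to_instance(self, words) -> Instance:  # type: ignore
        # Note: no token indexers are attached here.
        return Instance({"tokens": TextField([Token(w) for w in words])})

    def apply_token_indexers(self, instance: Instance) -> None:
        # Indexers are attached after instances come back from the workers.
        instance.fields["tokens"].token_indexers = self._token_indexers  # type: ignore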
def test_multiprocess_data_loader(options):
    reader = MockDatasetReader()
    data_path = "this doesn't matter"

    loader = MultiProcessDataLoader(reader=reader, data_path=data_path, **options)

    if not options.get("max_instances_in_memory"):
        # Instances should be loaded immediately if max_instances_in_memory is None.
        assert loader._instances

    instances: Iterable[Instance] = loader.iter_instances()

    # This should be a generator.
    assert not isinstance(instances, (list, tuple))

    instances = list(instances)
    assert len(instances) == MockDatasetReader.NUM_INSTANCES

    # Now build the vocab.
    vocab = Vocabulary.from_instances(instances)

    # Before indexing the loader, trying to iterate through batches will raise an error.
    with pytest.raises(ValueError, match="Did you forget to call DataLoader.index_with"):
        list(loader)

    loader.index_with(vocab)

    # Run through a couple of epochs to make sure we collect all of the instances.
    for epoch in range(2):
        indices: List[int] = []
        for batch in loader:
            for index in batch["index"]:
                indices.append(index)  # type: ignore
        # Ensure no duplicates.
        assert len(indices) == len(set(indices)), indices
        # Ensure all instances were collected.
        assert len(indices) == MockDatasetReader.NUM_INSTANCES, epoch
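# The `options` dict received by `test_load_to_cuda` and `test_multiprocess_data_loader` above
# is expected to come from pytest parametrization. A minimal, self-contained sketch of that
# pattern follows; the test name and the particular option combinations are assumptions, not
# the project's actual parameter matrix.
import pytest


@pytest.mark.parametrize(
    "options",
    [
        {"batch_size": 2},
        {"batch_size": 2, "num_workers": 1},
        {"batch_size": 2, "num_workers": 1, "max_instances_in_memory": 10},
    ],
)
def test_loader_options_are_accepted(options):
    # Each parametrized case constructs a loader with a different combination of options.
    loader = MultiProcessDataLoader(MockDatasetReader(), "some path", **options)
    assert len(list(loader.iter_instances())) == MockDatasetReader.NUM_INSTANCES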