Example #1
    def test_sort(self):
        data = Data(
            self.data_source,
            self.tensorizers,
            Batcher(train_batch_size=5),
            sort_key="tokens",
        )

        def assert_sorted(batch):
            _, seq_lens, _ = batch["tokens"]
            seq_lens = seq_lens.tolist()
            for i in range(len(seq_lens) - 1):
                self.assertTrue(seq_lens[i] >= seq_lens[i + 1])

        batches = iter(list(data.batches(Stage.TRAIN)))
        first_raw_batch, first_batch = next(batches)
        assert_sorted(first_batch)
        # make sure labels are also in the same order as the sorted tokens
        self.assertEqual(
            self.tensorizers["labels"].vocab[first_batch["labels"][1]],
            "alarm/set_alarm",
        )
        self.assertEqual(first_raw_batch[1][RawExampleFieldName.ROW_INDEX], 1)
        second_raw_batch, second_batch = next(batches)
        assert_sorted(second_batch)
        self.assertEqual(
            self.tensorizers["labels"].vocab[second_batch["labels"][1]],
            "alarm/time_left_on_alarm",
        )
        self.assertEqual(second_raw_batch[0][RawExampleFieldName.ROW_INDEX], 6)
        self.assertEqual(second_raw_batch[1][RawExampleFieldName.ROW_INDEX], 5)
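
The sort_key used above is a common length-bucketing trick: ordering examples by token count before slicing batches keeps padding per batch small while leaving each label aligned with its text. A minimal standalone sketch of the idea, assuming hypothetical row dicts with "text" and "label" keys (this is not PyText's Batcher implementation):

def sorted_batches(rows, batch_size=5):
    # Order by descending token count, mirroring the descending seq_lens
    # that assert_sorted checks in the test above.
    ordered = sorted(rows, key=lambda row: len(row["text"].split()), reverse=True)
    for start in range(0, len(ordered), batch_size):
        # Each slice keeps text and label together, so labels stay in the
        # same order as the sorted tokens.
        yield ordered[start:start + batch_size]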
Example #2
 def test_create_data_no_batcher_provided(self):
     data = Data(self.data_source, self.tensorizers)
     batches = list(data.batches(Stage.TRAIN))
     # We should have made at least one non-empty batch
     self.assertTrue(batches)
     raw_batch, batch = next(iter(batches))
     self.assertTrue(batch)
Example #3
 def test_create_batches_different_tensorizers(self):
     tensorizers = {"tokens": WordTensorizer(column="text")}
     data = Data(self.data_source, tensorizers, Batcher(train_batch_size=16))
     batches = list(data.batches(Stage.TRAIN))
     self.assertEqual(1, len(batches))
     batch = next(iter(batches))
     self.assertEqual({"tokens"}, set(batch))
     tokens, seq_lens = batch["tokens"]
     self.assertEqual((10,), seq_lens.size())
     self.assertEqual(10, len(tokens))
Example #4
 def test_create_batches(self):
     data = Data(self.data_source, self.tensorizers, Batcher(train_batch_size=16))
     batches = list(data.batches(Stage.TRAIN))
     self.assertEqual(1, len(batches))
     batch = next(iter(batches))
     self.assertEqual(set(self.tensorizers), set(batch))
     tokens, seq_lens = batch["tokens"]
     self.assertEqual((10,), seq_lens.size())
     self.assertEqual((10,), batch["labels"].size())
     self.assertEqual({"tokens", "labels"}, set(batch))
     self.assertEqual(10, len(tokens))
Example #5
 def test_create_batches(self):
     data = Data(self.data_source, self.tensorizers,
                 RawBatcher(batch_size=16))
     batches = list(data.batches(Stage.TRAIN))
     self.assertEqual(1, len(batches))
     batch, batch_tensors = next(iter(batches))
     self.assertEqual(set(self.tensorizers), set(batch_tensors))
     tokens, seq_lens = batch_tensors["tokens"]
     self.assertEqual((10, ), seq_lens.size())
     self.assertEqual((10, ), batch_tensors["labels"].size())
     self.assertEqual(10, len(batch))
     example = next(iter(batch))
     self.assertEqual({"text", "label"}, set(example))
Example #6
 def test_data_iterate_multiple_times(self):
     data = Data(self.data_source, self.tensorizers)
     batches = data.batches(Stage.TRAIN)
     data1 = list(batches)
     data2 = list(batches)
     # We should have made at least one non-empty batch
     self.assertTrue(data1)
     self.assertTrue(data2)
     _, (batch1, _) = data1[0]
     _, (batch2, _) = data2[0]
     # PyTorch tensors don't support a simple equality comparison, so comparing
     # the tensor dicts is non-trivial, but they should also be equal
     self.assertEqual(batch1, batch2)
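
The comment above notes that tensor dicts are hard to compare directly. One way to make such an assertion explicit is a small helper built on torch.equal, which is True only when shapes and values match. This is only a sketch, assuming each batch maps tensorizer names to tensors or tuples of tensors as in these examples:

import torch

def tensor_dicts_equal(batch1, batch2):
    if set(batch1) != set(batch2):
        return False
    for name, value1 in batch1.items():
        value2 = batch2[name]
        # Treat single tensors and tuples such as (tokens, seq_lens) uniformly.
        tensors1 = value1 if isinstance(value1, tuple) else (value1,)
        tensors2 = value2 if isinstance(value2, tuple) else (value2,)
        if len(tensors1) != len(tensors2):
            return False
        # torch.equal checks both shape and values.
        if not all(torch.equal(t1, t2) for t1, t2 in zip(tensors1, tensors2)):
            return False
    return True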
Example #7
 def test_create_batches(self):
     data = Data(self.data_source, self.tensorizers,
                 Batcher(train_batch_size=16))
     batches = list(data.batches(Stage.TRAIN))
     self.assertEqual(1, len(batches))
     raw_batch, batch = next(iter(batches))
     self.assertEqual(set(self.tensorizers), set(batch))
     tokens, seq_lens, _ = batch["tokens"]
     self.assertEqual(10, len(raw_batch))
     self.assertEqual({"text", "label", RawExampleFieldName.ROW_INDEX},
                      set(raw_batch[0]))
     self.assertEqual((10, ), seq_lens.size())
     self.assertEqual((10, ), batch["labels"].size())
     self.assertEqual({"tokens", "labels"}, set(batch))
     self.assertEqual(10, len(tokens))
Example #8
 def _get_config_with_export_list(
     self,
     task_class: Type[NewTask],
     model_class: Type[Model],
     test_file_metadata: TestFileMetadata,
 ) -> PyTextConfig:
     return PyTextConfig(
         task=task_class.Config(
             data=Data.Config(
                 source=TSVDataSource.Config(
                     train_filename=test_file_metadata.filename,
                     eval_filename=test_file_metadata.filename,
                     test_filename=test_file_metadata.filename,
                     field_names=test_file_metadata.field_names,
                 ),
                 batcher=PoolingBatcher.Config(train_batch_size=1,
                                               test_batch_size=1),
             ),
             trainer=TaskTrainer.Config(epochs=1),
             model=model_class.Config(
                 inputs=type(model_class.Config.inputs)(
                     dense=FloatListTensorizer.Config(
                         column=test_file_metadata.dense_col_name,
                         error_check=True,
                         dim=test_file_metadata.dense_feat_dim,
                     ))),
         ),
         use_tensorboard=False,
         use_cuda_if_available=False,
         export=ExportConfig(
             export_torchscript_path="/tmp/model_torchscript.pt"),
         version=LATEST_VERSION,
     )
Example #9
 def _get_pytext_config(
     self,
     test_file_name: TestFileName,
     task_class: Type[NewTask],
     model_class: Type[Model],
 ) -> PyTextConfig:
     test_file_metadata = get_test_file_metadata(test_file_name)
     return PyTextConfig(
         task=task_class.Config(
             data=Data.Config(
                 source=TSVDataSource.Config(
                     train_filename=test_file_metadata.filename,
                     eval_filename=test_file_metadata.filename,
                     test_filename=test_file_metadata.filename,
                     field_names=test_file_metadata.field_names,
                 ),
                 batcher=Batcher.Config(
                 ),  # Use Batcher to avoid shuffling.
             ),
             trainer=TaskTrainer.Config(epochs=1),
             model=model_class.Config(
                 inputs=type(model_class.Config.inputs)(
                     dense=FloatListTensorizer.Config(
                         column=test_file_metadata.dense_col_name,
                         dim=test_file_metadata.dense_feat_dim,
                     ))),
         ),
         use_tensorboard=False,
         use_cuda_if_available=False,
         version=LATEST_VERSION,
     )
Example #10
    def test_batch_predict_caffe2_model(self):
        with tempfile.NamedTemporaryFile(
        ) as snapshot_file, tempfile.NamedTemporaryFile() as caffe2_model_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        test_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    ))),
                version=LATEST_VERSION,
                save_snapshot_path=snapshot_file.name,
                export_caffe2_path=caffe2_model_file.name,
            )
            task = create_task(config.task)
            task.export(task.model, caffe2_model_file.name)
            model = task.model
            save(config, model, meta=None, tensorizers=task.data.tensorizers)

            results = batch_predict_caffe2_model(snapshot_file.name,
                                                 caffe2_model_file.name)
            self.assertEqual(4, len(results))
Example #11
    def test_load_saved_model(self):
        with tempfile.NamedTemporaryFile() as snapshot_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(
                    data=Data.Config(
                        source=TSVDataSource.Config(
                            train_filename=train_data,
                            eval_filename=eval_data,
                            field_names=["label", "slots", "text"],
                        )
                    )
                ),
                version=LATEST_VERSION,
                save_snapshot_path=snapshot_file.name,
            )
            task = create_task(config.task)
            model = task.model

            save(config, model, meta=None, tensorizers=task.data.tensorizers)
            task2, config2 = load(snapshot_file.name)

            self.assertEqual(config, config2)
            self.assertModulesEqual(model, task2.model)

            model.eval()
            task2.model.eval()

            inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
            self.assertEqual(model(*inputs).tolist(), task2.model(*inputs).tolist())
Example #12
    def test_batch_predict_caffe2_model(self):
        with tempfile.NamedTemporaryFile() as snapshot_file, tempfile.NamedTemporaryFile() as caffe2_model_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(
                    model=DocModel.Config(
                        inputs=DocModel.Config.ModelInput(
                            tokens=TokenTensorizer.Config(),
                            dense=FloatListTensorizer.Config(
                                column="dense", dim=1, error_check=True
                            ),
                            labels=LabelTensorizer.Config(),
                        )
                    ),
                    data=Data.Config(
                        source=TSVDataSource.Config(
                            train_filename=train_data,
                            eval_filename=eval_data,
                            test_filename=eval_data,
                            field_names=["label", "slots", "text", "dense"],
                        )
                    ),
                ),
                version=21,
                save_snapshot_path=snapshot_file.name,
                export_caffe2_path=caffe2_model_file.name,
            )
            task = create_task(config.task)
            task.export(task.model, caffe2_model_file.name)
            model = task.model
            save(config, model, meta=None, tensorizers=task.data.tensorizers)

            pt_results = task.predict(task.data.data_source.test)

            def assert_caffe2_results_correct(caffe2_results):
                for pt_res, res in zip(pt_results, caffe2_results):
                    np.testing.assert_array_almost_equal(
                        pt_res["score"].tolist()[0],
                        [score[0] for score in res.values()],
                    )

            results = batch_predict_caffe2_model(
                snapshot_file.name, caffe2_model_file.name
            )
            self.assertEqual(4, len(results))
            assert_caffe2_results_correct(results)

            results = batch_predict_caffe2_model(
                snapshot_file.name, caffe2_model_file.name, cache_size=2
            )
            self.assertEqual(4, len(results))
            assert_caffe2_results_correct(results)

            results = batch_predict_caffe2_model(
                snapshot_file.name, caffe2_model_file.name, cache_size=-1
            )
            self.assertEqual(4, len(results))
            assert_caffe2_results_correct(results)
Example #13
        def test_load_checkpoint(self):
            with tempfile.NamedTemporaryFile() as checkpoint_file:
                train_data = tests_module.test_file("train_data_tiny.tsv")
                eval_data = tests_module.test_file("test_data_tiny.tsv")
                config = PyTextConfig(
                    task=DocumentClassificationTask.Config(data=Data.Config(
                        source=TSVDataSource.Config(
                            train_filename=train_data,
                            eval_filename=eval_data,
                            field_names=["label", "slots", "text"],
                        ))),
                    version=LATEST_VERSION,
                    save_snapshot_path=checkpoint_file.name,
                )
                task = create_task(config.task)
                model = task.model
                # test checkpoint saving and loading
                optimizer = create_optimizer(Adam.Config(), model)
                scheduler = create_scheduler(Scheduler.Config(), optimizer)
                training_state = TrainingState(
                    model=model,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    start_time=0,
                    epoch=0,
                    rank=0,
                    stage=Stage.TRAIN,
                    epochs_since_last_improvement=0,
                    best_model_state=None,
                    best_model_metric=None,
                    tensorizers=None,
                )

                checkpoint_path = checkpoint_file.name
                save(
                    config,
                    model,
                    None,
                    task.data.tensorizers,
                    training_state,
                    checkpoint_file,
                )
                task_restored, config_restored, training_state_restored = load(
                    checkpoint_path)
                optimizer_restored = training_state_restored.optimizer
                scheduler_restored = training_state_restored.scheduler
                self.assertOptimizerEqual(optimizer, optimizer_restored)
                self.assertIsNotNone(scheduler_restored)
                self.assertEqual(config, config_restored)
                self.assertModulesEqual(model, task_restored.model)
                model.eval()
                task_restored.model.eval()

                inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
                self.assertEqual(
                    model(*inputs).tolist(),
                    task_restored.model(*inputs).tolist())
Example #14
 def test_data_initializes_tensorsizers(self):
     tensorizers = {
         "tokens": TokenTensorizer(text_column="text"),
         "labels": LabelTensorizer(label_column="label"),
     }
     # verify TokenTensorizer isn't in an initialized state yet
     assert tensorizers["tokens"].vocab is None
     Data(self.data_source, tensorizers)
     # Tensorizers should have been initialized
     self.assertEqual(49, len(tensorizers["tokens"].vocab))
     self.assertEqual(7, len(tensorizers["labels"].vocab))
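
Conceptually, the initialization Data performs here is a single pass over the data source to collect each tensorizer's vocabulary, which is why the vocab sizes are already fixed when the constructor returns. A rough sketch of that idea with hypothetical row dicts, not PyText's Tensorizer code:

from collections import Counter

def build_vocab(rows, text_column="text", min_count=1):
    # Count tokens across the whole data source, then keep the frequent ones.
    counts = Counter(token for row in rows for token in row[text_column].split())
    return [token for token, count in counts.most_common() if count >= min_count]

vocab = build_vocab([{"text": "set an alarm"}, {"text": "snooze the alarm"}])
assert "alarm" in vocab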
Example #15
 def test_sort(self):
     data = Data(
         self.data_source,
         self.tensorizers,
         Batcher(train_batch_size=16),
         sort_key="tokens",
     )
     batches = list(data.batches(Stage.TRAIN))
     batch = next(iter(batches))
     _, seq_lens, _ = batch["tokens"]
     seq_lens = seq_lens.tolist()
     for i in range(len(seq_lens) - 1):
         self.assertTrue(seq_lens[i] >= seq_lens[i + 1])
     # make sure labels are also in the same order as the sorted tokens
     self.assertEqual(
         self.tensorizers["labels"].vocab[batch["labels"][1]],
         "reminder/set_reminder",
     )
     self.assertEqual(self.tensorizers["labels"].vocab[batch["labels"][8]],
                      "alarm/snooze_alarm")
Example #16
 def test_data_initializes_tensorsizers(self):
     tensorizers = {
         "tokens": WordTensorizer(column="text"),
         "labels": LabelTensorizer(column="label"),
     }
     with self.assertRaises(AttributeError):
         # verify WordTensorizer isn't in an initialized state yet
         tensorizers["tokens"].vocab
     Data(self.data_source, tensorizers)
     # Tensorizers should have been initialized
     self.assertEqual(49, len(tensorizers["tokens"].vocab))
     self.assertEqual(7, len(tensorizers["labels"].labels))
Example #17
    def test_load_checkpoint_in_dist_training(self):
        with tempfile.NamedTemporaryFile() as checkpoint_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(data=Data.Config(
                    source=BlockShardedTSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    ))),
                version=LATEST_VERSION,
                save_snapshot_path=checkpoint_file.name,
            )
            task = create_task(config.task)
            model = task.model
            # test checkpoint saving and loading
            optimizer = create_optimizer(Adam.Config(), model)
            scheduler = create_scheduler(Scheduler.Config(), optimizer)
            training_state = TrainingState(
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
                start_time=0,
                epoch=0,
                rank=0,
                stage=Stage.TRAIN,
                epochs_since_last_improvement=0,
                best_model_state=None,
                best_model_metric=None,
                tensorizers=task.data.tensorizers,
            )

            id = "epoch-1"
            saved_path = save(config, model, None, task.data.tensorizers,
                              training_state, id)
            new_rank = 2
            new_world_size = 4
            task_restored, config_restored, training_state_restored = load(
                saved_path, rank=new_rank, world_size=new_world_size)
            self.assertCheckpointEqual(
                model,
                config,
                training_state,
                task_restored.model,
                config_restored,
                training_state_restored,
            )
            self.assertEqual(task_restored.data.data_source.rank, new_rank)
            self.assertEqual(task_restored.data.data_source.world_size,
                             new_world_size)
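
The rank and world_size assertions above reflect how a sharded data source splits work across distributed workers. A hypothetical illustration of row-level sharding (not BlockShardedTSVDataSource itself):

def shard_rows(rows, rank, world_size):
    # Each worker keeps only the rows whose index maps to its rank.
    return [row for index, row in enumerate(rows) if index % world_size == rank]

# With world_size=4, rank=2 sees rows 2 and 6 of a ten-row source.
assert shard_rows(list(range(10)), rank=2, world_size=4) == [2, 6]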
Example #18
    def test_create_batches_with_cache(self):
        data = Data(
            self.data_source,
            self.tensorizers,
            Batcher(train_batch_size=1),
            in_memory=True,
        )
        list(data.batches(Stage.TRAIN))
        self.assertEqual(10, len(data.numberized_cache[Stage.TRAIN]))

        data1 = Data(
            self.data_source,
            self.tensorizers,
            Batcher(train_batch_size=1),
            in_memory=True,
        )
        with self.assertRaises(Exception):
            # Concurrent iteration not supported
            batches1 = data1.batches(Stage.TRAIN)
            batches2 = data1.batches(Stage.TRAIN)
            for _ in batches1:
                for _ in batches2:
                    continue
Example #19
    def test_load_saved_model(self):
        with tempfile.NamedTemporaryFile() as snapshot_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(
                    data=Data.Config(
                        source=TSVDataSource.Config(
                            train_filename=train_data,
                            eval_filename=eval_data,
                            field_names=["label", "slots", "text"],
                        )
                    )
                ),
                version=LATEST_VERSION,
                save_snapshot_path=snapshot_file.name,
            )
            task = create_task(config.task)
            model = task.model

            save(config, model, meta=None, tensorizers=task.data.tensorizers)
            task2, config2, training_state_none = load(snapshot_file.name)

            self.assertEqual(config, config2)
            self.assertModulesEqual(model, task2.model)
            self.assertIsNone(training_state_none)
            model.eval()
            task2.model.eval()

            inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
            self.assertEqual(model(*inputs).tolist(), task2.model(*inputs).tolist())

        def assertOptimizerEqual(self, optim_1, optim_2, msg=None):
            self.assertTrue(
                isinstance(optim_1, Optimizer) and isinstance(optim_2, Optimizer), msg
            )
            state_dict_1 = optim_1.state_dict()
            state_dict_2 = optim_2.state_dict()
            self.assertEqual(len(state_dict_1), len(state_dict_2))
            for key_1, val_1 in optim_1.state_dict().items():
                self.assertEqual(val_1, state_dict_2[key_1], msg)

        def test_load_checkpoint(self):
            with tempfile.NamedTemporaryFile() as checkpoint_file:
                train_data = tests_module.test_file("train_data_tiny.tsv")
                eval_data = tests_module.test_file("test_data_tiny.tsv")
                config = PyTextConfig(
                    task=DocumentClassificationTask.Config(
                        data=Data.Config(
                            source=TSVDataSource.Config(
                                train_filename=train_data,
                                eval_filename=eval_data,
                                field_names=["label", "slots", "text"],
                            )
                        )
                    ),
                    version=LATEST_VERSION,
                    save_snapshot_path=checkpoint_file.name,
                )
                task = create_task(config.task)
                model = task.model
                # test checkpoint saving and loading
                optimizer = create_optimizer(Adam.Config(), model)
                scheduler = create_scheduler(Scheduler.Config(), optimizer)
                training_state = TrainingState(
                    model=model,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    start_time=0,
                    epoch=0,
                    rank=0,
                    stage=Stage.TRAIN,
                    epochs_since_last_improvement=0,
                    best_model_state=None,
                    best_model_metric=None,
                    tensorizers=task.data.tensorizers,
                )

                checkpoint_path = checkpoint_file.name

                save(
                    config,
                    model,
                    None,
                    task.data.tensorizers,
                    training_state,
                    "epoch-1",
                )
                task_restored, config_restored, training_state_restored = load(
                    checkpoint_path
                )
                optimizer_restored = training_state_restored.optimizer
                scheduler_restored = training_state_restored.scheduler
                self.assertOptimizerEqual(optimizer, optimizer_restored)
                self.assertIsNotNone(scheduler_restored)
                self.assertEqual(config, config_restored)
                self.assertModulesEqual(model, task_restored.model)
                model.eval()
                task_restored.model.eval()

                inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
                self.assertEqual(
                    model(*inputs).tolist(), task_restored.model(*inputs).tolist()
                )
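
For reference, the checkpoint round-trips these tests drive correspond to the plain PyTorch pattern of saving and restoring state_dicts together. The sketch below is generic (the path, model, optimizer, and scheduler are hypothetical), not PyText's save/load functions:

import torch

def save_checkpoint(path, model, optimizer, scheduler, epoch):
    # Persist everything needed to resume training in one file.
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "epoch": epoch,
        },
        path,
    )

def load_checkpoint(path, model, optimizer, scheduler):
    # Restore state in place and return the epoch to resume from.
    state = torch.load(path)
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    scheduler.load_state_dict(state["scheduler"])
    return state["epoch"]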