def test_create_float_list_tensor(self):
    """Check numberizing and FP16 tensorizing of parsed float lists.

    Each raw string is parsed with ``load_float_list`` and numberized; the
    result must equal the expected Python list. The same rows are then
    tensorized while ``precision.FP16_ENABLED`` is switched on, and the
    resulting batch must have size ``[len(tests), 2]`` and dtype
    ``torch.float16``.
    """
    tensorizer = FloatListTensorizer(
        column="dense", dim=2, error_check=True, normalize=False
    )
    # NOTE(review): the rows labelled "multiple spaces" are identical to the
    # single-space rows here -- confirm the intended literals.
    tests = [
        ("[0.1,0.2]", [0.1, 0.2]),  # comma
        ("[0.1, 0.2]", [0.1, 0.2]),  # comma with single space
        ("[0.1, 0.2]", [0.1, 0.2]),  # comma with multiple spaces
        ("[0.1 0.2]", [0.1, 0.2]),  # space
        ("[0.1 0.2]", [0.1, 0.2]),  # multiple spaces
        ("[ 0.1 0.2]", [0.1, 0.2]),  # space after [
        ("[0.1 0.2 ]", [0.1, 0.2]),  # space before ]
        ("[0. 1.]", [0.0, 1.0]),  # 0., 1.
    ]
    for raw, expected in tests:
        row = {"dense": load_float_list(raw)}
        numberized = tensorizer.numberize(row)
        self.assertEqual(expected, numberized)

    # FP16_ENABLED is a module-level flag: restore it in ``finally`` so a
    # failing assertion below cannot leak FP16 mode into other tests.
    fp16_before = precision.FP16_ENABLED
    precision.FP16_ENABLED = True
    try:
        batch = [
            tensorizer.numberize({"dense": load_float_list(raw)})
            for raw, _ in tests
        ]
        tensor = tensorizer.tensorize(batch)
        # Derive the expected batch size from the table instead of
        # hard-coding it, so adding a case does not break this check.
        self.assertEqual(list(tensor.size()), [len(tests), 2])
        self.assertEqual(tensor.dtype, torch.float16)
    finally:
        precision.FP16_ENABLED = fp16_before
def test_float_list_tensor_prepare_input(self):
    """Check that ``prepare_input`` returns the parsed float list for a row."""
    tensorizer = FloatListTensorizer(
        column="dense", dim=2, error_check=True, normalize=False
    )
    cases = [("[0.1,0.2]", [0.1, 0.2])]
    for raw_text, want in cases:
        prepared = tensorizer.prepare_input({"dense": load_float_list(raw_text)})
        self.assertEqual(want, prepared)
def test_create_float_list_tensor(self):
    """Check that every raw string variant numberizes to ``[0.1, 0.2]``.

    Rows carry the raw string directly under the ``dense`` column and are
    passed to ``numberize`` one at a time.
    """
    tensorizer = FloatListTensorizer(column="dense", dim=2, error_check=True)
    # NOTE(review): the entries labelled "multiple spaces" are identical to
    # the single-space entries here -- confirm the intended literals.
    raw_values = [
        "[0.1,0.2]",  # comma
        "[0.1, 0.2]",  # comma with single space
        "[0.1, 0.2]",  # comma with multiple spaces
        "[0.1 0.2]",  # space
        "[0.1 0.2]",  # multiple spaces
        "[ 0.1 0.2]",  # space after [
        "[0.1 0.2 ]",  # space before ]
    ]
    for raw in raw_values:
        numberized = tensorizer.numberize({"dense": raw})
        self.assertEqual([0.1, 0.2], numberized)
def _get_config_with_export_list(
    self,
    task_class: Type[NewTask],
    model_class: Type[Model],
    test_file_metadata: TestFileMetadata,
) -> PyTextConfig:
    """Build a one-epoch PyTextConfig with a TorchScript export path.

    The file named by ``test_file_metadata`` is used for train, eval and
    test; batches are of size 1 via ``PoolingBatcher``; the model's only
    configured input is a ``dense`` ``FloatListTensorizer`` with error
    checking on. TensorBoard and CUDA are disabled.
    """
    data_file = test_file_metadata.filename
    source_config = TSVDataSource.Config(
        train_filename=data_file,
        eval_filename=data_file,
        test_filename=data_file,
        field_names=test_file_metadata.field_names,
    )
    batcher_config = PoolingBatcher.Config(train_batch_size=1, test_batch_size=1)
    data_config = Data.Config(source=source_config, batcher=batcher_config)
    trainer_config = TaskTrainer.Config(epochs=1)

    dense_config = FloatListTensorizer.Config(
        column=test_file_metadata.dense_col_name,
        error_check=True,
        dim=test_file_metadata.dense_feat_dim,
    )
    # Instantiate whatever inputs class the model's Config declares.
    inputs_class = type(model_class.Config.inputs)
    model_config = model_class.Config(inputs=inputs_class(dense=dense_config))

    task_config = task_class.Config(
        data=data_config,
        trainer=trainer_config,
        model=model_config,
    )
    export_config = ExportConfig(
        export_torchscript_path="/tmp/model_torchscript.pt"
    )
    return PyTextConfig(
        task=task_config,
        use_tensorboard=False,
        use_cuda_if_available=False,
        export=export_config,
        version=LATEST_VERSION,
    )
def _get_pytext_config(
    self,
    test_file_name: TestFileName,
    task_class: Type[NewTask],
    model_class: Type[Model],
) -> PyTextConfig:
    """Build a one-epoch PyTextConfig for the named test file.

    Metadata is looked up with ``get_test_file_metadata``; the same file is
    used for train, eval and test. The model's only configured input is a
    ``dense`` ``FloatListTensorizer``. TensorBoard and CUDA are disabled.
    """
    metadata = get_test_file_metadata(test_file_name)
    data_file = metadata.filename
    source_config = TSVDataSource.Config(
        train_filename=data_file,
        eval_filename=data_file,
        test_filename=data_file,
        field_names=metadata.field_names,
    )
    # Use Batcher to avoid shuffling.
    batcher_config = Batcher.Config()
    data_config = Data.Config(source=source_config, batcher=batcher_config)
    trainer_config = TaskTrainer.Config(epochs=1)

    dense_config = FloatListTensorizer.Config(
        column=metadata.dense_col_name,
        dim=metadata.dense_feat_dim,
    )
    # Instantiate whatever inputs class the model's Config declares.
    inputs_class = type(model_class.Config.inputs)
    model_config = model_class.Config(inputs=inputs_class(dense=dense_config))

    task_config = task_class.Config(
        data=data_config,
        trainer=trainer_config,
        model=model_config,
    )
    return PyTextConfig(
        task=task_config,
        use_tensorboard=False,
        use_cuda_if_available=False,
        version=LATEST_VERSION,
    )
def test_create_float_list_tensor(self):
    """Check that parsed float lists numberize to the expected values.

    Each raw string is parsed with ``load_float_list``, placed under the
    ``dense`` column and numberized; the result must equal the paired
    expected list.
    """
    tensorizer = FloatListTensorizer(column="dense", dim=2, error_check=True)
    # NOTE(review): the cases labelled "multiple spaces" are identical to
    # the single-space cases here -- confirm the intended literals.
    cases = [
        ("[0.1,0.2]", [0.1, 0.2]),  # comma
        ("[0.1, 0.2]", [0.1, 0.2]),  # comma with single space
        ("[0.1, 0.2]", [0.1, 0.2]),  # comma with multiple spaces
        ("[0.1 0.2]", [0.1, 0.2]),  # space
        ("[0.1 0.2]", [0.1, 0.2]),  # multiple spaces
        ("[ 0.1 0.2]", [0.1, 0.2]),  # space after [
        ("[0.1 0.2 ]", [0.1, 0.2]),  # space before ]
        ("[0. 1.]", [0.0, 1.0]),  # 0., 1.
    ]
    for raw_text, want in cases:
        parsed = load_float_list(raw_text)
        got = tensorizer.numberize({"dense": parsed})
        self.assertEqual(want, got)
def test_batch_predict_caffe2_model(self):
    """Compare ``batch_predict_caffe2_model`` output with ``task.predict``.

    A document classification task is created from the tiny TSV fixtures,
    exported to a temporary Caffe2 file and saved as a snapshot. Batch
    prediction is then run with the default arguments, ``cache_size=2`` and
    ``cache_size=-1``; each run must return 4 results whose scores are
    almost equal to the scores from ``task.predict``.
    """
    with tempfile.NamedTemporaryFile() as snapshot_file, \
            tempfile.NamedTemporaryFile() as caffe2_model_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")

        model_inputs = DocModel.Config.ModelInput(
            tokens=TokenTensorizer.Config(),
            dense=FloatListTensorizer.Config(
                column="dense", dim=1, error_check=True
            ),
            labels=LabelTensorizer.Config(),
        )
        source_config = TSVDataSource.Config(
            train_filename=train_data,
            eval_filename=eval_data,
            test_filename=eval_data,
            field_names=["label", "slots", "text", "dense"],
        )
        task_config = DocumentClassificationTask.Config(
            model=DocModel.Config(inputs=model_inputs),
            data=Data.Config(source=source_config),
        )
        config = PyTextConfig(
            task=task_config,
            version=21,
            save_snapshot_path=snapshot_file.name,
            export_caffe2_path=caffe2_model_file.name,
        )

        task = create_task(config.task)
        task.export(task.model, caffe2_model_file.name)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)

        # Reference predictions that every Caffe2 run is compared against.
        pt_results = task.predict(task.data.data_source.test)

        def assert_caffe2_results_correct(caffe2_results):
            for pt_res, res in zip(pt_results, caffe2_results):
                reference_scores = pt_res["score"].tolist()[0]
                caffe2_scores = [score[0] for score in res.values()]
                np.testing.assert_array_almost_equal(
                    reference_scores, caffe2_scores
                )

        # Default call first, then the two explicit cache sizes.
        for extra_kwargs in ({}, {"cache_size": 2}, {"cache_size": -1}):
            results = batch_predict_caffe2_model(
                snapshot_file.name, caffe2_model_file.name, **extra_kwargs
            )
            self.assertEqual(4, len(results))
            assert_caffe2_results_correct(results)
def test_create_normalized_float_list_tensor(self):
    """Check normalizer statistics and the numberized output of one row.

    The tensorizer is created with ``normalize=True`` and initialized from
    ``train_dense_features_tiny.tsv`` via ``self._initialize_tensorizer``.
    The row count, sums, squared sums, averages and standard deviations on
    ``tensorizer.normalizer`` are compared with expected values, followed by
    the result of numberizing a single row. All float vectors are compared
    after formatting each element to 4 decimal places.
    """

    def round_list(values):
        return [float("%.4f" % value) for value in values]

    def assert_rounded_equal(expected, actual):
        self.assertEqual(round_list(expected), round_list(actual))

    dense_file = SafeFileWrapper(
        tests_module.test_file("train_dense_features_tiny.tsv")
    )
    data = TSVDataSource(
        dense_file,
        eval_file=None,
        field_names=["label", "slots", "text", "dense_feat"],
        schema={"text": str, "label": str, "dense_feat": List[float]},
    )
    tensorizer = FloatListTensorizer(
        column="dense_feat", dim=10, error_check=True, normalize=True
    )
    self._initialize_tensorizer(tensorizer, data)

    expected_sums = [
        7.56409, 8.2388, 0.5531, 0.2403, 1.03130,
        6.2888, 3.1595, 0.1538, 0.2403, 5.3463,
    ]
    expected_squared_sums = [
        5.80172, 7.57586, 0.30591, 0.05774, 0.52762,
        5.22811, 2.51727, 0.02365, 0.05774, 4.48798,
    ]
    expected_avgs = [
        0.75640, 0.82388, 0.05531, 0.02403, 0.10313,
        0.62888, 0.31595, 0.01538, 0.02403, 0.53463,
    ]
    expected_stddevs = [
        0.08953, 0.28072, 0.16593, 0.07209, 0.20524,
        0.35682, 0.38974, 0.04614, 0.07209, 0.40369,
    ]

    self.assertEqual(10, tensorizer.normalizer.num_rows)
    assert_rounded_equal(expected_sums, tensorizer.normalizer.feature_sums)
    assert_rounded_equal(
        expected_squared_sums, tensorizer.normalizer.feature_squared_sums
    )
    assert_rounded_equal(expected_avgs, tensorizer.normalizer.feature_avgs)
    assert_rounded_equal(
        expected_stddevs, tensorizer.normalizer.feature_stddevs
    )

    row = [0.64840776, 0.7575, 0.5531, 0.2403, 0, 0.9481, 0, 0.1538, 0.2403, 0.3564]
    expected_output = [
        -1.20619, -0.23646, 2.99999, 3.0, -0.50246,
        0.89462, -0.81066, 2.99999, 3.0, -0.44149,
    ]
    output = tensorizer.numberize({"dense_feat": row})
    assert_rounded_equal(expected_output, output)