def test_creation_non_float(self):
    """FeatureNormalizeScale must reject a non-float type on either side.

    Both the feature's own type and its source feature's type have to be
    float; a string on either end raises FeatureDefinitionException.
    """
    feature_name = 'scale'
    string_type = ft.FEATURE_TYPE_STRING
    float_type = ft.FEATURE_TYPE_FLOAT
    float_source = ft.FeatureSource('Source', float_type)
    string_source = ft.FeatureSource('Source', string_type)
    # String target type over a float source -> rejected.
    with self.assertRaises(ft.FeatureDefinitionException):
        ft.FeatureNormalizeScale(feature_name, string_type, float_source)
    # Float target type over a string source -> rejected.
    with self.assertRaises(ft.FeatureDefinitionException):
        ft.FeatureNormalizeScale(feature_name, float_type, string_source)
def test_filtering(self):
    # Build a TensorDefinition that mixes every learning category
    # (categorical, binary, continuous, label) and verify the per-category
    # filter helpers return exactly the expected features.
    name_t = 'test-tensor'
    f1 = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    f2 = ft.FeatureIndex('test-feature-2', ft.FEATURE_TYPE_INT_8, f1)      # categorical
    f3 = ft.FeatureSource('test-feature-3', ft.FEATURE_TYPE_STRING)
    f4 = ft.FeatureOneHot('test-feature-4', ft.FEATURE_TYPE_INT_8, f3)    # binary
    f5 = ft.FeatureSource('test-feature-5', ft.FEATURE_TYPE_FLOAT)        # continuous
    f6 = ft.FeatureNormalizeScale('test-feature-6', ft.FEATURE_TYPE_FLOAT, f5)
    f7 = ft.FeatureNormalizeStandard('test-feature-7', ft.FEATURE_TYPE_FLOAT, f5)
    f8 = ft.FeatureLabelBinary('test-feature-8', ft.FEATURE_TYPE_INT_8, f2)  # label
    t = ft.TensorDefinition(name_t, [f1, f2, f3, f4, f5, f6, f7, f8])
    self.assertEqual(
        len(t.learning_categories), 4, f'Should be 4 categories. Got {len(t.learning_categories)}')
    self.assertListEqual(t.categorical_features(), [f2])
    self.assertListEqual(t.binary_features(), [f4])
    self.assertListEqual(t.continuous_features(), [f5, f6, f7])
    self.assertListEqual(t.label_features(), [f8])
    # Should fail because the Tensor Definition is ready for inference.
    # NOTE(review): all five calls sit inside one assertRaises block, so only
    # the first call actually executes before the exception is caught —
    # presumably intentional, but worth confirming each call raises.
    with self.assertRaises(ft.TensorDefinitionException):
        t.categorical_features(True)
        t.binary_features(True)
        t.continuous_features(True)
        t.label_features(True)
        t.filter_features(ft.LEARNING_CATEGORY_CATEGORICAL, True)
class TestClassSampler(unittest.TestCase):
    """Class Sampler test cases """
    # Shared fixture: a small source schema plus the derived features built
    # from it. 'fraud' is both a source feature and the binary label input.
    fraud = ft.FeatureSource('Fraud', ft.FEATURE_TYPE_INT_8)
    s_features = [
        ft.FeatureSource('Amount', ft.FEATURE_TYPE_FLOAT),
        ft.FeatureSource('Card', ft.FEATURE_TYPE_STRING),
        ft.FeatureSource('MCC', ft.FEATURE_TYPE_CATEGORICAL),
        ft.FeatureSource('Country', ft.FEATURE_TYPE_CATEGORICAL),
        fraud
    ]
    # NOTE(review): FeatureOneHot/FeatureLabelBinary are called with 2 args
    # here but with 3 (explicit type) elsewhere in this file — presumably
    # both overloads exist; verify against the feature API.
    d_features = [
        ft.FeatureNormalizeScale('Amount_Scale', ft.FEATURE_TYPE_FLOAT_32, s_features[0]),
        ft.FeatureOneHot('MCC_OH', s_features[2]),
        ft.FeatureIndex('Country_Index', ft.FEATURE_TYPE_INT_16, s_features[3]),
        ft.FeatureLabelBinary('Fraud', fraud)
    ]

    def test_creation_base(self):
        """Build a ClassSampler from a numpy list and check its over-sampler."""
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as engine:
            frame = engine.from_csv(tdb, file, inference=False)
            frame = engine.from_df(tdd, frame, inference=False)
            npl = engine.to_numpy_list(tdd, frame)
            sampler = pt.ClassSampler(tdd, npl)
            self.assertIsInstance(sampler, pt.ClassSampler, f'Was expecting ClassSampler type {type(sampler)}')
            weighted = sampler.over_sampler(replacement=False)
            self.assertIsInstance(weighted, data.WeightedRandomSampler,
                                  f'Was expecting Weighted Random Sampler {type(weighted)}')
            self.assertEqual(len(weighted), len(npl), f'Length not correct {len(weighted)}')
            # Without replacement every row index must come out exactly once.
            self.assertListEqual(sorted(list(weighted)), list(range(len(npl))),
                                 f'Each index should be in the weight list')

    def test_creation_bad(self):
        """A ClassSampler built with a mismatching TensorDefinition must fail."""
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as engine:
            frame = engine.from_csv(tdb, file, inference=False)
            frame = engine.from_df(tdd, frame, inference=False)
            npl = engine.to_numpy_list(tdd, frame)
            # Should fail because wrong tensor definition. It does not match the numpy list
            with self.assertRaises(pt.PyTorchTrainException):
                _ = pt.ClassSampler(tdb, npl)
def test_creation_base(self):
    """Happy-path construction of a FeatureNormalizeScale.

    A freshly built scale feature carries its name/type, is not yet
    inference-ready (min/max unset), is continuous, and is hashable.
    """
    expected_name = 'scale'
    expected_type = ft.FEATURE_TYPE_FLOAT
    base_feature = ft.FeatureSource('Source', ft.FEATURE_TYPE_FLOAT)
    scale_feature = ft.FeatureNormalizeScale(expected_name, expected_type, base_feature)
    self.assertIsInstance(scale_feature, ft.FeatureNormalizeScale, f'Incorrect Type {type(scale_feature)}')
    self.assertEqual(scale_feature.name, expected_name, f'Scale feature name incorrect {expected_name}')
    self.assertEqual(scale_feature.type, expected_type, f'Scale feature type should have been {expected_type}')
    self.assertEqual(scale_feature.inference_ready, False, f'Scale feature should NOT be inference ready')
    # min/max only get filled in once the feature has seen data.
    self.assertIsNone(scale_feature.minimum, f'Scale minimum should be None')
    self.assertIsNone(scale_feature.maximum, f'Scale maximum should be None')
    self.assertEqual(scale_feature.learning_category, ft.LEARNING_CATEGORY_CONTINUOUS, f'Wrong Learning category')
    self.assertIsInstance(hash(scale_feature), int, f'Hash function not working')
class TestNumpyDataSet(unittest.TestCase):
    """Numpy Dataset test cases """
    # Shared fixture: source schema plus derived features built from it.
    # 'fraud' doubles as a source feature and the binary-label input.
    fraud = ft.FeatureSource('Fraud', ft.FEATURE_TYPE_INT_8)
    s_features = [
        ft.FeatureSource('Amount', ft.FEATURE_TYPE_FLOAT),
        ft.FeatureSource('Card', ft.FEATURE_TYPE_STRING),
        ft.FeatureSource('MCC', ft.FEATURE_TYPE_CATEGORICAL),
        ft.FeatureSource('Country', ft.FEATURE_TYPE_CATEGORICAL),
        fraud
    ]
    d_features = [
        ft.FeatureNormalizeScale('Amount_Scale', ft.FEATURE_TYPE_FLOAT_32, s_features[0]),
        ft.FeatureOneHot('MCC_OH', s_features[2]),
        ft.FeatureIndex('Country_Index', ft.FEATURE_TYPE_INT_16, s_features[3]),
        ft.FeatureLabelBinary('Fraud', fraud)
    ]

    def test_creation_base(self):
        """A NumpyListDataSet must expose the numpy list as per-category Tensors."""
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            ds = pt.NumpyListDataSet(tdd, npl)
            self.assertEqual(len(ds), len(npl), f'Length of DS is wrong. Got {len(ds)}. Expected {len(npl)}')
            t = ds[0]
            self.assertIsInstance(t, list, '__get_item__ should have returned a list')
            self.assertIsInstance(t[0], torch.Tensor, '__get_item__ should have returned a list of Tensors')
            self.assertEqual(len(t), len(tdd.learning_categories), 'Number of lists must be number of Learning cats')
            # Test Shapes: each tensor's first dim must equal the numpy
            # array's second dim (0 for 1-D arrays / scalar tensors).
            # (Renamed loop vars so they no longer shadow 't' above.)
            for arr, tensor in zip(npl.lists, ds[0]):
                ns = arr.shape[1] if len(arr.shape) > 1 else 0
                ts = 0 if len(list(tensor.shape)) == 0 else list(tensor.shape)[0]
                self.assertEqual(ns, ts)
            # Test data types. (Message typo fixed: 'don not' -> 'do not'.)
            for i, d in enumerate(d373c7.pytorch.data._DTypeHelper.get_dtypes(tdd)):
                self.assertEqual(ds[0][i].dtype, d, f'Default data types do not match {i}, expected {d}')

    def test_creation_bad(self):
        """Building a dataset from a mismatching TensorDefinition must fail."""
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            # Try building off of the wrong tensor definition
            with self.assertRaises(pt.PyTorchTrainException):
                _ = pt.NumpyListDataSet(tdb, npl)

    def test_creation_data_loader(self):
        """The data_loader must batch the dataset with the right dtypes/sizes."""
        file = FILES_DIR + 'engine_test_base_comma.csv'
        bs = 3  # batch size
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            ds = pt.NumpyListDataSet(tdd, npl)
            dl = ds.data_loader(torch.device('cpu'), bs)
            t = next(iter(dl))
            self.assertEqual(len(t), len(tdd.learning_categories))
            # Test data types. (Message typo fixed: 'don not' -> 'do not'.)
            for i, d in enumerate(d373c7.pytorch.data._DTypeHelper.get_dtypes(tdd)):
                self.assertEqual(t[i].dtype, d, f'Default data types do not match {i}, expected {d}')
            # Check batch-size
            for i, te in enumerate(t):
                self.assertEqual(te.shape[0], bs, f'Batch size does not match item {i}. Got {te.shape[0]}')