def test_creation_bad(self):
    # A ClassSampler must reject a TensorDefinition that differs from the
    # one the numpy list was built with.
    csv_path = FILES_DIR + 'engine_test_base_comma.csv'
    base_td = ft.TensorDefinition('Base', self.s_features)
    derived_td = ft.TensorDefinition('Derived', self.d_features)
    with en.EnginePandasNumpy() as engine:
        base_df = engine.from_csv(base_td, csv_path, inference=False)
        derived_df = engine.from_df(derived_td, base_df, inference=False)
        np_list = engine.to_numpy_list(derived_td, derived_df)
    # Should fail because wrong tensor definition. It does not match the numpy list
    with self.assertRaises(pt.PyTorchTrainException):
        pt.ClassSampler(base_td, np_list)
def test_creation_bad(self):
    # A NumpyListDataSet must reject a TensorDefinition that differs from the
    # one the numpy list was built with.
    csv_path = FILES_DIR + 'engine_test_base_comma.csv'
    base_td = ft.TensorDefinition('Base', self.s_features)
    derived_td = ft.TensorDefinition('Derived', self.d_features)
    with en.EnginePandasNumpy() as engine:
        base_df = engine.from_csv(base_td, csv_path, inference=False)
        derived_df = engine.from_df(derived_td, base_df, inference=False)
        np_list = engine.to_numpy_list(derived_td, derived_df)
    # Try building off of the wrong tensor definition
    with self.assertRaises(pt.PyTorchTrainException):
        pt.NumpyListDataSet(base_td, np_list)
def test_creation_bad(self):
    # Combining two TensorDefinitions that each contain a label feature into
    # a TensorDefinitionMulti must raise a TensorDefinitionException.
    first_source = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_FLOAT)
    first_label = ft.FeatureLabelBinary('test-feature-3', ft.FEATURE_TYPE_INT_8, first_source)
    first_td = ft.TensorDefinition('test-tensor-1', [first_source, first_label])

    second_source = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_FLOAT)
    second_extra = ft.FeatureSource('test-feature-2', ft.FEATURE_TYPE_STRING)
    second_label = ft.FeatureLabelBinary('test-feature-3', ft.FEATURE_TYPE_INT_8, second_source)
    second_td = ft.TensorDefinition('test-tensor-2', [second_source, second_extra, second_label])

    # 2 TensorDefinitions with labels
    with self.assertRaises(ft.TensorDefinitionException):
        ft.TensorDefinitionMulti([first_td, second_td])
def test_creation_base(self):
    # Build a ClassSampler from the derived features and check that its
    # over-sampler (without replacement) is a WeightedRandomSampler that
    # yields every index of the numpy list exactly once.
    csv_path = FILES_DIR + 'engine_test_base_comma.csv'
    base_td = ft.TensorDefinition('Base', self.s_features)
    derived_td = ft.TensorDefinition('Derived', self.d_features)
    with en.EnginePandasNumpy() as engine:
        base_df = engine.from_csv(base_td, csv_path, inference=False)
        derived_df = engine.from_df(derived_td, base_df, inference=False)
        np_list = engine.to_numpy_list(derived_td, derived_df)

    class_sampler = pt.ClassSampler(derived_td, np_list)
    self.assertIsInstance(
        class_sampler, pt.ClassSampler, f'Was expecting ClassSampler type {type(class_sampler)}'
    )

    sampler = class_sampler.over_sampler(replacement=False)
    self.assertIsInstance(
        sampler, data.WeightedRandomSampler, f'Was expecting Weighted Random Sampler {type(sampler)}'
    )
    self.assertEqual(len(sampler), len(np_list), f'Length not correct {len(sampler)}')

    # Sorting the drawn indices makes the comparison independent of draw order.
    expected_indices = list(range(len(np_list)))
    self.assertListEqual(sorted(sampler), expected_indices, 'Each index should be in the weight list')
def test_filtering(self):
    # Check that a TensorDefinition sorts its features into the expected
    # learning categories, and that every "expanded" filter call raises a
    # TensorDefinitionException for this definition.
    name_t = 'test-tensor'
    f1 = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    f2 = ft.FeatureIndex('test-feature-2', ft.FEATURE_TYPE_INT_8, f1)
    f3 = ft.FeatureSource('test-feature-3', ft.FEATURE_TYPE_STRING)
    f4 = ft.FeatureOneHot('test-feature-4', ft.FEATURE_TYPE_INT_8, f3)
    f5 = ft.FeatureSource('test-feature-5', ft.FEATURE_TYPE_FLOAT)
    f6 = ft.FeatureNormalizeScale('test-feature-6', ft.FEATURE_TYPE_FLOAT, f5)
    f7 = ft.FeatureNormalizeStandard('test-feature-7', ft.FEATURE_TYPE_FLOAT, f5)
    f8 = ft.FeatureLabelBinary('test-feature-8', ft.FEATURE_TYPE_INT_8, f2)
    t = ft.TensorDefinition(name_t, [f1, f2, f3, f4, f5, f6, f7, f8])

    self.assertEqual(
        len(t.learning_categories), 4, f'Should be 4 categories. Got {len(t.learning_categories)}'
    )
    self.assertListEqual(t.categorical_features(), [f2])
    self.assertListEqual(t.binary_features(), [f4])
    self.assertListEqual(t.continuous_features(), [f5, f6, f7])
    self.assertListEqual(t.label_features(), [f8])

    # Should fail because the Tensor Definition is ready for inference.
    # NOTE(review): presumably this means "is NOT ready for inference" - confirm.
    # Each call gets its own assertRaises context: a shared context ends at the
    # first exception, so the remaining calls would never be executed.
    with self.assertRaises(ft.TensorDefinitionException):
        t.categorical_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.binary_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.continuous_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.label_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.filter_features(ft.LEARNING_CATEGORY_CATEGORICAL, True)
def test_len(self):
    # len() of a TensorDefinition must equal the number of features it was
    # created with.
    features = [
        ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING),
        ft.FeatureSource('test-feature-2', ft.FEATURE_TYPE_STRING),
    ]
    tensor_def = ft.TensorDefinition('test-tensor', list(features))
    self.assertEqual(
        len(tensor_def), len(features), f'Tensor definition length not working. Got {len(tensor_def)}'
    )
def test_remove(self):
    # After remove(), the removed feature must no longer be listed in the
    # TensorDefinition's features.
    kept = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    removed = ft.FeatureSource('test-feature-2', ft.FEATURE_TYPE_STRING)
    tensor_def = ft.TensorDefinition('test-tensor', [kept, removed])
    tensor_def.remove(removed)
    self.assertNotIn(removed, tensor_def.features, 'Tensor Definition Feature Removal failed')
def test_overlap_base_feature(self):
    # Should fail because the base feature is shared: both the index feature
    # and the one-hot feature are derived from the same source feature.
    source = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    indexed = ft.FeatureIndex('test-feature-2', ft.FEATURE_TYPE_INT_8, source)
    one_hot = ft.FeatureOneHot('test-feature-3', ft.FEATURE_TYPE_INT_8, source)
    with self.assertRaises(ft.TensorDefinitionException):
        ft.TensorDefinition('test-tensor', [source, indexed, one_hot])
def test_creation_data_loader(self):
    # Create a CPU data loader from a NumpyListDataSet and check the first
    # batch: one tensor per learning category, the default dtypes, and the
    # requested batch size.
    csv_path = FILES_DIR + 'engine_test_base_comma.csv'
    batch_size = 3
    base_td = ft.TensorDefinition('Base', self.s_features)
    derived_td = ft.TensorDefinition('Derived', self.d_features)
    with en.EnginePandasNumpy() as engine:
        base_df = engine.from_csv(base_td, csv_path, inference=False)
        derived_df = engine.from_df(derived_td, base_df, inference=False)
        np_list = engine.to_numpy_list(derived_td, derived_df)

    data_set = pt.NumpyListDataSet(derived_td, np_list)
    loader = data_set.data_loader(torch.device('cpu'), batch_size)
    batch = next(iter(loader))
    self.assertEqual(len(batch), len(derived_td.learning_categories))

    # Test data types.
    expected_dtypes = d373c7.pytorch.data._DTypeHelper.get_dtypes(derived_td)
    for idx, dtype in enumerate(expected_dtypes):
        self.assertEqual(
            batch[idx].dtype, dtype, f'Default data types don not match {idx}, expected {dtype}'
        )

    # Check batch-size
    for idx, tensor in enumerate(batch):
        self.assertEqual(
            tensor.shape[0], batch_size, f'Batch size does not match item {idx}. Got {tensor.shape[0]}'
        )
def test_creation(self):
    # Create a TensorDefinitionMulti from one definition without a label (t1)
    # and one with a label (t2). Check the type, that the member definitions
    # come back in order, and that the labelled definition is reported as
    # the label tensor definition.
    name_t1 = 'test-tensor-1'
    f1 = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    f2 = ft.FeatureSource('test-feature-2', ft.FEATURE_TYPE_STRING)
    t1 = ft.TensorDefinition(name_t1, [f1, f2])

    name_t2 = 'test-tensor-2'
    f3 = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_FLOAT)
    f4 = ft.FeatureSource('test-feature-2', ft.FEATURE_TYPE_STRING)
    f5 = ft.FeatureLabelBinary('test-feature-3', ft.FEATURE_TYPE_INT_8, f3)
    t2 = ft.TensorDefinition(name_t2, [f3, f4, f5])

    t3 = ft.TensorDefinitionMulti([t1, t2])
    self.assertIsInstance(t3, ft.TensorDefinitionMulti, f'Creation failed. Not correct type {type(t3)}')

    t4, t5 = t3.tensor_definitions
    self.assertEqual(t1, t4, f'First Tensor Def don not match {t1.name} {t4.name}')
    # Report t2 (not t1) here, since t2 is the definition being compared with t5.
    self.assertEqual(t2, t5, f'Second Tensor Def don not match {t2.name} {t5.name}')
    self.assertEqual(t3.label_tensor_definition, t2, 'That is not the tensor def with the label')
def test_highest_precision(self):
    # The float feature must be reported as the highest precision feature.
    # Once only the string source feature is left, asking for the highest
    # precision feature must raise a TensorDefinitionException.
    string_source = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    float_source = ft.FeatureSource('test-feature-4', ft.FEATURE_TYPE_FLOAT)
    index_feature = ft.FeatureIndex('test-feature-2', ft.FEATURE_TYPE_INT_8, string_source)
    tensor_def = ft.TensorDefinition('test-tensor', [string_source, float_source, index_feature])
    self.assertEqual(
        tensor_def.highest_precision_feature,
        float_source,
        f'Wrong HP feature {tensor_def.highest_precision_feature}',
    )

    for feature in (float_source, index_feature):
        tensor_def.remove(feature)
    with self.assertRaises(ft.TensorDefinitionException):
        tensor_def.highest_precision_feature
def test_filter(self):
    # Filter a numpy list on label value 0 and compare the result with the
    # same selection made directly on the DataFrame: same number of rows,
    # no label value 1 left, and identical amounts.
    file = FILES_DIR + 'engine_test_base_comma.csv'
    fa = ft.FeatureSource('Amount', ft.FEATURE_TYPE_FLOAT_32)
    ff = ft.FeatureSource('Fraud', ft.FEATURE_TYPE_FLOAT_32)
    fl = ft.FeatureLabelBinary('Fraud_Label', ft.FEATURE_TYPE_INT_8, ff)
    tb = ft.TensorDefinition('base-features', [fa, ff])
    td = ft.TensorDefinition('derived-features', [fa, fl])
    with en.EnginePandasNumpy() as e:
        df = e.from_csv(tb, file, inference=False)
        df = e.from_df(td, df, tb, inference=False)
        nl = e.to_numpy_list(td, df)

    # Select the expected rows once and reuse the selection for both checks.
    non_fraud = df[df['Fraud_Label'] == 0]
    rows = non_fraud.index
    amounts = non_fraud['Amount']

    r = nl.filter_label(td, 0)
    # `r` is the value under test, so it is reported as "Got"; the DataFrame
    # selection is the expectation.
    self.assertEqual(len(rows), len(r), f'Lengths do not match. Got {len(r)}. Expected {len(rows)}')
    self.assertNotIn(1, list(r.lists[1]), 'There should not have been "1"/Fraud entries entries')
    self.assertEqual(list(amounts), list(r.lists[0]), 'Amounts do not seem to be filtered')
def test_creation_base(self):
    # Build a NumpyListDataSet from the derived features and check its
    # length, the type and number of items returned by indexing, the
    # per-item shapes against the numpy list, and the default dtypes.
    csv_path = FILES_DIR + 'engine_test_base_comma.csv'
    base_td = ft.TensorDefinition('Base', self.s_features)
    derived_td = ft.TensorDefinition('Derived', self.d_features)
    with en.EnginePandasNumpy() as engine:
        base_df = engine.from_csv(base_td, csv_path, inference=False)
        derived_df = engine.from_df(derived_td, base_df, inference=False)
        np_list = engine.to_numpy_list(derived_td, derived_df)

    data_set = pt.NumpyListDataSet(derived_td, np_list)
    self.assertEqual(
        len(data_set), len(np_list), f'Length of DS is wrong. Got {len(data_set)}. Expected {len(np_list)}'
    )

    first_item = data_set[0]
    self.assertIsInstance(first_item, list, '__get_item__ should have returned a list')
    self.assertIsInstance(first_item[0], torch.Tensor, '__get_item__ should have returned a list of Tensors')
    self.assertEqual(
        len(first_item), len(derived_td.learning_categories), 'Number of list must be number of Learning cats'
    )

    # Test Shapes
    for array, tensor in zip(np_list.lists, data_set[0]):
        # Second dimension of the numpy array, or 0 if it has only one dimension.
        array_width = array.shape[1] if len(array.shape) > 1 else 0
        # First dimension of the item tensor, or 0 if the tensor has no dimensions.
        tensor_dims = list(tensor.shape)
        tensor_width = tensor_dims[0] if len(tensor_dims) != 0 else 0
        self.assertEqual(array_width, tensor_width)

    # Test data types.
    expected_dtypes = d373c7.pytorch.data._DTypeHelper.get_dtypes(derived_td)
    for idx, dtype in enumerate(expected_dtypes):
        self.assertEqual(
            data_set[0][idx].dtype, dtype, f'Default data types don not match {idx}, expected {dtype}'
        )
def test_creation(self):
    # Create a TensorDefinition from two source features and check its type,
    # name, feature list and inference readiness. Accessing `rank` on this
    # definition is expected to raise a TensorDefinitionException.
    name_t = 'test-tensor'
    f1 = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    f2 = ft.FeatureSource('test-feature-2', ft.FEATURE_TYPE_STRING)
    t = ft.TensorDefinition(name_t, [f1, f2])

    self.assertIsInstance(t, ft.TensorDefinition, 'TensorDefinition creation failed')
    # Report the actual name (t.name), not the expected one, after "Got".
    self.assertEqual(t.name, name_t, f'Tensor Definition name not correct. Got {t.name}')
    self.assertListEqual([f1, f2], t.features, f'Tensor def feature list incorrect {t.features}')
    self.assertEqual(
        t.inference_ready, True, 'Tensor should ready for inference, feature have no inf attributes'
    )
    with self.assertRaises(ft.TensorDefinitionException):
        _ = t.rank
def test_duplicate_bad(self):
    # Listing the same feature twice in a TensorDefinition must raise a
    # TensorDefinitionException.
    feature = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    duplicated = [feature, feature]
    with self.assertRaises(ft.TensorDefinitionException):
        ft.TensorDefinition('test-tensor', duplicated)