Example #1
def test_creation_non_float(self):
    name = 'scale'
    f_type_str = ft.FEATURE_TYPE_STRING
    f_type_flt = ft.FEATURE_TYPE_FLOAT
    sf_flt = ft.FeatureSource('Source', f_type_flt)
    sf_str = ft.FeatureSource('Source', f_type_str)
    with self.assertRaises(ft.FeatureDefinitionException):
        ft.FeatureNormalizeScale(name, f_type_str, sf_flt)
    with self.assertRaises(ft.FeatureDefinitionException):
        ft.FeatureNormalizeScale(name, f_type_flt, sf_str)
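
None of the snippets in this section show their import header. Below is a minimal sketch of what they appear to assume; the d373c7 submodule names behind the ft/en/pt aliases and the FILES_DIR value are assumptions inferred from the fully qualified d373c7.pytorch.data reference in Example #5, not taken from the source.

# Hypothetical import header for these snippets. The exact submodule names
# behind the ft/en/pt aliases and the FILES_DIR path are assumptions.
import unittest

import torch
import torch.utils.data as data     # 'data' alias as used for data.WeightedRandomSampler below

import d373c7.pytorch.data           # referenced directly in Example #5
import d373c7.features as ft         # assumed: FeatureSource, TensorDefinition, ...
import d373c7.engines as en          # assumed: EnginePandasNumpy
import d373c7.pytorch as pt          # assumed: ClassSampler, NumpyListDataSet, ...

FILES_DIR = './files/'               # assumed location of the test CSV files
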
Example #2
def test_filtering(self):
    name_t = 'test-tensor'
    f1 = ft.FeatureSource('test-feature-1', ft.FEATURE_TYPE_STRING)
    f2 = ft.FeatureIndex('test-feature-2', ft.FEATURE_TYPE_INT_8, f1)
    f3 = ft.FeatureSource('test-feature-3', ft.FEATURE_TYPE_STRING)
    f4 = ft.FeatureOneHot('test-feature-4', ft.FEATURE_TYPE_INT_8, f3)
    f5 = ft.FeatureSource('test-feature-5', ft.FEATURE_TYPE_FLOAT)
    f6 = ft.FeatureNormalizeScale('test-feature-6', ft.FEATURE_TYPE_FLOAT, f5)
    f7 = ft.FeatureNormalizeStandard('test-feature-7', ft.FEATURE_TYPE_FLOAT, f5)
    f8 = ft.FeatureLabelBinary('test-feature-8', ft.FEATURE_TYPE_INT_8, f2)
    t = ft.TensorDefinition(name_t, [f1, f2, f3, f4, f5, f6, f7, f8])
    self.assertEqual(
        len(t.learning_categories), 4,
        f'Should be 4 categories. Got {len(t.learning_categories)}')
    self.assertListEqual(t.categorical_features(), [f2])
    self.assertListEqual(t.binary_features(), [f4])
    self.assertListEqual(t.continuous_features(), [f5, f6, f7])
    self.assertListEqual(t.label_features(), [f8])
    # Should fail because the Tensor Definition is ready for inference.
    # Each call gets its own assertRaises block; in a single block only the
    # first call would actually be checked, since the rest would never run.
    with self.assertRaises(ft.TensorDefinitionException):
        t.categorical_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.binary_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.continuous_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.label_features(True)
    with self.assertRaises(ft.TensorDefinitionException):
        t.filter_features(ft.LEARNING_CATEGORY_CATEGORICAL, True)
Example #3
class TestClassSampler(unittest.TestCase):
    """Class Sampler test cases
    """
    fraud = ft.FeatureSource('Fraud', ft.FEATURE_TYPE_INT_8)
    s_features = [
        ft.FeatureSource('Amount', ft.FEATURE_TYPE_FLOAT),
        ft.FeatureSource('Card', ft.FEATURE_TYPE_STRING),
        ft.FeatureSource('MCC', ft.FEATURE_TYPE_CATEGORICAL),
        ft.FeatureSource('Country', ft.FEATURE_TYPE_CATEGORICAL),
        fraud
    ]
    d_features = [
        ft.FeatureNormalizeScale('Amount_Scale', ft.FEATURE_TYPE_FLOAT_32, s_features[0]),
        ft.FeatureOneHot('MCC_OH', s_features[2]),
        ft.FeatureIndex('Country_Index', ft.FEATURE_TYPE_INT_16, s_features[3]),
        ft.FeatureLabelBinary('Fraud', fraud)
    ]

    def test_creation_base(self):
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            cs = pt.ClassSampler(tdd, npl)
            self.assertIsInstance(cs, pt.ClassSampler, f'Was expecting ClassSampler type {type(cs)}')
            sm = cs.over_sampler(replacement=False)
            self.assertIsInstance(sm, data.WeightedRandomSampler, f'Was expecting Weighted Random Sampler {type(sm)}')
            self.assertEqual(len(sm), len(npl), f'Length not correct {len(sm)}')
            self.assertListEqual(sorted(list(sm)), list(range(len(npl))), f'Each index should be in the weight list')

    def test_creation_bad(self):
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            # Should fail because the wrong tensor definition is passed; it does not match the numpy list
            with self.assertRaises(pt.PyTorchTrainException):
                _ = pt.ClassSampler(tdb, npl)
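
The over_sampler in test_creation_base returns a plain torch.utils.data.WeightedRandomSampler, so it can presumably be combined with the NumpyListDataSet from Example #5 and a standard torch DataLoader. The snippets never show whether the library's own data_loader method accepts a sampler, so the sketch below falls back to torch; tdd and npl are assumed to be the TensorDefinition and numpy list produced above, and the batch size is arbitrary.

# Sketch only: plug the class-balancing sampler into a plain torch DataLoader
# built on the NumpyListDataSet shown in Example #5.
ds = pt.NumpyListDataSet(tdd, npl)
cs = pt.ClassSampler(tdd, npl)
sm = cs.over_sampler(replacement=False)
dl = data.DataLoader(ds, batch_size=32, sampler=sm)
for batch in dl:
    # each batch is a list of tensors, one per learning category
    pass
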
Example #4
def test_creation_base(self):
    name = 'scale'
    f_type = ft.FEATURE_TYPE_FLOAT
    sf = ft.FeatureSource('Source', ft.FEATURE_TYPE_FLOAT)
    scf = ft.FeatureNormalizeScale(name, f_type, sf)
    self.assertIsInstance(scf, ft.FeatureNormalizeScale,
                          f'Incorrect Type {type(scf)}')
    self.assertEqual(scf.name, name,
                     f'Scale feature name incorrect {name}')
    self.assertEqual(scf.type, f_type,
                     f'Scale feature type should have been {f_type}')
    self.assertEqual(scf.inference_ready, False,
                     f'Scale feature should NOT be inference ready')
    self.assertIsNone(scf.minimum, f'Scale minimum should be None')
    self.assertIsNone(scf.maximum, f'Scale maximum should be None')
    self.assertEqual(scf.learning_category,
                     ft.LEARNING_CATEGORY_CONTINUOUS,
                     f'Wrong Learning category')
    self.assertIsInstance(hash(scf), int, f'Hash function not working')
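
The minimum and maximum attributes checked above are presumably populated once an engine has processed real data, at which point the feature becomes inference ready. For reference, a minimal sketch of the min-max scaling that FeatureNormalizeScale presumably applies; this is an assumption about the library's semantics, not taken from the source.

# Illustration only (assumed behavior once minimum and maximum are learned):
# standard min-max scaling to the [0, 1] range.
def min_max_scale(x: float, minimum: float, maximum: float) -> float:
    return (x - minimum) / (maximum - minimum)

min_max_scale(50.0, 0.0, 200.0)  # -> 0.25
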
Example #5
class TestNumpyDataSet(unittest.TestCase):
    """Numpy Dataset test cases
    """
    fraud = ft.FeatureSource('Fraud', ft.FEATURE_TYPE_INT_8)
    s_features = [
        ft.FeatureSource('Amount', ft.FEATURE_TYPE_FLOAT),
        ft.FeatureSource('Card', ft.FEATURE_TYPE_STRING),
        ft.FeatureSource('MCC', ft.FEATURE_TYPE_CATEGORICAL),
        ft.FeatureSource('Country', ft.FEATURE_TYPE_CATEGORICAL),
        fraud
    ]
    d_features = [
        ft.FeatureNormalizeScale('Amount_Scale', ft.FEATURE_TYPE_FLOAT_32, s_features[0]),
        ft.FeatureOneHot('MCC_OH', s_features[2]),
        ft.FeatureIndex('Country_Index', ft.FEATURE_TYPE_INT_16, s_features[3]),
        ft.FeatureLabelBinary('Fraud', fraud)
    ]

    def test_creation_base(self):
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            ds = pt.NumpyListDataSet(tdd, npl)
            self.assertEqual(len(ds), len(npl), f'Length of DS is wrong. Got {len(ds)}. Expected {len(npl)}')
            t = ds[0]
            self.assertIsInstance(t, list, f'__getitem__ should have returned a list')
            self.assertIsInstance(t[0], torch.Tensor, f'__getitem__ should have returned a list of Tensors')
            self.assertEqual(len(t), len(tdd.learning_categories), f'Number of lists must equal the number of Learning categories')
            # Test Shapes
            for n, t in zip(npl.lists, ds[0]):
                ns = n.shape[1] if len(n.shape) > 1 else 0
                ts = 0 if len(list(t.shape)) == 0 else list(t.shape)[0]
                self.assertEqual(ns, ts)
            # Test data types.
            for i, d in enumerate(d373c7.pytorch.data._DTypeHelper.get_dtypes(tdd)):
                self.assertEqual(ds[0][i].dtype, d, f'Default data types do not match {i}, expected {d}')

    def test_creation_bad(self):
        file = FILES_DIR + 'engine_test_base_comma.csv'
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            # Try building off of the wrong tensor definition
            with self.assertRaises(pt.PyTorchTrainException):
                _ = pt.NumpyListDataSet(tdb, npl)

    def test_creation_data_loader(self):
        file = FILES_DIR + 'engine_test_base_comma.csv'
        bs = 3
        tdb = ft.TensorDefinition('Base', self.s_features)
        tdd = ft.TensorDefinition('Derived', self.d_features)
        with en.EnginePandasNumpy() as e:
            df = e.from_csv(tdb, file, inference=False)
            df = e.from_df(tdd, df, inference=False)
            npl = e.to_numpy_list(tdd, df)
            ds = pt.NumpyListDataSet(tdd, npl)
            dl = ds.data_loader(torch.device('cpu'), bs)
            t = next(iter(dl))
            self.assertEqual(len(t), len(tdd.learning_categories))
            # Test data types.
            for i, d in enumerate(d373c7.pytorch.data._DTypeHelper.get_dtypes(tdd)):
                self.assertEqual(t[i].dtype, d, f'Default data types do not match {i}, expected {d}')
            # Check batch-size
            for i, te in enumerate(t):
                self.assertEqual(te.shape[0], bs, f'Batch size does not match item {i}. Got {te.shape[0]}')
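
All of the classes above follow the standard unittest layout, so the usual module footer (not shown in these snippets) is enough to run them:

if __name__ == '__main__':
    unittest.main()
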