Exemplo n.º 1
0
    def test_encode_and_no_categorical(self):
        """Run every preprocessing step, including target encoding.

        Loads ``data_encode.csv``, builds a ``Preprocessor`` without the
        ``categorical_features`` argument, executes it with all step flags
        enabled, and compares the first eight rows of the result (rounded
        to three decimals) against hard-coded expected values.
        """
        feature_names = ['feat1', 'feat2', 'feat3']
        source_frame = dd.read_csv('data_encode.csv')
        # NOTE(review): the last positional argument is presumably the list
        # of target labels -- confirm against the Preprocessor signature.
        preprocessor = Preprocessor(feature_names, 'target', source_frame,
                                    ['o', 'p', 'n'])
        preprocessor.execute(duplicates_invalid=True, missing=True,
                             scale=True, transform=True,
                             encode_target=True, train=True)

        # Index labels expected in the first eight rows; 4 and 5 are absent
        # (presumably dropped by the duplicates/invalid step -- confirm).
        row_labels = (0, 1, 2, 3, 6, 7, 8, 9)
        column_values = {
            'target': (2, 2, 0, 0, 1, 2, 0, 2),
            'feat1': (-0.928, -0.093, -0.928, -0.928,
                      0.743, -0.093, -0.093, -0.093),
            'feat2': (-0.844, 0.998, -0.844, -0.844,
                      -0.23, -0.844, 0.384, 0.0),
            'feat3': (-0.548, 0.0, 0.0, -0.548,
                      2.739, -0.548, 0.0, -0.548),
        }
        expected_output_dict = {
            column: dict(zip(row_labels, values))
            for column, values in column_values.items()
        }

        actual_output_dict = preprocessor.df.round(3).head(8).to_dict()
        self.assertEqual(expected_output_dict, actual_output_dict)
Exemplo n.º 2
0
    def test_remove_invalid_duplicates(self):
        """Run only the ``duplicates_invalid`` step and check surviving rows.

        Loads ``data_duplicate.csv``, executes the ``Preprocessor`` with
        every other step flag disabled (``train`` stays enabled), then takes
        the first eight rows, drops rows containing missing values, and
        compares what remains against hard-coded expected values.
        """
        source_frame = dd.read_csv('data_duplicate.csv')
        preprocessor = Preprocessor(['feat1', 'feat2', 'feat3'], 'target',
                                    source_frame, ['0', '1'],
                                    categorical_features=['feat4'])

        step_flags = {
            'duplicates_invalid': True,
            'missing': False,
            'scale': False,
            'transform': False,
            'encode_target': False,
            'train': True,
        }
        preprocessor.execute(**step_flags)

        # Index labels of the rows expected to remain after head + dropna.
        row_labels = (0, 6, 7, 10)
        column_values = (
            ('target', ('0', '1', '0', '0')),
            ('feat1', (1, 3, 2, 5)),
            ('feat2', (2.0, 3.0, 2.0, 7.0)),
            ('feat3', (3.0, 4.0, 3.0, 3.0)),
            ('feat4', ('a', 'a', 'b', 'a')),
        )
        expected_output_dict = {}
        for column, values in column_values:
            expected_output_dict[column] = dict(zip(row_labels, values))

        first_rows = preprocessor.df.head(8)
        actual_output_dict = first_rows.dropna().to_dict()
        self.assertEqual(expected_output_dict, actual_output_dict)
Exemplo n.º 3
0
    def test_transform_date(self):
        """Run every step except target encoding on ``data_duplicate.csv``.

        Executes the ``Preprocessor`` with ``feat4`` declared as a
        categorical feature and compares the first eight rows of the result
        (rounded to three decimals) against hard-coded expected values.

        NOTE(review): despite the method name, no date column is visible in
        the inputs or expected output here -- confirm the intended scope.
        """
        numeric_features = ['feat1', 'feat2', 'feat3']
        source_frame = dd.read_csv('data_duplicate.csv')
        preprocessor = Preprocessor(numeric_features, 'target', source_frame,
                                    ['0', '1'],
                                    categorical_features=['feat4'])
        preprocessor.execute(duplicates_invalid=True, missing=True,
                             scale=True, transform=True,
                             encode_target=False, train=True)

        # Index labels expected in the first eight rows of the result.
        row_labels = (0, 1, 2, 6, 7, 8, 9, 10)
        column_values = {
            # The target stays as the raw string labels (encode_target=False).
            'target': ('0', '1', '0', '1', '0', '1', '1', '0'),
            'feat1': (-1.043, -0.209, -1.043, 0.626,
                      -0.209, -0.209, -0.209, 2.294),
            'feat2': (-0.954, 0.867, -0.954, -0.347,
                      -0.954, 0.26, 0.0, 2.081),
            'feat3': (-0.632, 0.0, 0.0, 2.53,
                      -0.632, 0.0, -0.632, -0.632),
            'feat4': ('a', 'Other', 'b', 'a', 'b', 'c', 'c', 'a'),
        }
        expected_output_dict = {
            column: dict(zip(row_labels, values))
            for column, values in column_values.items()
        }

        rounded_head = preprocessor.df.round(3).head(8)
        self.assertEqual(expected_output_dict, rounded_head.to_dict())