Exemplo n.º 1
0
 def setUp(self):
     """Create the formatter under test from a three-row frame of strings."""
     # Columns are added one at a time, in the same order as before, so the
     # resulting frame has an identical column layout.
     columns = {}
     columns['label'] = ['1', '-1', '1']
     columns['client_id'] = ['1234', '', '5678']
     columns['visited_domains'] = ['43534 345345 345345', '345435', '34532']
     columns['position_id'] = ['4567', '324', '']
     columns['client_id;position_id'] = ['1234;4567', '', '']
     columns['numerical_feature'] = ['4.2', '', '']

     frame = DataFrame(data=columns)
     self.formatter = VW_Formatter(data_frame=frame)
 def setUp(self):
     """Prepare ``self.formatter`` wrapping a small all-string DataFrame."""
     # Each column has exactly three string cells; several cells are empty.
     raw_columns = {
         "label": ["1", "-1", "1"],
         "client_id": ["1234", "", "5678"],
         "visited_domains": ["43534 345345 345345", "345435", "34532"],
         "position_id": ["4567", "324", ""],
         "client_id;position_id": ["1234;4567", "", ""],
         "numerical_feature": ["4.2", "", ""],
     }
     source_frame = DataFrame(data=raw_columns)
     self.formatter = VW_Formatter(data_frame=source_frame)
class TestFormatter(unittest.TestCase):
    """Unit tests for ``VW_Formatter`` built on a three-row, all-string DataFrame."""

    def setUp(self):
        """Build a fresh formatter before every test.

        Each column holds three string cells; several cells are empty strings.
        """
        df = DataFrame(
            data={
                "label": ["1", "-1", "1"],
                "client_id": ["1234", "", "5678"],
                "visited_domains": ["43534 345345 345345", "345435", "34532"],
                "position_id": ["4567", "324", ""],
                "client_id;position_id": ["1234;4567", "", ""],
                "numerical_feature": ["4.2", "", ""],
            }
        )
        self.formatter = VW_Formatter(data_frame=df)

    def test_check_feature_type(self):
        """``get_feature_type`` reports the expected kind for every fixture column."""
        expected_types = (
            ("label", "categorical"),
            ("visited_domains", "categorical"),
            ("position_id", "categorical"),
            ("client_id", "categorical"),
            ("client_id;position_id", "categorical"),
            ("numerical_feature", "numerical"),
        )
        for column, expected in expected_types:
            # assertEqual replaces the deprecated assertEquals alias, which
            # no longer exists in recent Python versions. The column name is
            # passed as the message so a failure identifies the column.
            self.assertEqual(self.formatter.get_feature_type(column), expected, column)

    def test_can_vw_format(self):
        """``can_vw_format`` rejects unknown features and a frame without ``label``."""
        self.assertTrue(self.formatter.can_vw_format(["client_id", "client_id;position_id", "numerical_feature"]))
        self.assertFalse(self.formatter.can_vw_format(["my_random_id", "client_id;position_id", "numerical_feature"]))
        # Dropping the label column must make the same, previously valid,
        # feature list unformattable.
        del self.formatter.data_frame["label"]
        self.assertFalse(self.formatter.can_vw_format(["client_id", "client_id;position_id", "numerical_feature"]))

    def test_conversion_to_vw_format_when_cannot_vw_format(self):
        """``to_vw_format`` raises ``ValueError`` for a feature missing from the frame."""
        # unittest's own assertRaises keeps the check independent of any
        # third-party test-runner decorator.
        with self.assertRaises(ValueError):
            self.formatter.to_vw_format(["my_random_id", "client_id;position_id", "numerical_feature"])

    def test_conversion_to_vw_format(self):
        """``to_vw_format`` appends one namespace per requested feature, in order.

        The expectation is built cumulatively: each step requests one more
        feature and extends every expected row with that feature's fragment.
        """
        steps = (
            ("client_id", ("1 |client_id 1234 ", "-1 |client_id ", "1 |client_id 5678 ")),
            (
                "visited_domains",
                ("|visited_domains 43534 345345 345345 ", "|visited_domains 345435 ", "|visited_domains 34532 "),
            ),
            ("position_id", ("|position_id 4567 ", "|position_id 324 ", "|position_id ")),
            (
                "client_id;position_id",
                ("|client_id;position_id 1234;4567 ", "|client_id;position_id ", "|client_id;position_id "),
            ),
            (
                "numerical_feature",
                ("|numerical_feature numerical_feature_value:4.2 ", "|numerical_feature ", "|numerical_feature "),
            ),
        )

        features = []
        expected_rows = ["", "", ""]
        for feature, fragments in steps:
            features.append(feature)
            expected_rows = [row + fragment for row, fragment in zip(expected_rows, fragments)]
            # Pass a copy so the formatter never sees later mutations of the
            # accumulating feature list.
            assert_series_equal(self.formatter.to_vw_format(list(features)), Series(data=expected_rows))
Exemplo n.º 4
0
class TestFormatter(unittest.TestCase):
    """Tests for ``VW_Formatter`` using a small DataFrame of string cells."""

    def setUp(self):
        """Create a new formatter over the three-row fixture for each test."""
        df = DataFrame(
            data={
                'label': ['1', '-1', '1'],
                'client_id': ['1234', '', '5678'],
                'visited_domains': ['43534 345345 345345', '345435', '34532'],
                'position_id': ['4567', '324', ''],
                'client_id;position_id': ['1234;4567', '', ''],
                'numerical_feature': ['4.2', '', '']
            })
        self.formatter = VW_Formatter(data_frame=df)

    def _check_vw_rows(self, features, expected_rows):
        """Assert that formatting ``features`` yields ``expected_rows``."""
        assert_series_equal(
            self.formatter.to_vw_format(features),
            Series(data=expected_rows))

    def test_check_feature_type(self):
        """Every fixture column is classified as the expected feature type."""
        # assertEqual is used throughout: the assertEquals alias is
        # deprecated and absent from recent Python versions.
        self.assertEqual(self.formatter.get_feature_type('label'),
                         'categorical')
        self.assertEqual(self.formatter.get_feature_type('visited_domains'),
                         'categorical')
        self.assertEqual(self.formatter.get_feature_type('position_id'),
                         'categorical')
        self.assertEqual(self.formatter.get_feature_type('client_id'),
                         'categorical')
        self.assertEqual(
            self.formatter.get_feature_type('client_id;position_id'),
            'categorical')
        self.assertEqual(self.formatter.get_feature_type('numerical_feature'),
                         'numerical')

    def test_can_vw_format(self):
        """Formatting is possible only for known features with a label."""
        known = ['client_id', 'client_id;position_id', 'numerical_feature']
        self.assertTrue(self.formatter.can_vw_format(list(known)))
        self.assertFalse(
            self.formatter.can_vw_format(
                ['my_random_id', 'client_id;position_id',
                 'numerical_feature']))
        # Without the label column the same feature list must be rejected.
        del self.formatter.data_frame['label']
        self.assertFalse(self.formatter.can_vw_format(list(known)))

    def test_conversion_to_vw_format_when_cannot_vw_format(self):
        """Requesting a column absent from the frame raises ``ValueError``."""
        # The built-in assertRaises avoids relying on a third-party
        # decorator to express the expected exception.
        with self.assertRaises(ValueError):
            self.formatter.to_vw_format(
                ['my_random_id', 'client_id;position_id',
                 'numerical_feature'])

    def test_conversion_to_vw_format(self):
        """Each requested feature adds one namespace fragment per row.

        The expected rows grow step by step, mirroring the growing list of
        requested features.
        """
        rows = ['1 |client_id 1234 ', '-1 |client_id ', '1 |client_id 5678 ']
        self._check_vw_rows(['client_id'], rows)

        rows = [
            rows[0] + '|visited_domains 43534 345345 345345 ',
            rows[1] + '|visited_domains 345435 ',
            rows[2] + '|visited_domains 34532 ',
        ]
        self._check_vw_rows(['client_id', 'visited_domains'], rows)

        rows = [
            rows[0] + '|position_id 4567 ',
            rows[1] + '|position_id 324 ',
            rows[2] + '|position_id ',
        ]
        self._check_vw_rows(
            ['client_id', 'visited_domains', 'position_id'], rows)

        rows = [
            rows[0] + '|client_id;position_id 1234;4567 ',
            rows[1] + '|client_id;position_id ',
            rows[2] + '|client_id;position_id ',
        ]
        self._check_vw_rows(
            ['client_id', 'visited_domains', 'position_id',
             'client_id;position_id'], rows)

        rows = [
            rows[0] + '|numerical_feature numerical_feature_value:4.2 ',
            rows[1] + '|numerical_feature ',
            rows[2] + '|numerical_feature ',
        ]
        self._check_vw_rows(
            ['client_id', 'visited_domains', 'position_id',
             'client_id;position_id', 'numerical_feature'], rows)