示例#1
0
 def test_text_output(self):
     """Check the plain-text classification report for one phone detector.

     The filth list mixes detector/known pairs with known-only entries and
     the report is requested with ``output_dict=False``; the stripped text
     must match the expected table exactly.
     """
     filths = [
         MergedFilth(
             PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
             KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
         MergedFilth(
             PhoneFilth(beg=5, end=9, text='1234', detector_name='phone'),
             KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
     ]
     text = scrubadub.comparison.get_filth_classification_report(
         filths,
         output_dict=False,
     ).strip()
     # Echo the report so it is visible when debugging a mismatch.
     print(text)

     # The adjacent literals form a single string, so strip() trims the
     # whole table (in practice only the trailing newline).
     expected = (
         "                 precision    recall  f1-score   support\n"
         "\n"
         "phone     phone       1.00      0.50      0.67         4\n"
         "\n"
         "      micro avg       1.00      0.50      0.67         4\n"
         "      macro avg       1.00      0.50      0.67         4\n"
         "   weighted avg       1.00      0.50      0.67         4\n"
     ).strip()

     # assertEqual replaces the deprecated assertEquals alias, which no
     # longer exists in Python 3.12+.
     self.assertEqual(text, expected)
示例#2
0
    def test_filth_type_equality(self):
        """FilthTypePositions compare equal with the same filths and unequal otherwise."""
        common = dict(text='1234', locale='en_GB', document_name='test.txt')
        filth_a = PhoneFilth(beg=0, end=4, detector_name='phone_a', **common)
        filth_b = PhoneFilth(beg=2, end=6, detector_name='phone_b', **common)
        filth_c = PhoneFilth(beg=10, end=14, detector_name='phone_b', **common)

        def build(*members):
            # Fresh container filled with the given filths, in the given order.
            container = FilthTypePositions(grouping_function=FilthGrouper.grouping_default, filth_type='phone')
            for member in members:
                container.add_filth(member)
            return container

        reference = build(filth_c, filth_a, filth_b)

        # Identical contents -> equal.
        same = build(filth_c, filth_a, filth_b)
        self.assertTrue(reference == same)

        # One filth missing -> not equal.
        shorter = build(filth_c, filth_a)
        self.assertTrue(reference != shorter)
示例#3
0
    def test_comparison(self):
        """Dict-form classification report for a single phone detector."""

        def tagged(beg, end, text):
            return TaggedEvaluationFilth(beg=beg, end=end, text=text, comparison_type='phone')

        def matched(beg, end):
            # A detector hit merged with the tagged filth covering the same span.
            return MergedFilth(
                PhoneFilth(beg=beg, end=end, text='1234', detector_name='phone'),
                tagged(beg, end, '1234'),
            )

        filths = [
            matched(0, 4),
            tagged(5, 10, '12345'),
            matched(12, 16),
            tagged(20, 25, '12345'),
        ]

        # Every row of the expected report carries the same four scores.
        scores = {
            'f1-score': 0.6666666666666666,
            'precision': 1.0,
            'recall': 0.5,
            'support': 4,
        }
        row_labels = ('macro avg', 'micro avg', 'phone:phone:None', 'weighted avg')
        expected = {label: dict(scores) for label in row_labels}

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )
        self.assertEqual(expected, report)
示例#4
0
    def test_false_positive(self):
        """Report for a list that contains a detector hit with no tagged counterpart."""

        def tagged(beg, end, text):
            return TaggedEvaluationFilth(beg=beg, end=end, text=text, comparison_type='phone')

        def detected(beg, end):
            return PhoneFilth(beg=beg, end=end, text='1234', detector_name='phone_v1')

        filths = [
            detected(0, 4),
            tagged(5, 10, '12345'),
            MergedFilth(detected(12, 16), tagged(12, 16, '1234')),
            tagged(20, 25, '12345'),
        ]

        def row(f1_score):
            return {
                'precision': 0.5,
                'recall': 0.3333333333333333,
                'f1-score': f1_score,
                'support': 3,
            }

        expected = {
            'phone:phone_v1:None': row(0.4),
            'micro avg': row(0.4),
            'macro avg': row(0.4),
            # The weighted average is expected with a slightly different float value.
            'weighted avg': row(0.4000000000000001),
        }

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )
        self.assertEqual(expected, report)
示例#5
0
    def test_text_position_merge_ranges(self):
        """Merging TextPositions for the spans 0-4 and 10-14 must raise ValueError."""
        span_specs = ((0, 4, 'phone_a'), (10, 14, 'phone_b'))
        filths = [
            PhoneFilth(
                beg=beg, end=end, text='1234', detector_name=detector, locale='en_GB', document_name='test.txt'
            )
            for beg, end, detector in span_specs
        ]
        left, right = [TextPosition(filth, FilthGrouper.grouping_default) for filth in filths]

        self.assertRaises(ValueError, left.merge, right)
示例#6
0
    def test_filth_grouper(self):
        """Exercise FilthGrouper: grouping function choice, merging and count table."""

        def tagged(beg, end, text, kind, locale):
            return TaggedEvaluationFilth(beg=beg, end=end, text=text, comparison_type=kind, locale=locale)

        def matched_phone(beg, end, locale):
            return MergedFilth(
                PhoneFilth(beg=beg, end=end, text='1234', detector_name='phone', locale=locale),
                tagged(beg, end, '1234', 'phone', locale),
            )

        filths = [
            matched_phone(0, 4, 'en_GB'),
            tagged(5, 10, '12345', 'phone', 'en_GB'),
            matched_phone(12, 16, 'en_US'),
            tagged(20, 25, '12345', 'phone', 'en_US'),
            tagged(30, 35, '12345', 'name', 'en_US'),
        ]

        # The combine_detectors flag selects the grouping function.
        fg = FilthGrouper(combine_detectors=True, groupby_documents=False, filth_types=None)
        self.assertEqual(fg.grouping_function, FilthGrouper.grouping_combined)
        fg = FilthGrouper(combine_detectors=False, groupby_documents=False, filth_types=None)
        self.assertEqual(fg.grouping_function, FilthGrouper.grouping_default)

        fg.add_filths(filths)
        print(fg)
        self.assertEqual(['phone', 'name'], list(fg.types.keys()))
        for kind, count in (('name', 1), ('phone', 6)):
            self.assertEqual(count, len(fg.types[kind].positions))

        fg.merge_positions()
        for kind, count in (('name', 1), ('phone', 4)):
            self.assertEqual(count, len(fg.types[kind].positions))

        fg_from_list = FilthGrouper.from_filth_list(filths)
        self.assertEqual(list(fg.types.keys()), list(fg_from_list.types.keys()))

        df = fg.get_counts()
        print(df)
        self.assertEqual(['filth', 'detector', 'locale'], df.columns.names)

        # Expected columns, in order, each with its expected per-row counts.
        expected_counts = [
            (('name', 'tagged', 'en_US'), [0, 0, 0, 0, 1]),
            (('phone', 'phone', 'en_GB'), [1, 0, 0, 0, 0]),
            (('phone', 'phone', 'en_US'), [0, 0, 1, 0, 0]),
            (('phone', 'tagged', 'en_GB'), [1, 1, 0, 0, 0]),
            (('phone', 'tagged', 'en_US'), [0, 0, 1, 1, 0]),
        ]
        self.assertEqual(
            [column for column, _ in expected_counts],
            df.columns.values.tolist(),
        )
        for column, counts in expected_counts:
            self.assertEqual(counts, df[column].values.tolist())
示例#7
0
 def test_text_position_repr(self):
     """Check the repr() text of a TextPosition built from one detected PhoneFilth."""
     filth = PhoneFilth(beg=0, end=4, text='1234', detector_name='phone_a', locale='en_GB', document_name='test.txt')
     tp = TextPosition(filth, FilthGrouper.grouping_default)
     # Use the builtin repr() rather than calling the dunder method directly.
     self.assertEqual(
         "<TextPosition beg=0 end=4 tagged=set() detected={('phone', 'phone_a', 'en_GB')} document_name='test.txt'>",
         repr(tp),
     )
示例#8
0
    def test_text_position_function(self):
        """A custom grouping function's returned values end up as the detected tuple."""

        def fixed_grouping(x):
            # Ignores the filth and always returns the same mapping.
            return {1: 1, 2: 2, 3: 3}

        phone = PhoneFilth(beg=0, end=4, text='1234', detector_name='phone', locale='en_GB', document_name='test.txt')
        position = TextPosition(phone, fixed_grouping)

        expected_detected = {(1, 2, 3)}
        self.assertEqual(expected_detected, position.detected)
示例#9
0
    def test_filth_type(self):
        """FilthTypePositions: insertion order, merging of positions and the count table."""
        common = dict(text='1234', locale='en_GB', document_name='test.txt')
        filth_a = PhoneFilth(beg=0, end=4, detector_name='phone_a', **common)
        filth_b = PhoneFilth(beg=2, end=6, detector_name='phone_b', **common)
        filth_c = PhoneFilth(beg=10, end=14, detector_name='phone_b', **common)

        ft = FilthTypePositions(grouping_function=FilthGrouper.grouping_default, filth_type='phone')
        for filth in (filth_c, filth_a, filth_b):
            ft.add_filth(filth)

        # Before merging, positions are expected in insertion order.
        self.assertEqual(3, len(ft.positions))
        for index, beg in enumerate((10, 0, 2)):
            self.assertEqual(beg, ft.positions[index].beg)
        self.assertEqual(['filth', 'detector', 'locale'], ft.column_names)

        ft.merge_positions()
        self.assertEqual(2, len(ft.positions))
        first, second = ft.positions[0], ft.positions[1]
        self.assertEqual(10, second.beg)
        self.assertEqual(0, first.beg)
        self.assertEqual(6, first.end)

        both_detectors = {
            ('phone', 'phone_a', 'en_GB'),
            ('phone', 'phone_b', 'en_GB'),
        }
        self.assertEqual(both_detectors, first.detected)

        df = ft.get_counts()
        self.assertEqual(['filth', 'detector', 'locale'], df.columns.names)
        self.assertEqual(
            {
                ('phone', 'phone_b', 'en_GB'),
                ('phone', 'phone_a', 'en_GB'),
            },
            set(df.columns.values.tolist()),
        )
        expected_counts = (
            (('phone', 'phone_a', 'en_GB'), [1, 0]),
            (('phone', 'phone_b', 'en_GB'), [1, 1]),
        )
        for column, counts in expected_counts:
            self.assertEqual(counts, df[column].values.tolist())
示例#10
0
    def test_filth_type_touching(self):
        """Spans 0-4, 2-6 and 6-10 are expected to leave two positions after merging."""
        span_specs = {
            'a': (0, 4, 'phone_a'),
            'b': (2, 6, 'phone_b'),
            'c': (6, 10, 'phone_b'),
        }
        filths = {
            label: PhoneFilth(
                beg=beg, end=end, text='1234', detector_name=detector, locale='en_GB', document_name='test.txt'
            )
            for label, (beg, end, detector) in span_specs.items()
        }

        ft = FilthTypePositions(grouping_function=FilthGrouper.grouping_default, filth_type='phone')
        # Added in the order c, a, b.
        for label in ('c', 'a', 'b'):
            ft.add_filth(filths[label])
        ft.merge_positions()

        remaining = len(ft.positions)
        self.assertEqual(2, remaining)
示例#11
0
 def test_dataframe(self):
     """Check the dataframe returned by get_filth_dataframe column by column.

     Missing values are replaced with the string ``'none'`` before each
     column is compared against its expected list of four values.
     """
     filths = [
         MergedFilth(
             PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
             KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
         MergedFilth(
             PhoneFilth(beg=4, end=9, text=' 1234', detector_name='phone'),
             KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
     ]
     dataframe = scrubadub.comparison.get_filth_dataframe(filths, )

     # assertEqual replaces the deprecated assertEquals alias, which no
     # longer exists in Python 3.12+.
     self.assertEqual(dataframe.shape[0], 4)

     # (column name, expected values); checked in this order.
     expected_columns = [
         ('filth_type', ['phone', 'phone', 'none', 'none']),
         ('beg', [0, 4, 'none', 'none']),
         ('end', [4, 9, 'none', 'none']),
         ('known_beg', [0, 5, 5, 15]),
         ('known_end', [4, 9, 10, 20]),
         ('exact_match', [True, False, False, False]),
         ('partial_match', [True, True, False, False]),
         ('true_positive', [True, True, False, False]),
         ('false_positive', [False, False, False, False]),
         ('false_negative', [False, False, True, True]),
     ]
     for column, expected in expected_columns:
         self.assertEqual(
             dataframe[column].fillna('none').values.tolist(),
             expected,
         )
示例#12
0
    def test_text_equality(self):
        """TextPosition equality: same filth is equal, changing any one field is not."""
        base = dict(beg=0, end=4, text='1234', detector_name='phone', locale='en_GB', document_name='test.txt')

        original = PhoneFilth(**base)
        tp = TextPosition(original, FilthGrouper.grouping_default)
        twin = TextPosition(original, FilthGrouper.grouping_default)
        self.assertTrue(tp == twin)

        # Each variation overrides part of the base filth; all must compare unequal.
        variations = (
            {'locale': 'fr_FR'},
            {'detector_name': 'phone2'},
            {'end': 5, 'text': '12345'},
            {'document_name': 'test2.txt'},
        )
        for override in variations:
            altered = PhoneFilth(**dict(base, **override))
            other = TextPosition(altered, FilthGrouper.grouping_default)
            self.assertTrue(tp != other)
示例#13
0
    def test_text_position(self):
        """A TextPosition mirrors its filth's span and document and records the detection."""
        phone = PhoneFilth(beg=0, end=4, text='1234', detector_name='phone', locale='en_GB', document_name='test.txt')
        position = TextPosition(phone, FilthGrouper.grouping_default)

        # Span boundaries are taken from the filth.
        for attribute in ('beg', 'end'):
            self.assertEqual(getattr(phone, attribute), getattr(position, attribute))

        expected_detected = {('phone', 'phone', 'en_GB')}
        self.assertEqual(expected_detected, position.detected)
        self.assertEqual(set(), position.tagged)
        self.assertEqual(phone.document_name, position.document_name)
示例#14
0
    def test_with_irrelevant_filth(self):
        """Report when the list also holds a 'name' tag and an untagged custom filth."""

        class TempFilth(Filth):
            type = 'temp'

        matched_phone = MergedFilth(
            PhoneFilth(beg=0, end=4, text='John', detector_name='phone'),
            TaggedEvaluationFilth(beg=0, end=4, text='John', comparison_type='phone'),
        )
        tagged_name = TaggedEvaluationFilth(beg=5, end=10, text='Hello', comparison_type='name')
        untagged_temp = TempFilth(beg=100, end=103, text='123', detector_name='temp')
        filths = [matched_phone, tagged_name, untagged_temp]

        # Only the phone row and the averages are expected, all with perfect scores.
        perfect = {
            'precision': 1.0,
            'recall': 1.0,
            'f1-score': 1.0,
            'support': 1,
        }
        expected = {
            label: dict(perfect)
            for label in ('phone:phone:None', 'micro avg', 'macro avg', 'weighted avg')
        }

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )
        self.assertEqual(expected, report)
示例#15
0
    def test_text_position_merge(self):
        """Merging spans 0-4 and 3-6 yields 0-6 with both detectors recorded."""
        common = dict(text='1234', locale='en_GB', document_name='test.txt')
        filth_a = PhoneFilth(beg=0, end=4, detector_name='phone_a', **common)
        filth_b = PhoneFilth(beg=3, end=6, detector_name='phone_b', **common)

        target = TextPosition(filth_a, FilthGrouper.grouping_default)
        incoming = TextPosition(filth_b, FilthGrouper.grouping_default)
        target.merge(incoming)

        # merge() updates the position it is called on.
        self.assertEqual(0, target.beg)
        self.assertEqual(6, target.end)

        both_detectors = {
            ('phone', 'phone_a', 'en_GB'),
            ('phone', 'phone_b', 'en_GB'),
        }
        self.assertEqual(both_detectors, target.detected)
        self.assertEqual(set(), target.tagged)
示例#16
0
    def test_other_predefined_types(self):
        """Report when the list also holds a tagged filth of comparison type 'word'."""
        matched_phone = MergedFilth(
            PhoneFilth(beg=0, end=4, text='John', detector_name='phone'),
            TaggedEvaluationFilth(beg=0, end=4, text='John', comparison_type='phone'),
        )
        tagged_word = TaggedEvaluationFilth(beg=5, end=10, text='Hello', comparison_type='word')
        filths = [matched_phone, tagged_word]

        # Only the phone row and the averages are expected, all with perfect scores.
        perfect = {
            'precision': 1.0,
            'recall': 1.0,
            'f1-score': 1.0,
            'support': 1,
        }
        expected = {
            label: dict(perfect)
            for label in ('phone:phone:None', 'micro avg', 'macro avg', 'weighted avg')
        }

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )
        self.assertEqual(expected, report)
示例#17
0
    def test_filth_grouper_equality(self):
        """FilthGrouper equality plus per-document counts with combined detectors."""

        def tagged(beg, end, text, kind, locale, document):
            return TaggedEvaluationFilth(beg=beg, end=end, text=text, comparison_type=kind, locale=locale,
                                         document_name=document)

        def matched_phone(beg, end, locale, document):
            return MergedFilth(
                PhoneFilth(beg=beg, end=end, text='1234', detector_name='phone', locale=locale,
                           document_name=document),
                tagged(beg, end, '1234', 'phone', locale, document),
            )

        filths = [
            matched_phone(0, 4, 'en_GB', 'gb.txt'),
            tagged(5, 10, '12345', 'phone', 'en_GB', 'gb.txt'),
            matched_phone(12, 16, 'en_US', 'us.txt'),
            tagged(20, 25, '12345', 'phone', 'en_US', 'us.txt'),
            tagged(30, 35, '12345', 'name', 'en_US', 'us.txt'),
        ]

        def combined_grouper(members):
            # Phone-only grouper with combined detectors, grouped by document.
            grouper = FilthGrouper(combine_detectors=True, groupby_documents=True, filth_types=['phone'])
            grouper.add_filths(members)
            return grouper

        fg = combined_grouper(filths)

        # Same settings and same filths -> equal.
        self.assertTrue(fg == combined_grouper(filths))

        # Same settings, first filth dropped -> not equal.
        self.assertTrue(fg != combined_grouper(filths[1:]))

        # Different grouping function and first filth dropped -> not equal.
        default_grouper = FilthGrouper(grouping_function=FilthGrouper.grouping_default, filth_types=['phone'])
        default_grouper.add_filths(filths[1:])
        self.assertTrue(fg != default_grouper)

        self.assertEqual(['phone'], list(fg.types.keys()))
        self.assertEqual(6, len(fg.types['phone'].positions))

        fg.merge_positions()
        self.assertEqual(4, len(fg.types['phone'].positions))

        fg_from_list = FilthGrouper.from_filth_list(filths, filth_types=['phone'], combine_detectors=True,
                                                    groupby_documents=True)
        self.assertEqual(list(fg.types.keys()), list(fg_from_list.types.keys()))

        df = fg.get_counts()
        self.assertEqual(['filth', 'document_name', 'detector', 'locale'], df.columns.names)

        # Expected columns, in order, each with its expected per-row counts.
        expected_counts = [
            (('phone', 'gb.txt', 'combined', 'en_GB'), [1, 0, 0, 0]),
            (('phone', 'gb.txt', 'tagged', 'en_GB'), [1, 1, 0, 0]),
            (('phone', 'us.txt', 'combined', 'en_US'), [0, 0, 1, 0]),
            (('phone', 'us.txt', 'tagged', 'en_US'), [0, 0, 1, 1]),
        ]
        self.assertEqual(
            [column for column, _ in expected_counts],
            df.columns.values.tolist(),
        )
        for column, counts in expected_counts:
            self.assertEqual(counts, df[column].values.tolist())
示例#18
0
    def test_grouper(self):
        """FilthGrouper positions per type, before and after merge_positions()."""
        filths = [
            MergedFilth(
                PhoneFilth(beg=0, end=4, text='John', detector_name='phone_det'),
                TaggedEvaluationFilth(beg=0, end=4, text='John', comparison_type='phone'),
            ),
            TaggedEvaluationFilth(beg=5, end=10, text='Hello', comparison_type='name'),
            AddressFilth(beg=100, end=103, text='123', detector_name='address_det'),
        ]
        grouper = scrubadub.comparison.FilthGrouper()
        grouper.add_filths(filths)

        phone_detected = {('phone', 'phone_det', 'None')}
        phone_tagged = {('phone', 'tagged', 'None')}
        name_tagged = {('name', 'tagged', 'None')}
        address_detected = {('address', 'address_det', 'None')}

        def check_first(kind, count, beg, end, detected, tagged):
            # Verify the number of positions for a type and the first position's fields.
            spans = grouper.types[kind].positions
            self.assertEqual(count, len(spans))
            self.assertEqual(beg, spans[0].beg)
            self.assertEqual(end, spans[0].end)
            self.assertEqual(detected, spans[0].detected)
            self.assertEqual(tagged, spans[0].tagged)
            return spans

        def check_name_and_address():
            # These two types are expected to look the same before and after merging.
            check_first('name', 1, 5, 10, set(), name_tagged)
            check_first('address', 1, 100, 103, address_detected, set())

        self.assertEqual(3, len(grouper.types))

        # Before merging, the detected and tagged phone entries are separate positions.
        phone_spans = check_first('phone', 2, 0, 4, phone_detected, set())
        self.assertEqual(set(), phone_spans[1].detected)
        self.assertEqual(phone_tagged, phone_spans[1].tagged)
        check_name_and_address()

        grouper.merge_positions()

        self.assertEqual(3, len(grouper.types))

        # After merging, a single phone position carries both sets.
        check_first('phone', 1, 0, 4, phone_detected, phone_tagged)
        check_name_and_address()
0
    def test_two_comparisons(self):
        """Report covering two filth types (phone and a custom 'temp' type) at once."""

        class TempFilth(Filth):
            type = 'temp'

        class TempDetector(Detector):
            filth_cls = TempFilth

        def tagged(beg, end, text, kind):
            return TaggedEvaluationFilth(beg=beg, end=end, text=text, comparison_type=kind)

        filths = [
            MergedFilth(
                PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
                tagged(0, 4, '1234', 'phone'),
            ),
            tagged(5, 10, '12345', 'phone'),
            MergedFilth(
                TempFilth(beg=5, end=9, text='1234', detector_name='temp'),
                tagged(5, 9, '1234', 'temp'),
            ),
            tagged(15, 20, '12345', 'temp'),
        ]

        def row(support, precision=1.0, f1_score=0.6666666666666666):
            return {
                'precision': precision,
                'recall': 0.5,
                'f1-score': f1_score,
                'support': support,
            }

        expected = {
            'phone:phone:None': row(2),
            'temp:temp:None': row(2),
            'micro avg': row(4),
            'macro avg': row(4),
            'weighted avg': row(4),
            # With more than one class, a 'samples avg' row is expected as well.
            'samples avg': row(4, precision=0.5, f1_score=0.5),
        }

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )
        self.assertEqual(expected, report)
示例#20
0
    def test_overall(self):
        """Check the report with detectors combined: one row per locale."""

        def scores(support):
            # Metrics shared by every expected row except 'samples avg'.
            return {
                'precision': 1.0,
                'recall': 0.5,
                'f1-score': 0.6666666666666666,
                'support': support,
            }

        # Columns: locale, detector name, span that is both detected and
        # tagged, span that is tagged only.
        cases = (
            ('en_GB', 'phone1', (0, 4), (5, 10)),
            ('en_US', 'phone2', (12, 16), (20, 25)),
        )

        filths = []
        for locale, detector, (hit_beg, hit_end), (miss_beg, miss_end) in cases:
            detected = PhoneFilth(
                beg=hit_beg, end=hit_end, text='1234',
                detector_name=detector, locale=locale,
            )
            tagged = TaggedEvaluationFilth(
                beg=hit_beg, end=hit_end, text='1234',
                comparison_type='phone', locale=locale,
            )
            filths.append(MergedFilth(detected, tagged))
            # Tagged filth with no detection merged into it.
            filths.append(
                TaggedEvaluationFilth(
                    beg=miss_beg, end=miss_end, text='12345',
                    comparison_type='phone', locale=locale,
                )
            )

        # The two differently named detectors are expected to be reported
        # under a single 'combined' label, split only by locale.
        expected = {
            'phone:combined:en_GB': scores(2),
            'phone:combined:en_US': scores(2),
            'micro avg': scores(4),
            'macro avg': scores(4),
            'weighted avg': scores(4),
            'samples avg': {
                'precision': 0.5,
                'recall': 0.5,
                'f1-score': 0.5,
                'support': 4,
            },
        }

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            combine_detectors=True,
            output_dict=True,
        )
        self.assertEqual(expected, report)
示例#21
0
    def test_groupby_document(self):
        """Check that grouping by document yields one report row per document."""
        document_names = ('1.txt', '2.txt')

        # Every document gets the same four items: a detection that is not
        # merged with any tag, a tagged-only filth, a detection merged with
        # its tag, and a second tagged-only filth.
        filths = []
        for document in document_names:
            filths.extend([
                PhoneFilth(
                    beg=0, end=4, text='1234',
                    detector_name='phone_v1', document_name=document,
                ),
                TaggedEvaluationFilth(
                    beg=5, end=10, text='12345',
                    comparison_type='phone', document_name=document,
                ),
                MergedFilth(
                    PhoneFilth(
                        beg=12, end=16, text='1234',
                        detector_name='phone_v1', document_name=document,
                    ),
                    TaggedEvaluationFilth(
                        beg=12, end=16, text='1234',
                        comparison_type='phone', document_name=document,
                    ),
                ),
                TaggedEvaluationFilth(
                    beg=20, end=25, text='12345',
                    comparison_type='phone', document_name=document,
                ),
            ])

        per_document = {
            'precision': 0.5,
            'recall': 0.3333333333333333,
            'f1-score': 0.4,
            'support': 3,
        }
        overall = dict(per_document, support=6)

        expected = {
            'phone:{}:phone_v1:None'.format(document): dict(per_document)
            for document in document_names
        }
        expected['micro avg'] = dict(overall)
        expected['macro avg'] = dict(overall)
        # The weighted f1-score is compared exactly, so the expected value
        # keeps its trailing floating point digit.
        expected['weighted avg'] = {
            'precision': 0.5,
            'recall': 0.3333333333333333,
            'f1-score': 0.4000000000000001,
            'support': 6,
        }
        expected['samples avg'] = {
            'precision': 0.25,
            'recall': 0.25,
            'f1-score': 0.25,
            'support': 6,
        }

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
            groupby_documents=True,
        )
        self.assertEqual(expected, report)
示例#22
0
 def test_dataframe(self):
     """Check the columns of the dataframe built from matched and unmatched filth."""
     # An empty list of filth must not make get_filth_dataframe raise.
     scrubadub.comparison.get_filth_dataframe([])

     # Detection and tag cover the identical span.
     exact_hit = MergedFilth(
         PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
         TaggedEvaluationFilth(beg=0, end=4, text='1234', comparison_type='phone'),
     )
     # Detection (4-9) overlaps the tag (5-9) without being identical to it.
     partial_hit = MergedFilth(
         PhoneFilth(beg=4, end=9, text=' 1234', detector_name='phone'),
         TaggedEvaluationFilth(beg=5, end=9, text='1234', comparison_type='phone'),
     )
     # Tagged filth with no detection merged into it.
     first_miss = TaggedEvaluationFilth(
         beg=5, end=10, text='12345', comparison_type='phone',
     )
     second_miss = TaggedEvaluationFilth(
         beg=15, end=20, text='12345', comparison_type='phone',
     )

     dataframe = scrubadub.comparison.get_filth_dataframe(
         [exact_hit, first_miss, partial_hit, second_miss]
     )

     def column(name):
         # Missing cells are replaced by 'none' so columns compare as plain lists.
         return dataframe[name].fillna('none').values.tolist()

     self.assertEqual(4, dataframe.shape[0])

     # Detected filth columns; the tagged-only rows have no detection.
     self.assertEqual(['phone', 'phone', 'none', 'none'], column('filth_type'))
     self.assertEqual([0, 4, 'none', 'none'], column('beg'))
     self.assertEqual([4, 9, 'none', 'none'], column('end'))

     # Tagged filth columns.
     self.assertEqual([0, 5, 5, 15], column('known_beg'))
     self.assertEqual([4, 9, 10, 20], column('known_end'))

     # Match flags: only the identical span is an exact match.
     self.assertEqual([True, False, False, False], column('exact_match'))
     self.assertEqual([True, True, False, False], column('partial_match'))

     # Classification flags.
     self.assertEqual([True, True, False, False], column('true_positive'))
     self.assertEqual([False, False, False, False], column('false_positive'))
     self.assertEqual([False, False, True, True], column('false_negative'))