Exemplo n.º 1
0
    def test_compiler_unstructured_reports(self, *mocks):
        """Verify which text statistics show up in the unstructured report.

        The profile is rebuilt while ``vocab`` and then ``words`` are
        toggled in the options, and each report is checked for the presence
        or absence of the matching key under ``"statistics"``.

        :param mocks: extra positional arguments (presumably mocks supplied
            by patch decorators -- confirm against the class); unused here.
        """
        samples = pd.Series(["Hello Hello", "This is a test grant"])
        compiler = col_pro_compilers.UnstructuredCompiler(samples)
        opts = UnstructuredOptions()

        def report_statistics(remove_disabled):
            # One report() call per invocation, mirroring a single lookup.
            full_report = compiler.report(remove_disabled_flag=remove_disabled)
            return full_report["statistics"]

        # vocab disabled: it is dropped only when disabled keys are removed.
        opts.text.vocab.is_enabled = False
        compiler._create_profile(samples, opts)

        stats = report_statistics(True)
        self.assertNotIn("vocab", stats)
        self.assertIn("words", stats)

        stats = report_statistics(False)
        self.assertIn("vocab", stats)
        self.assertIn("words", stats)

        # vocab re-enabled: both keys are reported again.
        opts.text.vocab.is_enabled = True
        compiler._create_profile(samples, opts)
        stats = report_statistics(True)
        self.assertIn("vocab", stats)
        self.assertIn("words", stats)

        # words disabled: only vocab survives the filtered report.
        opts.text.words.is_enabled = False
        compiler._create_profile(samples, opts)
        stats = report_statistics(True)
        self.assertIn("vocab", stats)
        self.assertNotIn("words", stats)
Exemplo n.º 2
0
    def test_compiler_stats_diff(self, *mocks):
        """Check UnstructuredCompiler.diff as profiles are switched off.

        Scenarios, asserted in order:
          1. both compilers built with default options,
          2. the data labeler disabled on the second compiler,
          3. text statistics also disabled on the second compiler,
          4. both compilers built from the options object that has the data
             labeler and text disabled.

        :param mocks: extra positional arguments (presumably mocks supplied
            by patch decorators -- confirm against the class); unused here.
        """
        first_data = pd.Series(["Hello Hello", "This is a test grant"])
        second_data = pd.Series(
            ["This is unknown", "my name grant", "9", "9"])

        # Scenario 1: default options on both sides.
        first = col_pro_compilers.UnstructuredCompiler(first_data)
        second = col_pro_compilers.UnstructuredCompiler(second_data)

        # Overwrite each data labeler's sample sizes and entity counts with
        # fixed values, then push one more sample through update().
        labeler_seeds = (
            (first, 15, {
                "word_level": {
                    "UNKNOWN": 5, "TEST": 5, "UNIQUE1": 5,
                },
                "true_char_level": {
                    "UNKNOWN": 4, "TEST": 8, "UNIQUE1": 8,
                },
                "postprocess_char_level": {
                    "UNKNOWN": 5, "TEST": 10, "UNIQUE1": 5,
                },
            }),
            (second, 10, {
                "word_level": {
                    "UNKNOWN": 2, "TEST": 4, "UNIQUE2": 4,
                },
                "true_char_level": {
                    "UNKNOWN": 8, "TEST": 8, "UNIQUE2": 4,
                },
                "postprocess_char_level": {
                    "UNKNOWN": 5, "TEST": 10, "UNIQUE2": 5,
                },
            }),
        )
        for seeded_compiler, word_sample_size, counts in labeler_seeds:
            labeler = seeded_compiler._profiles["data_labeler"]
            labeler.char_sample_size = 20
            labeler.word_sample_size = word_sample_size
            labeler.entity_counts = counts
            labeler.update(pd.Series(["a"]))

        # Expected text-statistics diff; reused by scenarios 1 and 2.
        expected_statistics = {
            "vocab": [
                ["H", "l"],
                ["e", "o", " ", "T", "h", "i", "s", "a", "t", "g", "r", "n"],
                ["u", "k", "w", "m", "y", "9"],
            ],
            "vocab_count": [
                {"l": 4, "H": 2},
                {
                    " ": 1,
                    "e": 2,
                    "s": 1,
                    "t": 2,
                    "o": 1,
                    "i": "unchanged",
                    "a": "unchanged",
                    "T": "unchanged",
                    "h": "unchanged",
                    "g": "unchanged",
                    "r": "unchanged",
                    "n": -4,
                },
                {"m": 2, "9": 2, "u": 1, "k": 1, "w": 1, "y": 1},
            ],
            "words": [
                ["Hello", "test"],
                ["grant"],
                ["unknown", "name", "9"],
            ],
            "word_count": [
                {"Hello": 2, "test": 1},
                {"grant": "unchanged"},
                {"9": 2, "unknown": 1, "name": 1},
            ],
        }
        expected_data_label = {
            "entity_counts": {
                "word_level": {
                    "UNKNOWN": 3,
                    "TEST": 1,
                    "UNIQUE1": [5, None],
                    "UNIQUE2": [None, 4],
                },
                "true_char_level": {
                    "UNKNOWN": -4,
                    "TEST": "unchanged",
                    "UNIQUE1": [8, None],
                    "UNIQUE2": [None, 4],
                },
                "postprocess_char_level": {
                    "UNKNOWN": "unchanged",
                    "TEST": "unchanged",
                    "UNIQUE1": [5, None],
                    "UNIQUE2": [None, 5],
                },
            },
            "entity_percentages": {
                "word_level": {
                    "UNKNOWN": 0.1333333333333333,
                    "TEST": -0.06666666666666671,
                    "UNIQUE1": [0.3333333333333333, None],
                    "UNIQUE2": [None, 0.4],
                },
                "true_char_level": {
                    "UNKNOWN": -0.2,
                    "TEST": "unchanged",
                    "UNIQUE1": [0.4, None],
                    "UNIQUE2": [None, 0.2],
                },
                "postprocess_char_level": {
                    "UNKNOWN": "unchanged",
                    "TEST": "unchanged",
                    "UNIQUE1": [0.25, None],
                    "UNIQUE2": [None, 0.25],
                },
            },
        }
        self.assertDictEqual(
            {
                "statistics": expected_statistics,
                "data_label": expected_data_label,
            },
            first.diff(second),
        )

        # Scenario 2: data labeler disabled on the second compiler, so only
        # the text statistics are expected in the diff.
        disabled_opts = UnstructuredOptions()
        disabled_opts.data_labeler.is_enabled = False
        second = col_pro_compilers.UnstructuredCompiler(
            second_data, disabled_opts)
        self.assertDictEqual(
            {"statistics": expected_statistics}, first.diff(second))

        # Scenario 3: text also disabled on the second compiler.
        disabled_opts.text.is_enabled = False
        second = col_pro_compilers.UnstructuredCompiler(
            second_data, disabled_opts)
        nothing_to_diff = {}
        self.assertDictEqual(nothing_to_diff, first.diff(second))

        # Scenario 4: the first compiler is rebuilt with the same options.
        first = col_pro_compilers.UnstructuredCompiler(
            first_data, disabled_opts)
        self.assertDictEqual(nothing_to_diff, first.diff(second))
Exemplo n.º 3
0
    def test_compiler_stats_diff(self, *mocks):
        """Compare two unstructured compilers via diff() in four setups.

        First both compilers use default options, then the second compiler
        is rebuilt with the data labeler disabled, then with text disabled
        as well, and finally the first compiler is rebuilt with those same
        options.

        :param mocks: extra positional arguments (presumably mocks supplied
            by patch decorators -- confirm against the class); unused here.
        """
        series_a = pd.Series(['Hello Hello', 'This is a test grant'])
        series_b = pd.Series(['This is unknown', 'my name grant', '9', '9'])

        def seed_labeler(target, word_sample_size, counts):
            # Force known sample sizes / entity counts onto the compiler's
            # data labeler, then run one update() on a single sample.
            labeler = target._profiles["data_labeler"]
            labeler.char_sample_size = 20
            labeler.word_sample_size = word_sample_size
            labeler.entity_counts = counts
            labeler.update(pd.Series(["a"]))

        # Default options on both compilers.
        compiler_a = col_pro_compilers.UnstructuredCompiler(series_a)
        compiler_b = col_pro_compilers.UnstructuredCompiler(series_b)

        seed_labeler(compiler_a, 15, {
            'word_level': {'UNKNOWN': 5, 'TEST': 5, 'UNIQUE1': 5},
            'true_char_level': {'UNKNOWN': 4, 'TEST': 8, 'UNIQUE1': 8},
            'postprocess_char_level': {
                'UNKNOWN': 5, 'TEST': 10, 'UNIQUE1': 5
            },
        })
        seed_labeler(compiler_b, 10, {
            'word_level': {'UNKNOWN': 2, 'TEST': 4, 'UNIQUE2': 4},
            'true_char_level': {'UNKNOWN': 8, 'TEST': 8, 'UNIQUE2': 4},
            'postprocess_char_level': {
                'UNKNOWN': 5, 'TEST': 10, 'UNIQUE2': 5
            },
        })

        # Text-statistics part of the expected diff; it is identical for
        # the first two assertions, so it is written once.
        statistics_diff = {
            'vocab': [
                ['H', 'l'],
                ['e', 'o', ' ', 'T', 'h', 'i', 's', 'a', 't', 'g', 'r', 'n'],
                ['u', 'k', 'w', 'm', 'y', '9'],
            ],
            'vocab_count': [
                {'l': 4, 'H': 2},
                {
                    ' ': 1,
                    'e': 2,
                    's': 1,
                    't': 2,
                    'o': 1,
                    'i': 'unchanged',
                    'a': 'unchanged',
                    'T': 'unchanged',
                    'h': 'unchanged',
                    'g': 'unchanged',
                    'r': 'unchanged',
                    'n': -4,
                },
                {'m': 2, '9': 2, 'u': 1, 'k': 1, 'w': 1, 'y': 1},
            ],
            'words': [
                ['Hello', 'test'],
                ['grant'],
                ['unknown', 'name', '9'],
            ],
            'word_count': [
                {'Hello': 2, 'test': 1},
                {'grant': 'unchanged'},
                {'9': 2, 'unknown': 1, 'name': 1},
            ],
        }
        data_label_diff = {
            'entity_counts': {
                'word_level': {
                    'UNKNOWN': 3,
                    'TEST': 1,
                    'UNIQUE1': [5, None],
                    'UNIQUE2': [None, 4],
                },
                'true_char_level': {
                    'UNKNOWN': -4,
                    'TEST': 'unchanged',
                    'UNIQUE1': [8, None],
                    'UNIQUE2': [None, 4],
                },
                'postprocess_char_level': {
                    'UNKNOWN': 'unchanged',
                    'TEST': 'unchanged',
                    'UNIQUE1': [5, None],
                    'UNIQUE2': [None, 5],
                },
            },
            'entity_percentages': {
                'word_level': {
                    'UNKNOWN': 0.1333333333333333,
                    'TEST': -0.06666666666666671,
                    'UNIQUE1': [0.3333333333333333, None],
                    'UNIQUE2': [None, 0.4],
                },
                'true_char_level': {
                    'UNKNOWN': -0.2,
                    'TEST': 'unchanged',
                    'UNIQUE1': [0.4, None],
                    'UNIQUE2': [None, 0.2],
                },
                'postprocess_char_level': {
                    'UNKNOWN': 'unchanged',
                    'TEST': 'unchanged',
                    'UNIQUE1': [0.25, None],
                    'UNIQUE2': [None, 0.25],
                },
            },
        }
        expected = {
            'statistics': statistics_diff,
            'data_label': data_label_diff,
        }
        self.assertDictEqual(expected, compiler_a.diff(compiler_b))

        # Data labeler disabled on the second compiler: the diff is
        # expected to contain the text statistics only.
        opts = UnstructuredOptions()
        opts.data_labeler.is_enabled = False
        compiler_b = col_pro_compilers.UnstructuredCompiler(series_b, opts)
        expected = {'statistics': statistics_diff}
        self.assertDictEqual(expected, compiler_a.diff(compiler_b))

        # Text disabled too on the second compiler: an empty diff.
        opts.text.is_enabled = False
        compiler_b = col_pro_compilers.UnstructuredCompiler(series_b, opts)
        expected = {}
        self.assertDictEqual(expected, compiler_a.diff(compiler_b))

        # Same options on the first compiler as well: still an empty diff.
        compiler_a = col_pro_compilers.UnstructuredCompiler(series_a, opts)
        self.assertDictEqual(expected, compiler_a.diff(compiler_b))
Exemplo n.º 4
0
    def test_base(self, *mocks):
        """Build an UnstructuredCompiler and compare its whole profile.

        ``time.time`` is patched so successive calls return 1.0, 2.0, ...;
        the expected profile lists every timing entry as 1.0.

        :param mocks: extra positional arguments (presumably mocks supplied
            by patch decorators -- confirm against the class); unused here.
        """
        from collections import defaultdict

        import pandas as pd

        samples = pd.Series(["test", "hi my name is John Doe. 123-432-1234"])

        # Descending list consumed from the end, so pop() yields 1.0 first.
        fake_clock = list(map(float, range(100, 0, -1)))
        with mock.patch("time.time", side_effect=lambda: fake_clock.pop()):
            compiler = col_pro_compilers.UnstructuredCompiler(samples)

        vocab_count = {
            " ": 6,
            "-": 2,
            ".": 1,
            "1": 2,
            "2": 3,
            "3": 3,
            "4": 2,
            "D": 1,
            "J": 1,
            "a": 1,
            "e": 3,
            "h": 2,
            "i": 2,
            "m": 2,
            "n": 2,
            "o": 2,
            "s": 2,
            "t": 2,
            "y": 1,
        }
        word_count = {
            "123-432-1234": 1,
            "Doe": 1,
            "John": 1,
            "hi": 1,
            "name": 1,
            "test": 1,
        }
        expected_statistics = {
            "times": {"vocab": 1.0, "words": 1.0},
            "vocab_count": vocab_count,
            # The expected vocab is exactly the counted characters in
            # sorted order.
            "vocab": sorted(vocab_count),
            "word_count": word_count,
            "words": ["test", "hi", "name", "John", "Doe", "123-432-1234"],
        }
        expected_data_label = {
            "entity_counts": {
                "postprocess_char_level": defaultdict(int),
                "true_char_level": defaultdict(int),
                "word_level": defaultdict(int),
            },
            "entity_percentages": {
                "postprocess_char_level": defaultdict(int),
                "true_char_level": defaultdict(int),
                "word_level": defaultdict(int),
            },
            "times": {"data_labeler_predict": 1.0},
        }
        expected_profile = {
            "data_label": expected_data_label,
            "statistics": expected_statistics,
        }

        actual_profile = compiler.profile

        # The reported vocab order is not stable between runs (the original
        # note attributes this to vocab being kept in a set), so sort it
        # before the exact comparison.
        if "statistics" in actual_profile:
            actual_statistics = actual_profile["statistics"]
            if "vocab" in actual_statistics:
                actual_statistics["vocab"] = sorted(
                    actual_statistics["vocab"])

        self.assertDictEqual(expected_profile, actual_profile)
Exemplo n.º 5
0
    def test_base(self, *mocks):
        """Check the full profile produced by a fresh UnstructuredCompiler.

        The compiler is constructed while ``time.time`` is patched to
        return 1.0, 2.0, ... on successive calls; every timing entry in the
        expected profile is 1.0.

        :param mocks: extra positional arguments (presumably mocks supplied
            by patch decorators -- confirm against the class); unused here.
        """
        import pandas as pd
        from collections import defaultdict

        def empty_levels():
            # One empty counter per labelling level.
            return {
                level: defaultdict(int)
                for level in ('postprocess_char_level', 'true_char_level',
                              'word_level')
            }

        text_series = pd.Series(
            ['test', 'hi my name is John Doe. 123-432-1234'])

        # pop() takes from the end of the descending list: 1.0, 2.0, ...
        tick_values = [float(tick) for tick in range(100, 0, -1)]

        def next_tick():
            return tick_values.pop()

        with mock.patch('time.time', side_effect=next_tick):
            compiler = col_pro_compilers.UnstructuredCompiler(text_series)

        wanted_data_label = {
            'entity_counts': empty_levels(),
            'entity_percentages': empty_levels(),
            'times': {'data_labeler_predict': 1.0},
        }
        wanted_statistics = {
            'times': {'vocab': 1.0, 'words': 1.0},
            'vocab_count': {
                ' ': 6, '-': 2, '.': 1,
                '1': 2, '2': 3, '3': 3, '4': 2,
                'D': 1, 'J': 1,
                'a': 1, 'e': 3, 'h': 2, 'i': 2, 'm': 2,
                'n': 2, 'o': 2, 's': 2, 't': 2, 'y': 1,
            },
            'vocab': [
                ' ', '-', '.', '1', '2', '3', '4', 'D', 'J', 'a', 'e', 'h',
                'i', 'm', 'n', 'o', 's', 't', 'y'
            ],
            'word_count': {
                '123-432-1234': 1,
                'Doe': 1,
                'John': 1,
                'hi': 1,
                'name': 1,
                'test': 1,
            },
            'words': ['test', 'hi', 'name', 'John', 'Doe', '123-432-1234'],
        }
        wanted_profile = {
            'data_label': wanted_data_label,
            'statistics': wanted_statistics,
        }

        produced = compiler.profile

        # vocab comes back in no fixed order (the original note attributes
        # this to it being stored as a set); sort it so the dict comparison
        # can be exact.
        has_vocab = ('statistics' in produced
                     and 'vocab' in produced['statistics'])
        if has_vocab:
            produced_vocab = produced['statistics']['vocab']
            produced['statistics']['vocab'] = sorted(produced_vocab)

        self.assertDictEqual(wanted_profile, produced)