def test_string_nonregex_init(self):
        """Exercise a ``StringSpec`` that is configured without a regex.

        Covers the length bounds, the effect of each ``case`` setting on
        validation, the stored metadata, and the constructor's rejection of
        invalid length and case arguments.
        """
        props = field_formats.FieldHashingProperties(ngram=2, k=20)
        spec = field_formats.StringSpec(
            identifier='first name',
            hashing_properties=props,
            case='mixed',
            min_length=5)

        # Only the minimum was supplied, so the maximum stays unset.
        self.assertEqual(spec.min_length, 5)
        self.assertIsNone(spec.max_length)

        # Too short for min_length=5.
        self.assertRaises(
            field_formats.InvalidEntryError, spec.validate, 'hi')

        # With case='mixed', both of these pass.
        for entry in ('hello this is fine', 'Hello This is FINE!'):
            spec.validate(entry)

        # For each strict case setting, a conforming entry passes and a
        # mixed-case entry is rejected.
        for case_style, conforming in (('lower', 'hello you'),
                                       ('upper', 'HELLO SHOUTY')):
            spec.case = case_style
            spec.validate(conforming)
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate('Hello You')

        # An unknown case style set after construction makes validate()
        # raise ValueError.
        spec.case = 'casey'
        with self.assertRaises(ValueError):
            spec.validate('boomboom')

        # Metadata: identifier is kept, description was never given.
        self.assertEqual(spec.identifier, 'first name')
        self.assertIsNone(spec.description)

        # The spec exposes its hashing properties.
        self.assertTrue(hasattr(spec, 'hashing_properties'))

        # Each of these argument sets must be refused by the constructor.
        invalid_arguments = (
            dict(case='mixed', min_length=-5),
            dict(case='mixed', max_length=-1),
            dict(case='caseychasey'),
        )
        for extra in invalid_arguments:
            with self.assertRaises(ValueError):
                field_formats.StringSpec(
                    identifier='first name',
                    hashing_properties=props,
                    **extra)
Exemplo n.º 2
0
    def test_string_nonregex_init(self):
        """Build a non-regex ``StringSpec`` and check its basic attributes.

        Verifies the length bounds, that a too-short entry is rejected while
        a long enough one passes, and that the metadata and hashing
        properties are present on the spec.
        """
        props = field_formats.FieldHashingProperties(
            ngram=2, encoding='utf-8')
        spec = field_formats.StringSpec(
            identifier='first name',
            hashing_properties=props,
            case='mixed',
            min_length=5)

        # Only the minimum was supplied, so the maximum stays unset.
        self.assertEqual(spec.min_length, 5)
        self.assertIsNone(spec.max_length)

        # 'hi' is shorter than min_length; the longer entry is accepted.
        self.assertRaises(
            field_formats.InvalidEntryError, spec.validate, 'hi')
        spec.validate('hello this is fine')

        # Metadata: identifier is kept, description was never given.
        self.assertEqual(spec.identifier, 'first name')
        self.assertIsNone(spec.description)

        # The spec exposes its hashing properties.
        self.assertTrue(hasattr(spec, 'hashing_properties'))
    def test_string_regex(self):
        """Check a regex-based string spec built from a JSON-style dict.

        Covers rejection of a malformed pattern, validation of entries
        against a well-formed pattern and the ASCII encoding, the stored
        metadata and hashing properties, missing-value sentinel handling,
        and invalid arguments passed directly to ``StringSpec``.
        """
        schema = {
            'identifier': 'regex',
            'format': {
                'type': 'string',
                'encoding': 'ascii',
                'pattern': r'[5-9',  # Unterminated character class.
                'description': 'foo',
            },
            'hashing': {'ngram': 1, 'strategy': {'k': 20}},
        }

        # A malformed pattern must be refused when the spec is built.
        with self.assertRaises(field_formats.InvalidSchemaError):
            field_formats.spec_from_json_dict(schema)

        # With a well-formed pattern, building the spec must not raise.
        schema['format']['pattern'] = r'dog(.dog)*'
        spec = field_formats.spec_from_json_dict(schema)

        # Entries that match the pattern are accepted.
        for matching in ('dog', 'dogodog'):
            spec.validate(matching)

        # The first three do not match the pattern; the last contains 'ø',
        # which cannot be represented in the ASCII encoding.
        for rejected in ('dogs', 'hot dog', 'hot dogs', u'dogødog'):
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(rejected)

        # Metadata taken from the schema.
        self.assertEqual(spec.identifier, 'regex')
        self.assertEqual(spec.description, 'foo')

        # Hashing properties taken from the schema.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 1)
        self.assertIs(hashing.positional, False)
        self.assertEqual(hashing.k, 20)

        # Add a missing-value sentinel with no replacement configured.
        schema['hashing']['missingValue'] = {'sentinel': 'null'}
        spec = field_formats.spec_from_json_dict(schema)
        # The sentinel itself must validate even though it does not match
        # the pattern.
        spec.validate('null')
        self.assertTrue(spec.is_missing_value('null'))
        self.assertFalse(spec.is_missing_value('dog'))
        replace = spec.hashing_properties.replace_missing_value
        self.assertEqual('null', replace('null'))
        self.assertEqual('dog', replace('dog'))

        # Once replaceWith is set, the sentinel maps to the replacement.
        schema['hashing']['missingValue']['replaceWith'] = 'cat'
        spec = field_formats.spec_from_json_dict(schema)
        self.assertEqual(
            'cat', spec.hashing_properties.replace_missing_value('null'))

        # Invalid arguments handed straight to the StringSpec constructor.
        props = field_formats.FieldHashingProperties(ngram=2, k=20)
        # An unknown case style alongside a regex raises ValueError.
        with self.assertRaises(ValueError):
            field_formats.StringSpec(
                identifier='regex',
                hashing_properties=props,
                case='casey',
                regex=r'dog(.dog)*')
        # A malformed regex raises InvalidEntryError.
        with self.assertRaises(field_formats.InvalidEntryError):
            field_formats.StringSpec(
                identifier='regex',
                hashing_properties=props,
                regex=r'[5-9')