class RFC3743Parser(LGRParser):
    """
    Parser building an LGR from a line-oriented RFC 3743-style table.

    Blank lines and lines starting with ``#`` are ignored. Every other line
    is, in order of precedence: a reference definition (``REFERENCE_RE``),
    a version definition (``VERSION_RE``) or a code point entry of the form
    ``char[;preferred variant(s)[;variant(s)]]``.
    """

    def unicode_version(self):
        """Return an empty string: no Unicode version is defined in file."""
        return ""

    def validate_document(self, schema=None):
        """
        Return an empty string: no validation of document is done for now.

        :param schema: Unused.
        """
        return ""

    def parse_document(self):
        """
        Parse the source and return the LGR built from it.

        ``self.source`` is read directly when it exposes a ``read``
        attribute; otherwise it is opened as a UTF-8 text file.

        :return: The LGR populated from the document.
        """
        if not self.filename and isinstance(self.source, str):
            self.filename = os.path.basename(self.source)

        self._lgr = LGR(name=self.filename)

        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        Errors on a single line are logged and never abort the parsing of
        the remaining lines.

        :param rule_file: Content of the rule, as a file-like object.
        """
        for line_num, line in enumerate(rule_file, 1):
            line = line.strip()
            # Skip blank lines and full-line comments
            if not line or line[0] == '#':
                continue

            reference = REFERENCE_RE.match(line)
            if reference is not None:
                try:
                    self._lgr.add_reference(
                        reference.group('value'),
                        ref_id=reference.group('ref_id'),
                        comment=reference.group('comment'))
                except LGRException:
                    logger.error("Invalid reference '%s' on line %d",
                                 line, line_num)
                continue

            version = VERSION_RE.match(line)
            if version is not None:
                try:
                    self._lgr.metadata.version = Version(
                        version.group('version_no'),
                        comment=version.group('comment'))
                    self._lgr.metadata.date = version.group('date')
                except LGRException:
                    logger.error("Invalid version '%s' on line %d",
                                 line, line_num)
                continue

            if UNICODE_CODEPOINT_RE.match(line) is None:
                logger.debug("Skipping non-parsable line %d:\n%s",
                             line_num, line)
                # Line is not starting with a valid unicode code point, skip
                continue

            # Split base character from variant(s)
            char_variant = line.split(';')
            char = char_variant[0]

            try:
                [(codepoints, references)] = parse_char(char)
                self._lgr.add_cp(codepoints, ref=references)
            except ValueError:
                logger.error("Invalid character '%s' at line %d",
                             char, line_num)
                # Without a parsed base character, 'codepoints' would be
                # unbound or left over from a previous line: the variants
                # of this line cannot be attached to anything, skip them.
                continue
            except LGRException as exc:
                # The variants on this line are still processed below.
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            if len(char_variant) > 1:
                preferred_variants = char_variant[1].strip()
                if preferred_variants and preferred_variants[0] != '#':
                    # From RFC7940, Section 7.3. Recommended Disposition
                    # Values:
                    # activated  The resulting string should be activated
                    # for use. (This is the same as a Preferred Variant
                    # [RFC3743].)
                    var_type = "activated"
                    self.insert_variant(line_num, codepoints,
                                        preferred_variants, var_type)

            if len(char_variant) > 2:
                variants = char_variant[2].strip()
                if variants and variants[0] != '#':
                    self.insert_variant(line_num, codepoints, variants)

    def insert_variant(self, line_num, codepoints, var, var_type=None):
        """
        Parse a variant field and add each variant to a code point.

        Parsing and insertion errors are logged, not raised.

        :param line_num: Line number of the entry, used in error messages.
        :param codepoints: Code point(s) the variants are added to.
        :param var: Raw text of the variant field, as given to parse_char.
        :param var_type: Variant type passed to the LGR, None if undefined.
        """
        try:
            variants = parse_char(var)
        except ValueError:
            logger.error("Invalid variant '%s' at line %d", var, line_num)
            return

        for (var_codepoints, references) in variants:
            try:
                self._lgr.add_variant(codepoints, var_codepoints,
                                      ref=references, variant_type=var_type)
            except LGRException as exc:
                logger.error(
                    "Cannot add variant '%s' to code point '%s' at line %d: %s",
                    format_cp(var_codepoints), format_cp(codepoints),
                    line_num, exc)
class TestLGRCore(unittest.TestCase):
    """
    Unit tests for the core LGR object.

    Covers adding/removing code points, ranges and variants, tag and
    reference handling, label variant generation and label eligibility.
    Methods are described with comments rather than docstrings so that the
    test runner keeps reporting the method names.
    """

    def setUp(self):
        # Each test starts from an empty LGR backed by the 6.3.0 database.
        unidb = IDNADatabase('6.3.0')
        self.lgr = LGR(unicode_database=unidb)

    # A code point given as a one-element list is found as an int.
    def test_add_single_cp_list(self):
        self.lgr.add_cp([0x0061])
        self.assertIn(0x0061, self.lgr.repertoire)

    # A code point can also be given as a bare int.
    def test_add_single_cp_int(self):
        self.lgr.add_cp(0x0061)
        self.assertIn(0x0061, self.lgr.repertoire)

    # A sequence does not bring its individual code points in.
    def test_add_cp_sequence(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)

    # Sequences sharing a prefix can coexist.
    def test_add_multiple_cp_sequences(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertIn([0x0061, 0x0062, 0x0063], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)
        self.assertNotIn(0x0063, self.lgr.repertoire)

    # Adding an existing code point raises, whatever its input form.
    def test_add_cp_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, [0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, 0x0061)

    # Without override, code points outside the validating LGR are refused.
    def test_add_cp_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        self.assertRaises(NotInRepertoire, self.lgr.add_cp, [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    # With override, code points outside the validating LGR are accepted.
    def test_add_cp_validation_override(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        self.lgr.add_cp([0x0062],
                        validating_repertoire=validation_lgr,
                        override_repertoire=True)
        self.assertIn(0x0062, self.lgr.repertoire)

    # Added as int, deleted as list.
    def test_del_single_cp_list(self):
        self.lgr.add_cp(0x0061)
        self.lgr.del_cp([0x0061])
        self.assertNotIn(0x0061, self.lgr.repertoire)

    # Added as list, deleted as int.
    def test_del_single_cp_int(self):
        self.lgr.add_cp([0x0061])
        self.lgr.del_cp(0x0061)
        self.assertNotIn(0x0061, self.lgr.repertoire)

    # Deleting a sequence leaves the repertoire empty.
    def test_del_cp_sequence(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.del_cp([0x0061, 0x0062])
        self.assertEqual(len(self.lgr.repertoire), 0)

    # Members of a sequence cannot be deleted individually.
    def test_del_cp_sequence_with_cp(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0061)
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0062)
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)

    # 'when' and 'not_when' cannot both be set on a code point.
    def test_add_cp_when_not_when(self):
        self.lgr.add_cp([0x0061], when='w1')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0062], when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0062])

        self.lgr.add_cp([0x0062], not_when='nw2')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0063], when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0063])

    # Both ends of an added range are included.
    def test_add_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIn(cp, self.lgr.repertoire)

    # Adding the same range twice raises.
    def test_add_range_in_repertoire(self):
        self.lgr.add_range(0x0061, 0x007A)
        self.assertRaises(CharAlreadyExists,
                          self.lgr.add_range, 0x0061, 0x007A)

    # Range validation against a repertoire made of single code points.
    def test_add_range_validation(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A + 1):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0061, 0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range, 0x00F8, 0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    # Range validation against a repertoire made of a range.
    def test_add_range_validation_with_range(self):
        validation_lgr = LGR()
        validation_lgr.add_range(0x0061, 0x007A)
        self.lgr.add_range(0x0061, 0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range, 0x00F8, 0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    # With override, a range outside the validating LGR is accepted.
    def test_add_range_validation_override(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0031, 0x0032,
                           validating_repertoire=validation_lgr,
                           override_repertoire=True)
        self.assertIn(0x0031, self.lgr.repertoire)

    # 'when' and 'not_when' cannot both be set on a range.
    def test_add_range_when_not_when(self):
        self.lgr.add_range(0x0061, 0x0065, when='w1')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x0066, 0x007A, when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x0066)
        self.assertEqual(the_exception.last_cp, 0x007A)

        self.lgr.add_range(0x0066, 0x007A, not_when='nw2')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x01BD, 0x01C3, when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x01BD)
        self.assertEqual(the_exception.last_cp, 0x01C3)

    # expand_ranges() turns every RangeChar into plain Char objects.
    def test_expand_ranges(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)
        self.lgr.add_range(0x01BD, 0x01C3)
        for cp in range(0x01BD, 0x01C3 + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        self.lgr.expand_ranges()
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)
        for cp in range(0x01BD, 0x01C3 + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    # expand_range() turns one given range into plain Char objects.
    def test_expand_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        self.lgr.expand_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    # Adding the same variant twice raises.
    def test_add_variant_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.assertRaises(VariantAlreadyExists, self.lgr.add_variant,
                          [0x0061], [0x0030])

    # Without override, variants outside the validating LGR are refused.
    def test_add_variant_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.assertRaises(NotInRepertoire, self.lgr.add_variant,
                          [0x0061], [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    # 'when' and 'not_when' cannot both be set on a variant.
    def test_add_variant_when_not_when(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], when='w1')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031],
                                 when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

        self.lgr.add_variant([0x0061], [0x0030], not_when='nw2')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031],
                                 when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

    # Despite its name, this checks that a variant outside the validating
    # LGR is accepted when override_repertoire is True.
    def test_del_cp_validation_override(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.lgr.add_variant([0x0061], [0x0062],
                             validating_repertoire=validation_lgr,
                             override_repertoire=True)
        self.assertIn((0x0062, ), self.lgr.repertoire[0x0061]._variants)

    # get_variants() returns a generator over the variants of a code point.
    def test_get_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        variants = self.lgr.get_variants([0x0061])
        self.assertIsInstance(variants, types.GeneratorType)
        variant_list = list(variants)
        self.assertEqual(len(variant_list), 1)

    # check_range() must not add anything to the repertoire.
    def test_check_range_no_modification(self):
        self.lgr.check_range(0x0060, 0x007F)
        self.assertEqual(len(self.lgr.repertoire), 0)

    # check_range() reports, per code point, the error it would trigger.
    def test_check_range(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x007A])
        codepoints = self.lgr.check_range(0x0060, 0x007F)
        for result in codepoints:
            cp = result[0]
            prop = result[1]
            if cp == 0x0060 or cp >= 0x007B:
                self.assertIsInstance(prop, CharInvalidIdnaProperty)
            elif cp == 0x0061 or cp == 0x007A:
                self.assertIsInstance(prop, CharAlreadyExists)
            else:
                self.assertIsNone(prop)

    # add_codepoints() groups consecutive code points into ranges.
    def test_add_codepoints(self):
        self.lgr.add_codepoints([c for c in range(0x0061, 0x007A + 1)] +
                                [0x0107] + [0x0137, 0x0138])
        expected_output = [
            RangeChar(0x0061, 0x0061, 0x007A),
            Char(0x0107),
            RangeChar(0x0137, 0x0137, 0x0138)
        ]
        self.assertEqual(expected_output, list(self.lgr.repertoire))

    # Duplicate tags on a code point are rejected.
    def test_tags_on_codepoint(self):
        self.lgr.add_cp([0x0061], tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0062], tag=['t1', 't1'])
        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    # Tags are not allowed on a sequence.
    def test_tags_on_codepoint_sequence(self):
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0061, 0x0062], tag=['t1'])
        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.SEQUENCE_NO_TAG)

    # Duplicate tags on a range are rejected.
    def test_tags_on_range(self):
        self.lgr.add_range(0x0061, 0x0062, tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_range(0x0063, 0x0064, tag=['t1', 't1'])
        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    # 'types' collects the distinct variant types in use.
    def test_list_types(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], variant_type='BLOCK')
        self.lgr.add_variant([0x0061], [0x0031], variant_type='VALID')
        self.lgr.add_variant([0x0061], [0x0032], variant_type='BLOCK')
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(self.lgr.types, set(['BLOCK', 'VALID']))

    # Deleting a reference also removes it from the code points using it.
    def test_del_reference(self):
        ref_id_1 = self.lgr.add_reference("Test - 1")
        ref_id_2 = self.lgr.add_reference("Test - 2")
        self.lgr.add_cp([0x0061], ref=[ref_id_1])
        self.lgr.add_cp([0x0062], ref=[ref_id_1, ref_id_2])
        self.lgr.del_reference(ref_id_1)
        self.assertNotIn(ref_id_1, self.lgr.reference_manager)
        self.assertEqual(self.lgr.get_char([0x0061]).references, [])
        self.assertEqual(self.lgr.get_char([0x0062]).references, [ref_id_2])

    # The same reference cannot be listed twice on a code point.
    def test_add_cp_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_cp([0x0061], ref=[ref_id, ref_id])
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    # The same reference cannot be listed twice on a range.
    def test_add_range_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_range(0x0061, 0x0062, ref=[ref_id, ref_id])
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, 0x0061)

    # The same reference cannot be listed twice on a variant.
    def test_add_variant_duplicate_reference(self):
        self.lgr.add_cp([0x0061])
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_variant([0x0061], [0x0062], ref=[ref_id, ref_id])
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    # Expected (variant label, variant types, flag) triples produced by
    # _generate_label_variants(); results are compared as sets since only
    # the content matters here, not the order.
    def test_generate_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])
        self.lgr.add_cp([0x0064])
        self.lgr.add_variant([0x0061], [0x0070], variant_type="type0")
        self.lgr.add_variant([0x0062], [0x0071], variant_type="type1")
        self.lgr.add_variant([0x0062], [0x0072], variant_type="type2")

        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([],
                         list(self.lgr._generate_label_variants([0x0063])))
        self.assertEqual(
            [],
            list(self.lgr._generate_label_variants([0x0063, 0x0064])))
        self.assertEqual(
            set([((0x0071, 0x0063), frozenset(['type1']), False),
                 ((0x0072, 0x0063), frozenset(['type2']), False)]),
            set(self.lgr._generate_label_variants([0x0062, 0x0063])))
        self.assertEqual(
            set([
                ((0x0061, 0x0062), frozenset(), False),
                ((0x0061, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0071), frozenset(['type0', 'type1']), True),
                ((0x0070, 0x0072), frozenset(['type0', 'type2']), True),
            ]),
            set(self.lgr._generate_label_variants([0x0061, 0x0062])))
        self.assertEqual(
            set([
                ((0x0061, 0x0062, 0x0062), frozenset(), False),
                ((0x0061, 0x0062, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0062, 0x0072), frozenset(['type2']), False),
                ((0x0061, 0x0071, 0x0062), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0072),
                 frozenset(['type1', 'type2']), False),
                ((0x0061, 0x0072, 0x0062), frozenset(['type2']), False),
                ((0x0061, 0x0072, 0x0071),
                 frozenset(['type1', 'type2']), False),
                ((0x0061, 0x0072, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0062, 0x0071),
                 frozenset(['type0', 'type1']), False),
                ((0x0070, 0x0062, 0x0072),
                 frozenset(['type0', 'type2']), False),
                ((0x0070, 0x0071, 0x0062),
                 frozenset(['type0', 'type1']), False),
                ((0x0070, 0x0071, 0x0071),
                 frozenset(['type0', 'type1']), True),
                ((0x0070, 0x0071, 0x0072),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0062),
                 frozenset(['type0', 'type2']), False),
                ((0x0070, 0x0072, 0x0071),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0072),
                 frozenset(['type0', 'type2']), True),
            ]),
            set(self.lgr._generate_label_variants([0x0061, 0x0062, 0x0062])))

    # Same as above, with a code point being its own (reflexive) variant.
    def test_generate_variants_reflexive(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])
        self.lgr.add_variant([0x0062], [0x0062], variant_type="reflexive")
        self.lgr.add_variant([0x0063], [0x0070], variant_type="type")

        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([],
                         list(self.lgr._generate_label_variants([0x0061])))
        self.assertEqual([((0x0062, ), frozenset(['reflexive']), True)],
                         list(self.lgr._generate_label_variants([0x0062])))
        self.assertEqual(
            set([
                ((0x0062, 0x0063), frozenset(['reflexive']), False),
                ((0x0062, 0x0070), frozenset(['reflexive', 'type']), True),
            ]),
            set(self.lgr._generate_label_variants([0x0062, 0x0063])))

    # _test_preliminary_eligibility() returns
    # (eligible, matched label part, code points not in LGR).
    def test_label_simple(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062, 0x0063])
        self.lgr.add_range(0x0064, 0x0068)

        valid_labels = ([0x0061],
                        [0x0062, 0x0063],
                        [0x0064],
                        [0x0068],
                        [0x0061, 0x0064],
                        [0x0061, 0x0062, 0x0063, 0x0064],
                        [0x0062, 0x0063, 0x0068])
        invalid_labels = (([0x0060], [], [0x0060]),
                          ([0x0069], [], [0x0069]),
                          ([0x0062], [], [0x0062]),
                          ([0x0063], [], [0x0063]),
                          ([0x0061, 0x0062], [0x0061], [0x0062]))

        for label in valid_labels:
            self.assertEqual((True, label, []),
                             self.lgr._test_preliminary_eligibility(label))
        for (label, label_part, not_in_lgr) in invalid_labels:
            self.assertEqual((False, label_part, not_in_lgr),
                             self.lgr._test_preliminary_eligibility(label))

    # A label can be matched by a single code point or a longer sequence
    # starting with it.
    def test_label_eligibility_multiple_choices(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.lgr.add_cp([0x0064])

        self.assertEqual(self.lgr._test_preliminary_eligibility([0x0062]),
                         (False, [], [0x0062]))
        self.assertEqual(
            self.lgr._test_preliminary_eligibility(
                [0x0061, 0x0062, 0x0063, 0x0064]),
            (True, [0x0061, 0x0062, 0x0063, 0x0064], []))

    # Labels containing a code point with an 'invalid' reflexive variant
    # get the 'invalid' disposition.
    def test_label_delayed_eligibilty(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'block')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'invalid')
        self.lgr.add_cp([0x0063, 0x0064])
        self.lgr.add_variant([0x0063, 0x0064], [0x0063, 0x0064], 'invalid')

        self.assertEqual(self.lgr._test_label_disposition([0x0062]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0063, 0x0064]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0061, 0x0062]),
                         ('invalid', 0))

    # The maximum label length shrinks as the number of variants grows.
    def test_label_length(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'disp')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'disp')
        self.assertEqual(PROTOCOL_LABEL_MAX_LENGTH,
                         self.lgr.max_label_length())

        for i in range(80):
            self.lgr.add_variant([0x0062], [0x074D + i], 'disp')
        # 41: mean number of variants per character
        self.assertEqual(int(math.log(MAX_NUMBER_GENERATED_VARIANTS, 41)),
                         self.lgr.max_label_length())