def test_merge_actions(self): merged_lgr = LGR() lgr = LGR() lgr.add_action(Action(match='rule-name', disp='invalid')) lgr.actions_xml.append("""<action disp="invalid" match="rule-name"/>""") merge_actions(lgr, 'fr', merged_lgr, {}) self.assertEqual(len(merged_lgr.actions), 1) self.assertEqual(len(merged_lgr.actions_xml), 1) self.assertEqual(merged_lgr.actions[0].match, 'fr-rule-name') # Default action should not be merged lgr = LGR() lgr.add_action(Action(disp='invalid', comment="Default action for invalid", any_variant=['invalid'])) lgr.actions_xml.append("""<action disp="invalid" match="rule-name"/>""") merge_actions(lgr, 'fr', merged_lgr, {}) self.assertEqual(len(merged_lgr.actions), 1) self.assertEqual(len(merged_lgr.actions_xml), 1) self.assertEqual(merged_lgr.actions[0].match, 'fr-rule-name')
class XMLParser(LGRParser): # Keep content intact, so do not strip CDATA section # (used in the <meta>/<description> element). # Do not resolve entities. # Skip comment, as we do not care. PARSER_OPTIONS = { 'resolve_entities': False, 'strip_cdata': False, 'remove_comments': True } def __init__(self, *args, **kwargs): if 'force_mode' in kwargs: force_mode = kwargs['force_mode'] del kwargs['force_mode'] else: force_mode = True super(XMLParser, self).__init__(*args, **kwargs) self.force_mode = force_mode self.rfc7940_checks = LGRFormatTestResults() def validate_document(self, rng_schema_path): # Construct the RelaxNG validator schema = etree.RelaxNG(file=rng_schema_path) # Parse the XML file parser = etree.XMLParser(**self.PARSER_OPTIONS) doc = etree.parse(self.source, parser=parser) logger.debug("Validating document '%s' with RNG '%s'", self.source, rng_schema_path) error_log = None if not schema.validate(doc): logger.warning("Validation of document '%s' failed", self.source) self.rfc7940_checks.error('schema') error_log = schema.error_log if len(error_log) == 0: # Bug in LXML, see https://bugs.launchpad.net/lxml/+bug/1526522 error_log = "CANNOT VALIDATE XML" self.rfc7940_checks.tested('schema') return error_log def unicode_version(self): logger.debug("Get unicode version from meta") # Only parse the "meta" element # Skip comment, as we do not care. context = etree.iterparse(self.source, tag=META_TAG, **self.PARSER_OPTIONS) self._fast_iter(context) unicode_version = self._lgr.metadata.unicode_version self._lgr = None # FD is now potentially at the end of the documents, # set it back to start if hasattr(self.source, "seek"): self.source.seek(0) return unicode_version def parse_document(self): logger.debug('Start parsing of file: %s', self.filename) # Keep content intact, so do not strip CDATA section # (used in the <meta>/<description> element). # Do not resolve entities. # Skip comment, as we do not care. context = etree.iterparse(self.source, **self.PARSER_OPTIONS) self._fast_iter(context) # FD is now potentially at the end of the documents, # set it back to start if hasattr(self.source, "seek"): self.source.seek(0) self.rfc7940_checks.tested('parse_xml') return self._lgr def _process_meta(self, elem): """ Process the <meta> element of an LGR XML file. """ metadata = Metadata(self.rfc7940_checks) reference_manager = ReferenceManager() MAPPER = { DATE_TAG: lambda d: metadata.set_date(d, force=self.force_mode), VALIDITY_START_TAG: lambda d: metadata.set_validity_start(d, force=self.force_mode), VALIDITY_END_TAG: lambda d: metadata.set_validity_end(d, force=self.force_mode), UNICODE_VERSION_TAG: lambda d: metadata.set_unicode_version(d, force=self.force_mode), } unicode_version_tag_found = False for child in elem: tag = child.tag logger.debug("Got '%s' element", tag) if tag in MAPPER: MAPPER[tag](child.text) if tag == UNICODE_VERSION_TAG: unicode_version_tag_found = True elif tag == VERSION_TAG: metadata.version = Version(child.text, child.get('comment', None)) elif tag == LANGUAGE_TAG: metadata.add_language(child.text, force=self.force_mode) elif tag == SCOPE_TAG: metadata.scopes.append( Scope(child.text, child.get('type', None))) elif tag == DESCRIPTION_TAG: # Seems to be an issue with CDATA/iterparse: https://bugs.launchpad.net/lxml/+bug/1788449 # For now, manually replace CRLF with LF metadata.description = Description( child.text.replace('\r\n', '\n'), child.get('type', None)) elif tag == REFERENCES_TAG: for reference in child: value = reference.text # Don't convert it to an int since ref_id may be a string ref_id = reference.get('id') comment = reference.get('comment', None) reference_manager.add_reference(value, comment=comment, ref_id=ref_id) # Since we have processed <reference> elements here, let's clean-up child.clear() else: logger.warning("Unhandled '%s' element in <meta> section", tag) self.rfc7940_checks.error('parse_xml') child.clear() self.rfc7940_checks.add_test_result('explicit_unicode_version', unicode_version_tag_found) self._lgr = LGR(name=self.filename, metadata=metadata, reference_manager=reference_manager, unicode_database=self._unicode_database) def _process_data(self, elem): """ Process the <data> element of an LGR XML file. """ # It is RECOMMENDED to list all "char" elements in ascending order of # the "cp" attribute. The below variable is used when verifying that. previous_codepoint = [] for child in elem: comment = child.get('comment', None) when = child.get('when', None) not_when = child.get('not-when', None) # Handle references ref = string_to_list(child.get('ref', '')) # Handle tags tag = string_to_list(child.get('tag', '')) if child.tag == CHAR_TAG: codepoint = [int(c, 16) for c in child.get('cp').split()] if codepoint <= previous_codepoint: if previous_codepoint[0:len(codepoint)] == codepoint: # Not clear what order is to be recommended here self.rfc7940_checks.error( 'char_strict_ascending_order') else: logger.warning( "cp attribute not in ascending order: '%s'", child.get('cp')) self.rfc7940_checks.error('char_ascending_order') previous_codepoint = codepoint try: self._lgr.add_cp(codepoint, comment=comment, ref=ref, tag=tag, when=when, not_when=not_when, force=self.force_mode) except LGRException as exc: logger.error("Cannot add code point '%s': %s", format_cp(codepoint), exc) self.rfc7940_checks.error('parse_xml') self.rfc7940_checks.error('codepoint_valid') if not self.force_mode: raise # Variants of char for variant in child.iter(VARIANT_TAG): var_codepoint = [ int(c, 16) for c in variant.get('cp').split() ] when = variant.get('when', None) not_when = variant.get('not-when', None) variant_type = variant.get('type', None) comment = variant.get('comment', None) # Handle references ref = string_to_list(variant.get('ref', '')) try: self._lgr.add_variant(codepoint, var_codepoint, variant_type=variant_type, when=when, not_when=not_when, comment=comment, ref=ref, force=self.force_mode) except LGRException as exc: logger.error( "Cannot add variant '%s' " "to code point '%s': %s", format_cp(var_codepoint), format_cp(codepoint), exc) self.rfc7940_checks.error('parse_xml') self.rfc7940_checks.error('codepoint_valid') if not self.force_mode: raise elif child.tag == RANGE_TAG: first_cp = int(child.get('first-cp'), 16) last_cp = int(child.get('last-cp'), 16) try: self._lgr.add_range(first_cp, last_cp, comment=comment, ref=ref, tag=tag, when=when, not_when=not_when, force=self.force_mode) except LGRException as exc: self.rfc7940_checks.error('parse_xml') self.rfc7940_checks.error('codepoint_valid') logger.error("Cannot add range '%s-%s': %s", format_cp(first_cp), format_cp(last_cp), exc) if not self.force_mode: raise child.clear() self.rfc7940_checks.tested('char_ascending_order') self.rfc7940_checks.tested('char_strict_ascending_order') def _process_rules(self, elem): """ Process the <rules> element of an LGR XML file. """ # Keep "text" version of the rules since we don't do anything with them. for child in elem: if child.tag in COMBINATOR_TAGS + (CLASS_TAG, ): cls = self._parse_class(child) self._lgr.add_class(cls, force=self.force_mode) child = drop_ns(child) self._lgr.classes_xml.append( etree.tostring(child, encoding=text_type)) elif child.tag == RULE_TAG: rule = self._parse_rule(child) self._lgr.add_rule(rule, force=self.force_mode) child = drop_ns(child) self._lgr.rules_xml.append( etree.tostring(child, encoding=text_type)) elif child.tag == ACTION_TAG: action = self._parse_action(child) self._lgr.add_action(action, force=self.force_mode) child = drop_ns(child) self._lgr.actions_xml.append( etree.tostring(child, encoding=text_type)) else: logger.warning("Unhandled '%s' element in <rules> section", child.tag) self.rfc7940_checks.error("parse_xml") child.clear() def _parse_rule(self, elem): """ Parse a <rule> element. :return: The rule object created. """ rule = Rule(name=elem.get('name', None), comment=elem.get('comment', None), ref=string_to_list(elem.get('ref', '')), by_ref=elem.get('by-ref', None)) for child in elem: self._parse_rule_helper(child, rule) return rule def _parse_rule_helper(self, child, rule): """ Helper to parse the content of a <rule> element. This function is to be called on children of a top-level <rule>. :param child: Child element of a top-level <rule> element. :param rule: The top-level rule element to add the content to. """ tag = child.tag comment = child.get('comment', None) count = child.get('count', None) if tag == ANCHOR_TAG: rule.add_child(AnchorMatcher(comment=comment)) elif tag == ANY_TAG: rule.add_child(AnyMatcher(comment=comment, count=count)) elif tag == CHAR_TAG: rule.add_child( CharMatcher(cp_or_sequence_from_class(child), comment=comment, count=count)) elif tag == CHOICE_TAG: choice = ChoiceMatcher(comment=comment, count=count) for matcher in child: self._parse_rule_helper(matcher, choice) rule.add_child(choice) elif tag == END_TAG: rule.add_child(EndMatcher(comment=comment)) elif tag == LOOKAHEAD_TAG: look_ahead = LookAheadMatcher(comment=comment) for matcher in child: self._parse_rule_helper(matcher, look_ahead) rule.add_child(look_ahead) elif tag == LOOKBEHIND_TAG: look_behind = LookBehindMatcher(comment=comment) for matcher in child: self._parse_rule_helper(matcher, look_behind) rule.add_child(look_behind) elif tag == START_TAG: rule.add_child(StartMatcher(comment=comment)) elif tag == RULE_TAG: child_rule = self._parse_rule(child) rule.add_child( RuleMatcher(child_rule, comment=comment, count=count)) elif tag == CLASS_TAG or tag in COMBINATOR_TAGS: rule.add_child( ClassMatcher(self._parse_class(child), comment=comment, count=count)) else: logger.warning("Unhandled '%s' element in <rule> object", tag) self.rfc7940_checks.error('parse_xml') def _parse_action(self, elem): """ Parse an <action> element. :return: The action object created. """ disp = elem.get('disp') comment = elem.get('comment', None) match = elem.get('match', None) not_match = elem.get('not-match', None) any_variant = string_to_list(elem.get('any-variant', '')) all_variants = string_to_list(elem.get('all-variants', '')) only_variants = string_to_list(elem.get('only-variants', '')) return Action(disp, comment=comment, ref=string_to_list(elem.get('ref', '')), match=match, not_match=not_match, any_variant=any_variant, all_variants=all_variants, only_variants=only_variants) def _parse_class(self, elem): """ Parse an <class> element. :return: The Class object created. """ tag = elem.tag name = elem.get('name', None) comment = elem.get('comment', None) if tag == CLASS_TAG: cls = Class(name=name, comment=comment, ref=string_to_list(elem.get('ref', '')), from_tag=elem.get('from-tag', None), unicode_property=elem.get('property', None), by_ref=elem.get('by-ref', None)) if len(elem) == 0 and elem.text: # No child, code point(s) defined in text cls.add_codepoint(cp_or_sequence_from_class(elem)) for child in elem: cls.add_codepoint(cp_or_sequence_from_class(child)) elif tag in COMBINATOR_TAGS: MAPPING = { UNION_TAG: UnionClass, COMPLEMENT_TAG: ComplementClass, INTERSECTION_TAG: IntersectionClass, DIFFERENCE_TAG: DifferenceClass, SYM_DIFFERENCE_TAG: SymmetricDifferenceClass } cls = MAPPING[tag](name=name, comment=comment) # TODO: ensure number of children for child in elem: cls.add_child(self._parse_class(child)) else: logger.warning("Unhandled '%s' element in <class> object", tag) self.rfc7940_checks.error('parse_xml') return cls def _fast_iter(self, context): """ Iterator used to incrementally parse the XML file. """ metadata_added = False for _, elem in context: if not metadata_added and elem == DATA_TAG: # The optional "meta" element is not present since it must # preceed the required data element. # However, we still have to call _process_meta self._process_meta({}) metadata_added = True if elem.tag == META_TAG: logger.debug("Got 'meta' element") self._process_meta(elem) elif elem.tag == DATA_TAG: logger.debug("Got 'data' element") self._process_data(elem) elif elem.tag == RULES_TAG: logger.debug("Got 'rules' element") self._process_rules(elem) else: continue # Clean-up memory elem.clear() del context