Exemplo n.º 1
0
 def setUp(self):
     """Create an empty LGR and capture 'lgr.validate' log records in memory."""
     self.lgr = LGR()
     # Everything emitted on the validation logger is written to
     # self.log_output so that tests can inspect it.
     self.log_output = StringIO()
     handler = logging.StreamHandler(self.log_output)
     handler.setLevel(logging.DEBUG)
     validation_logger = logging.getLogger('lgr.validate')
     validation_logger.addHandler(handler)
Exemplo n.º 2
0
    def _process_meta(self, elem):
        """
        Handle the <meta> element of an LGR XML file and create the LGR.

        Every child of the element is dispatched on its tag to populate a
        new Metadata object and a new ReferenceManager. A tag that is not
        recognised is logged and reported as a 'parse_xml' error to the
        RFC 7940 checks. Once all children have been consumed, the result of
        the 'explicit_unicode_version' test is recorded and the LGR object
        is stored in self._lgr.

        :param elem: The <meta> XML element.
        """
        metadata = Metadata(self.rfc7940_checks)
        reference_manager = ReferenceManager()
        # Tags whose text content is handed as-is to a Metadata setter
        text_setters = {
            DATE_TAG: metadata.set_date,
            VALIDITY_START_TAG: metadata.set_validity_start,
            VALIDITY_END_TAG: metadata.set_validity_end,
            UNICODE_VERSION_TAG: metadata.set_unicode_version,
        }
        saw_unicode_version = False
        for child in elem:
            tag = child.tag
            logger.debug("Got '%s' element", tag)
            setter = text_setters.get(tag)
            if setter is not None:
                setter(child.text, force=self.force_mode)
                if tag == UNICODE_VERSION_TAG:
                    saw_unicode_version = True
            elif tag == VERSION_TAG:
                version_comment = child.get('comment', None)
                metadata.version = Version(child.text, version_comment)
            elif tag == LANGUAGE_TAG:
                metadata.add_language(child.text, force=self.force_mode)
            elif tag == SCOPE_TAG:
                scope = Scope(child.text, child.get('type', None))
                metadata.scopes.append(scope)
            elif tag == DESCRIPTION_TAG:
                # Seems to be an issue with CDATA/iterparse: https://bugs.launchpad.net/lxml/+bug/1788449
                # For now, manually replace CRLF with LF
                normalized_text = child.text.replace('\r\n', '\n')
                metadata.description = Description(normalized_text,
                                                   child.get('type', None))
            elif tag == REFERENCES_TAG:
                for ref_elem in child:
                    # The id is kept as found: it may be a string, so it is
                    # not converted to an int.
                    reference_manager.add_reference(
                        ref_elem.text,
                        comment=ref_elem.get('comment', None),
                        ref_id=ref_elem.get('id'))
                # The <reference> elements have been consumed, release them
                child.clear()
            else:
                logger.warning("Unhandled '%s' element in <meta> section", tag)
                self.rfc7940_checks.error('parse_xml')
            # Release the processed child whatever its tag was
            child.clear()

        self.rfc7940_checks.add_test_result('explicit_unicode_version',
                                            saw_unicode_version)
        self._lgr = LGR(name=self.filename,
                        metadata=metadata,
                        reference_manager=reference_manager,
                        unicode_database=self._unicode_database)
Exemplo n.º 3
0
 def test_add_range_validation_override(self):
     """
     A range that is not in the validating repertoire is still added
     when override_repertoire is set.
     """
     reference_lgr = LGR()
     for code_point in range(0x0061, 0x007A):
         reference_lgr.add_cp(code_point)
     first_cp, last_cp = 0x0031, 0x0032
     self.lgr.add_range(first_cp,
                        last_cp,
                        validating_repertoire=reference_lgr,
                        override_repertoire=True)
     self.assertIn(first_cp, self.lgr.repertoire)
Exemplo n.º 4
0
 def test_add_cp_validation_override(self):
     """
     A code point missing from the validating repertoire can still be
     added when override_repertoire is set.
     """
     reference_lgr = LGR()
     reference_lgr.add_cp([0x0061])
     # (code point, override flag): only U+0061 is in the validating repertoire
     for code_point, override in ((0x0061, False), (0x0062, True)):
         self.lgr.add_cp([code_point],
                         validating_repertoire=reference_lgr,
                         override_repertoire=override)
     self.assertIn(0x0062, self.lgr.repertoire)
Exemplo n.º 5
0
 def test_add_cp_validation(self):
     """
     Without override, adding a code point that is missing from the
     validating repertoire raises NotInRepertoire.
     """
     reference_lgr = LGR()
     reference_lgr.add_cp([0x0061])
     # U+0061 is part of the validating repertoire: accepted
     self.lgr.add_cp([0x0061],
                     validating_repertoire=reference_lgr,
                     override_repertoire=False)
     # U+0062 is not: rejected
     with self.assertRaises(NotInRepertoire):
         self.lgr.add_cp([0x0062],
                         validating_repertoire=reference_lgr,
                         override_repertoire=False)
Exemplo n.º 6
0
 def create(cls, name, unicode_version, validating_repertoire_name):
     """
     Build a new instance of cls wrapping a freshly created LGR.

     The LGR gets version '1' metadata, the given Unicode version and the
     Unicode database matching that version.

     :param name: Name given to both the LGR and the returned object.
     :param unicode_version: Unicode version set on the metadata and used
                             to select the Unicode database.
     :param validating_repertoire_name: Name of the validating repertoire
                                        to look up; a falsy value means
                                        no validating repertoire.
     :return: The new cls instance.
     """
     metadata = Metadata()
     metadata.version = Version('1')
     metadata.set_unicode_version(unicode_version)

     lgr = LGR(name, metadata=metadata)
     lgr.unicode_database = unidb.manager.get_db_by_version(unicode_version)

     if validating_repertoire_name:
         validating_repertoire = get_by_name(validating_repertoire_name)
     else:
         validating_repertoire = None

     return cls(name, lgr=lgr, validating_repertoire=validating_repertoire)
Exemplo n.º 7
0
    def parse_document(self):
        """
        Parse self.source into a new LGR named after self.filename.

        The source is used directly when it has a "read" attribute;
        otherwise it is treated as a path and opened as UTF-8 text.

        :return: The LGR filled by the parsing.
        """
        self._lgr = LGR(name=self.filename)

        logger.debug('Start parsing of file: %s', self.filename)

        if not hasattr(self.source, "read"):
            # Not a file-like object: open it ourselves
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)
        else:
            self._parse_doc(self.source)

        return self._lgr
Exemplo n.º 8
0
    def test_del_cp_validation_override(self):
        """
        A variant that is not in the validating repertoire is still added
        when override_repertoire is set.
        """
        reference_lgr = LGR()
        for code_point in (0x0061, 0x0030):
            reference_lgr.add_cp([code_point])

        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])

        # U+0062 is not part of the validating repertoire
        self.lgr.add_variant([0x0061], [0x0062],
                             validating_repertoire=reference_lgr,
                             override_repertoire=True)

        base_char = self.lgr.repertoire[0x0061]
        self.assertIn((0x0062, ), base_char._variants)
Exemplo n.º 9
0
 def test_add_range_validation_with_range(self):
     """
     Range validation against a validating repertoire that itself
     contains a range: a covered range is accepted, an uncovered one
     raises NotInRepertoire.
     """
     reference_lgr = LGR()
     reference_lgr.add_range(0x0061, 0x007A)

     # Same bounds as the validating range: accepted
     self.lgr.add_range(0x0061,
                        0x007A,
                        validating_repertoire=reference_lgr,
                        override_repertoire=False)

     # Outside the validating range: rejected
     with self.assertRaises(NotInRepertoire):
         self.lgr.add_range(0x00F8,
                            0x00FF,
                            validating_repertoire=reference_lgr,
                            override_repertoire=False)
Exemplo n.º 10
0
class LineParser(LGRParser):
    """
    Parser for a line-oriented text source.

    Blank lines and lines starting with '#' are ignored. All code points
    matched by UNICODE_CODEPOINT_RE on a line are added to the LGR with a
    single add_cp() call.
    """

    def unicode_version(self):
        """Return an empty string: no Unicode version is defined for now."""
        return ""

    def validate_document(self, schema):
        """
        Report the document as valid without checking it.

        :param schema: Unused, no validation is performed for now.
        :return: Always True.
        """
        return True

    def parse_document(self):
        """
        Parse self.source into a new LGR named after self.filename.

        The source is used directly when it has a "read" attribute;
        otherwise it is treated as a path and opened as UTF-8 text.

        :return: The LGR filled by the parsing.
        """
        self._lgr = LGR(name=self.filename)

        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        Errors are logged with the 1-based line number and do not stop the
        parsing.

        :param rule_file: Content of the rule, as a file-like object.
        """
        for line_num, line in enumerate(rule_file, 1):
            line = line.strip()
            # Skip blank lines and comments
            if not line or line[0] == '#':
                continue

            codepoints = []
            for match in UNICODE_CODEPOINT_RE.finditer(line):
                cp_text = match.group(1)
                try:
                    codepoints.append(int(cp_text, 16))
                except ValueError:
                    # Log the offending text, not the regex match object
                    logger.error("Invalid code point '%s' at line %d",
                                 cp_text, line_num)

            try:
                self._lgr.add_cp(codepoints)
            except LGRException as exc:
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)
Exemplo n.º 11
0
 def test_add_range_validation(self):
     """
     A range is accepted when all its code points are in the validating
     repertoire and raises NotInRepertoire otherwise.
     """
     reference_lgr = LGR()
     # Validating repertoire holds U+0061..U+007A as individual code points
     for code_point in range(0x0061, 0x007A + 1):
         reference_lgr.add_cp(code_point)

     self.lgr.add_range(0x0061,
                        0x007A,
                        validating_repertoire=reference_lgr,
                        override_repertoire=False)

     with self.assertRaises(NotInRepertoire):
         self.lgr.add_range(0x00F8,
                            0x00FF,
                            validating_repertoire=reference_lgr,
                            override_repertoire=False)
Exemplo n.º 12
0
 def test_lgr_validating_repertoire(self):
     """
     The rebuild description mentions the validating repertoire and the
     repertoire report of the test LGR is empty.
     """
     validating_repertoire = LGR(name='validating')
     options = {'validating_repertoire': validating_repertoire}
     __, result = rebuild_lgr(self.lgr, options)

     description_template = ("Rebuilding LGR with Unicode version {} "
                             "and validating repertoire '{}'")
     expected = {
         'description': description_template.format(
             self.DEFAULT_UNICODE_VERSION, validating_repertoire),
         'repertoire': {},
     }
     self.assertDictEqual(result, expected)
Exemplo n.º 13
0
def intersect_lgrs(lgr1, lgr2):
    """
    Build a new LGR holding what two LGRs have in common.

    Metadata, references, code points, actions, rules and classes are
    intersected one after the other using the dedicated intersect_*
    helpers.

    Note: expand_ranges() is called on both inputs, so they are modified
    in place.

    :param lgr1: First LGR.
    :param lgr2: Second LGR.
    :return: New LGR: intersection of two inputs.
    """
    name = 'Intersection of %s and %s' % (lgr1.name, lgr2.name)

    lgr1.expand_ranges()
    lgr2.expand_ranges()

    # Elements that could still reference objects of the inputs are
    # deep-copied before being attached to the new LGR.
    common_metadata = intersect_metadata(lgr1.metadata, lgr2.metadata)
    lgr = LGR(name=name, metadata=copy.deepcopy(common_metadata))

    # References are new objects: no copy required
    lgr.reference_manager = intersect_reference_manager(
        lgr1.reference_manager, lgr2.reference_manager)

    first_cps = {char.cp for char in lgr1.repertoire}
    second_cps = {char.cp for char in lgr2.repertoire}

    # Chars created by intersect_char are new objects: no copy required
    for cp in first_cps.intersection(second_cps):
        intersect_char(lgr, lgr1.get_char(cp), lgr2.get_char(cp))

    common_actions, common_actions_xml = intersect_actions(lgr1, lgr2)
    lgr.actions = copy.deepcopy(common_actions)
    lgr.actions_xml = common_actions_xml

    common_rules, common_rules_xml = intersect_rules(lgr1, lgr2)
    lgr.rules = copy.deepcopy(common_rules)
    lgr.rules_xml = common_rules_xml

    common_classes, common_classes_xml = intersect_classes(lgr1, lgr2)
    lgr.classes = copy.deepcopy(common_classes)
    lgr.classes_xml = common_classes_xml

    return lgr
Exemplo n.º 14
0
class TestXmlValidity(unittest.TestCase):
    """Tests of check_xml_validity()."""

    def setUp(self):
        """Start every test from an empty LGR."""
        self.lgr = LGR()

    def test_no_validation(self):
        """Without options the check succeeds and reports nothing."""
        success, result = check_xml_validity(self.lgr, {})
        self.assertTrue(success)
        self.assertDictEqual(result, {})

    def test_invalid_xml_lgr(self):
        """An LGR failing the RNG validation is reported as invalid."""
        self.lgr.add_cp(0x0061, when='#when')
        rng_filepath = os.path.join(RESOURCE_DIR, 'lgr.rng')
        success, result = check_xml_validity(self.lgr,
                                             {'rng_filepath': rng_filepath})

        self.assertIn('validation_result', result)
        # The validation output itself is not checked, only its presence
        validation_result = result['validation_result']
        self.assertFalse(success)

        expected = {
            'description': 'Testing XML validity using RNG',
            'rng_result': False,
            'validation_result': validation_result,
        }
        self.assertDictEqual(result, expected)
Exemplo n.º 15
0
class TestConditionalVariants(unittest.TestCase):
    """Tests of check_conditional_variants(), based on what it logs."""

    def setUp(self):
        """Create an empty LGR and capture 'lgr.validate' logs in memory."""
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.DEBUG)
        validation_logger = logging.getLogger('lgr.validate')
        validation_logger.addHandler(ch)
        # The logger is shared by the whole process: detach the handler once
        # the test is over so that handlers do not pile up test after test.
        self.addCleanup(validation_logger.removeHandler, ch)

    def test_empty_lgr(self):
        """Nothing is logged for an empty LGR."""
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_variants(self):
        """Nothing is logged when code points have no variant."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_rule(self):
        """A "when" attribute naming an unknown rule is logged."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when="when-rule")
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertGreater(len(log_content), 0)
        self.assertEqual(log_content,
                         "CP U+0061: Variant 'U+0062' \"when\" attribute "
                         "'when-rule' is not an existing rule name.\n")

    def test_conditional_ok(self):
        """Nothing is logged when the "when" attribute names a known rule."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when="when-rule")
        self.lgr.rules.append("when-rule")
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
Exemplo n.º 16
0
    def test_merge_chars(self):
        """
        Merge the chars of the two test LGRs: check the state after the
        first merge, then delegate the check of the complete merge to
        _test_merged_chars().
        """
        merged_lgr = LGR()

        # Need to merge references first - OK since tested in previous test
        reference_mapping = {}
        merge_references(self.lgr_1, 'fr', merged_lgr, reference_mapping)
        merge_references(self.lgr_2, 'und-Khmer', merged_lgr, reference_mapping)

        merge_chars(self.lgr_1, 'fr', merged_lgr, reference_mapping, [])

        # Simple variant changed to blocked
        cp = merged_lgr.get_char(0x0041)
        self.assertIn('1', cp.references)

        variants = list(cp.get_variants())
        self.assertEqual(len(variants), 1)
        var = variants[0]
        # No trailing comma here: it would turn the statement into a tuple
        self.assertEqual(var.cp, (0x0061, ))
        self.assertEqual(var.type, 'blocked')

        # Complete merge
        merge_chars(self.lgr_2, 'und-Khmer', merged_lgr, reference_mapping, [])

        self._test_merged_chars(merged_lgr)
Exemplo n.º 17
0
    def test_merge_actions(self):
        """
        An action matching a rule is merged with a script-prefixed rule
        name, whereas a default action is left out of the merge.
        """
        merged_lgr = LGR()

        def assert_single_fr_action():
            # The merged LGR only holds the prefixed 'rule-name' action
            self.assertEqual(len(merged_lgr.actions), 1)
            self.assertEqual(len(merged_lgr.actions_xml), 1)
            self.assertEqual(merged_lgr.actions[0].match, 'fr-rule-name')

        rule_lgr = LGR()
        rule_lgr.add_action(Action(match='rule-name', disp='invalid'))
        rule_lgr.actions_xml.append("""<action disp="invalid" match="rule-name"/>""")

        merge_actions(rule_lgr, 'fr', merged_lgr, {})
        assert_single_fr_action()

        # Default action should not be merged
        default_lgr = LGR()
        default_action = Action(disp='invalid',
                                comment="Default action for invalid",
                                any_variant=['invalid'])
        default_lgr.add_action(default_action)
        default_lgr.actions_xml.append("""<action disp="invalid" match="rule-name"/>""")

        merge_actions(default_lgr, 'fr', merged_lgr, {})
        assert_single_fr_action()
Exemplo n.º 18
0
    def test_merge_references(self):
        """
        Merging references records one mapping per script: empty for the
        first LGR, renumbered ids for the second one.
        """
        merged_lgr = LGR()
        mapping = {}

        # First LGR: no reference id has to be remapped
        merge_references(self.lgr_1, 'fr', merged_lgr, mapping)

        self.assertEqual(len(mapping), 1)
        self.assertIn('fr', mapping)
        self.assertEqual(mapping['fr'], {})

        # Second LGR: its ids are mapped to new ones
        merge_references(self.lgr_2, 'und-Khmer', merged_lgr, mapping)

        expected_khmer_mapping = {
            '0': '3',
            '1': '4',  # Generated
        }
        self.assertEqual(len(mapping), 2)
        self.assertIn('und-Khmer', mapping)
        self.assertEqual(mapping['und-Khmer'], expected_khmer_mapping)
Exemplo n.º 19
0
def merge_lgr_set(lgr_set, name):
    """
    Merge all the LGRs of a set into a single LGR.

    Note: lgr_set is sorted in place and expand_ranges() is called on each
    of its LGRs.

    :param lgr_set: The list of LGRs in the set
    :param name: Merged LGR name
    :return: New LGR (merge of LGR set)
    """
    logger.debug("Merge %s", name)

    def script_sort_key(lgr):
        # 'und-' is replaced by 'zzz' in the sort key
        return get_script(lgr).replace('und-', 'zzz')

    # order LGRs
    lgr_set.sort(key=script_sort_key)

    # Collect the distinct Unicode versions, keeping their order of
    # appearance, and warn when the set is not homogeneous.
    unicode_version = OrderedDict.fromkeys(member.metadata.unicode_version
                                           for member in lgr_set)
    if len(unicode_version) > 1:
        logger.warning("Different unicode version in set: %s",
                       unicode_version.keys())

    ref_mapping = {}
    merged_metadata = copy.deepcopy(merge_metadata(lgr_set))
    merged_lgr = LGR(name=name, metadata=merged_metadata)

    # Scripts of the LGRs merged so far, handed to merge_chars
    previous_scripts = []
    for member in lgr_set:
        script = get_script(member)
        member.expand_ranges()

        merge_references(member, script, merged_lgr, ref_mapping)
        merge_chars(member, script, merged_lgr, ref_mapping, previous_scripts)
        merge_actions(member, script, merged_lgr, ref_mapping)
        merge_rules(member, script, merged_lgr, ref_mapping)
        merge_classes(member, script, merged_lgr, ref_mapping)
        previous_scripts.append(script)

    # XXX As the created merged_lgr is not a valid Python LGR object,
    # we have to serialize it/parse it to get a valid object.
    serialized_lgr = BytesIO(serialize_lgr_xml(merged_lgr))
    xml_parser = XMLParser(source=serialized_lgr, filename=name)
    return xml_parser.parse_document()
Exemplo n.º 20
0
def make_idna_repertoire(version):
    """
    Build a repertoire from the IDNA tables and print it.

    The IDNA table registry is fetched and parsed, the code points and
    ranges whose property is PVALID, CONTEXTO or CONTEXTJ are added to a
    new LGR, and the LGR XML serialization is written on stdout.

    Input:
        * version: The unicode version to use.
    """
    from lgr.core import LGR
    from lgr.parser.xml_serializer import serialize_lgr_xml

    lgr = LGR('idna2008-%s' % version)

    idna_url = IDNATABLES_URL.format(version=version)
    logger.debug("Fetching and parsing '%s'", idna_url)
    registry = etree.parse(idna_url)

    # Doubled braces so that the namespace stays wrapped in literal '{}'
    namespace = "{{{0}}}".format(IDNATABLES_NS)

    # Versions up to 6.0.0 use a registry id that embeds the version
    version_numbers = [int(part) for part in version.split('.')]
    if version_numbers <= [6, 0, 0]:
        registry_id = "idna-tables-{}-properties".format(version)
    else:
        registry_id = "idna-tables-properties"

    record_xpath = '{0}registry[@id="{1}"]/{0}record'.format(
        namespace, registry_id)

    allowed_properties = ('PVALID', 'CONTEXTO', 'CONTEXTJ')
    for record in registry.findall(record_xpath):
        codepoint = record.find(CODEPOINT_TAG).text
        prop = record.find(PROPERTY_TAG).text

        if prop in allowed_properties:
            if codepoint.find('-') > 0:
                # Codepoint is a range
                first_cp, last_cp = [int(bound, 16)
                                     for bound in codepoint.split('-')]
                lgr.add_range(first_cp, last_cp)
            else:
                # Single codepoint
                lgr.add_cp(int(codepoint, 16))

    print(serialize_lgr_xml(lgr,
                            pretty_print=True,
                            encoding='unicode',
                            xml_declaration=False))
Exemplo n.º 21
0
def rebuild_lgr(lgr, options):
    """
    Rebuild an LGR with given parameters.

    Every element of the source repertoire is re-inserted into a newly
    created LGR; problems met on the way are logged and collected in the
    returned report. The rebuilt LGR itself is not returned.

    options argument can contain:
        * unicode_version: The target Unicode version to be used
          when rebuilding the LGR. If None is given, use the current one.
        * validating_repertoire: The validating repertoire used
          for checking code points.
        * unidb: Munidata's Unicode database. If None, skip Unicode checks.

    :param LGR lgr: The LGR to rebuild.
    :param dict options: Dictionary of options to the validation function.
    :return: Tuple (True, result). result is a dictionary with:
        * 'description': Text describing the rebuild.
        * 'repertoire': Maps a char of the source LGR to a dictionary that
          may contain 'errors' and 'warnings' lists, and a 'variants'
          dictionary mapping a variant to its list of exceptions.
        * 'generic': Only present when the target Unicode version differs
          from the one of the Unicode database.
    """
    # Local import to prevent import cycles
    from lgr.core import LGR

    unicode_version = options.get('unicode_version',
                                  lgr.metadata.unicode_version)
    validating_repertoire = options.get('validating_repertoire', None)

    description = "Rebuilding LGR with Unicode version {}".format(
        unicode_version)
    if validating_repertoire is not None:
        description += " and validating repertoire '{}'".format(
            validating_repertoire)
    result = {
        'description': description,
        'repertoire': {}  # XXX: Cannot use defaultdict because of django...
    }

    logger.info(
        "Rebuilding LGR '%s' with Unicode version %s "
        "and Validating Repertoire '%s'", lgr, unicode_version,
        validating_repertoire)

    unidb = options.get('unidb', None)
    if unidb is not None:
        unidb_version = unidb.get_unicode_version()
        if unidb_version != unicode_version:
            result['generic'] = "Target Unicode version {} " \
                                "differs from UnicodeDatabase {}".format(unicode_version,
                                                                         unidb_version)
            logger.warning(
                "Target Unicode version %s differs "
                "from UnicodeDatabase %s", unicode_version, unidb_version)

    # For now, simply copy the metadata and references of the source LGR
    target_metadata = copy.deepcopy(lgr.metadata)
    target_metadata.unicode_version = unicode_version
    target_reference_manager = copy.deepcopy(lgr.reference_manager)

    target_lgr = LGR(name=lgr.name,
                     metadata=target_metadata,
                     reference_manager=target_reference_manager,
                     unicode_database=unidb)

    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # Check every code point of the range first: the range is only
            # added when none of them raised an error or a warning.
            range_ok = True
            for cp, status in target_lgr.check_range(char.first_cp,
                                                     char.last_cp,
                                                     validating_repertoire):
                if status is not None:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'errors', []).append(status)
                    range_ok = False
                in_script, _ = lgr.cp_in_script([cp])
                if not in_script:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'warnings', []).append(CharNotInScript(cp))
                    range_ok = False

            if not range_ok:
                continue

            try:
                target_lgr.add_range(
                    char.first_cp,
                    char.last_cp,
                    comment=char.comment,
                    ref=char.references,
                    tag=char.tags,
                    when=char.when,
                    not_when=char.not_when,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=False)
            except LGRException as exc:
                result['repertoire'].setdefault(char,
                                                {}).setdefault('errors',
                                                               []).append(exc)
                logger.error("Cannot add range '%s-%s'",
                             format_cp(char.first_cp), format_cp(char.last_cp))
            # Variants are only processed for non-range chars below
            continue

        in_script, _ = lgr.cp_in_script(char.cp)
        if not in_script:
            # Only a warning: the code point is still inserted
            result['repertoire'].setdefault(char, {}).setdefault(
                'warnings', []).append(CharNotInScript(char.cp))
        # Insert code point
        try:
            target_lgr.add_cp(char.cp,
                              comment=char.comment,
                              ref=char.references,
                              tag=char.tags,
                              when=char.when,
                              not_when=char.not_when,
                              validating_repertoire=validating_repertoire,
                              override_repertoire=False)
        except LGRException as exc:
            result['repertoire'].setdefault(char,
                                            {}).setdefault('errors',
                                                           []).append(exc)
            logger.error("Cannot add code point '%s'", format_cp(char.cp))
            # The error is recorded, then the insertion is forced, except
            # for code points that are not IDNA valid.
            if not isinstance(exc, CharInvalidIdnaProperty
                              ):  # Cannot include non-IDNA valid code points
                target_lgr.add_cp(char.cp,
                                  comment=char.comment,
                                  ref=char.references,
                                  tag=char.tags,
                                  when=char.when,
                                  not_when=char.not_when,
                                  force=True)

        # Create variants
        for var in char.get_variants():
            try:
                target_lgr.add_variant(
                    char.cp,
                    variant_cp=var.cp,
                    variant_type=var.type,
                    when=var.when,
                    not_when=var.not_when,
                    comment=var.comment,
                    ref=var.references,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=True)
            except LGRException as exc:
                result['repertoire'].setdefault(char, {}).setdefault(
                    'variants', {}).setdefault(var, []).append(exc)
                logger.error("Cannot add variant '%s' to code point '%s'",
                             format_cp(var.cp), format_cp(char.cp))
                # Same policy as for code points: record, then force
                if not isinstance(
                        exc, CharInvalidIdnaProperty
                ):  # Cannot include non-IDNA valid code points
                    target_lgr.add_variant(char.cp,
                                           variant_cp=var.cp,
                                           variant_type=var.type,
                                           when=var.when,
                                           not_when=var.not_when,
                                           comment=var.comment,
                                           ref=var.references,
                                           force=True)

    logger.info("Rebuilding LGR '%s' done", lgr)

    return True, result
Exemplo n.º 22
0
class RFC3743Parser(LGRParser):
    """
    Parser for a line-oriented table (class named after RFC 3743).

    Blank lines and lines starting with '#' are ignored. A line may be a
    reference (REFERENCE_RE), a version (VERSION_RE) or a code point
    entry made of up to three ';'-separated fields: the base character,
    its preferred variants and its other variants.
    """

    def unicode_version(self):
        """Return an empty string: no Unicode version is defined in file."""
        return ""

    def validate_document(self, schema=None):
        """
        No validation of document done for now.

        :param schema: Unused.
        :return: Always an empty string.
        """
        # NOTE(review): sibling parsers return True here - confirm that the
        # falsy empty string is what callers expect.
        return ""

    def parse_document(self):
        """
        Parse self.source into a new LGR.

        When no filename is set and the source is a string, the base name
        of the source is used as filename. The source is used directly
        when it has a "read" attribute; otherwise it is treated as a path
        and opened as UTF-8 text.

        :return: The LGR filled by the parsing.
        """
        if not self.filename and isinstance(self.source, str):
            self.filename = os.path.basename(self.source)

        self._lgr = LGR(name=self.filename)

        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        Errors are logged with the 1-based line number and do not stop the
        parsing.

        :param rule_file: Content of the rule, as a file-like object.
        """
        for line_num, line in enumerate(rule_file, 1):
            line = line.strip()
            # Skip blank lines and comments
            if not line or line[0] == '#':
                continue

            reference = REFERENCE_RE.match(line)
            if reference is not None:
                ref_id = reference.group('ref_id')
                value = reference.group('value')
                comment = reference.group('comment')
                try:
                    self._lgr.add_reference(value,
                                            ref_id=ref_id,
                                            comment=comment)
                except LGRException:
                    logger.error("Invalid reference '%s' on line %d", line,
                                 line_num)
                continue

            version = VERSION_RE.match(line)
            if version is not None:
                version_no = version.group('version_no')
                date = version.group('date')
                comment = version.group('comment')

                try:
                    self._lgr.metadata.version = Version(version_no,
                                                         comment=comment)
                    self._lgr.metadata.date = date
                except LGRException:
                    logger.error("Invalid version '%s' on line %d", line,
                                 line_num)
                continue

            if UNICODE_CODEPOINT_RE.match(line) is None:
                logger.debug("Skipping non-parsable line %d:\n%s", line_num,
                             line)
                # Line is not starting with a valid unicode code point, skip
                continue

            # Split base character from variant(s)
            char_variant = line.split(';')
            char = char_variant[0]

            try:
                # Exactly one (code points, references) item is expected for
                # the base character; anything else raises ValueError.
                [(codepoints, references)] = parse_char(char)
                self._lgr.add_cp(codepoints, ref=references)
            except ValueError:
                logger.error("Invalid character '%s' at line %d", char,
                             line_num)
                # No usable base character: skip the variants of this line,
                # otherwise they would be attached to an unbound name or to
                # the code point of a previous line.
                continue
            except LGRException as exc:
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            if len(char_variant) > 1:
                preferred_variants = char_variant[1].strip()
                if preferred_variants and preferred_variants[0] != '#':
                    # From RFC7940, Section 7.3. Recommended Disposition Values:
                    # activated  The resulting string should be activated for use.  (This
                    # is the same as a Preferred Variant [RFC3743].)
                    var_type = "activated"
                    self.insert_variant(line_num, codepoints,
                                        preferred_variants, var_type)

            if len(char_variant) > 2:
                variants = char_variant[2].strip()
                if variants and variants[0] != '#':
                    self.insert_variant(line_num, codepoints, variants)

    def insert_variant(self, line_num, codepoints, var, var_type=None):
        """
        Add the variants described by a field to a code point of the LGR.

        Errors are logged and do not stop the processing of the remaining
        variants.

        :param line_num: Line number used in error messages.
        :param codepoints: Code point (sequence) receiving the variants.
        :param var: Text of the variant field, parsed with parse_char().
        :param var_type: Type given to the variants, if any.
        """
        try:
            variants = parse_char(var)
        except ValueError:
            logger.error("Invalid variant '%s' at line %d", var, line_num)
            return

        for (var_codepoints, references) in variants:
            try:
                self._lgr.add_variant(codepoints,
                                      var_codepoints,
                                      ref=references,
                                      variant_type=var_type)
            except LGRException as exc:
                logger.error(
                    "Cannot add variant '%s' to code point '%s' at line %d: %s",
                    format_cp(var_codepoints), format_cp(codepoints), line_num,
                    exc)
Exemplo n.º 23
0
class RFC4290Parser(LGRParser):
    """
    Parser for line-oriented code point tables.

    Each useful line starts with a Unicode code point, optionally followed
    by ``|`` and a ``:``-separated list of variants. Everything after a
    ``#`` is a comment.
    """

    def unicode_version(self):
        """Return an empty string: no Unicode version is defined in file."""
        return ""

    def validate_document(self, schema=None):
        """Return True unconditionally: no validation is done for now."""
        return True

    def parse_document(self):
        """
        Parse the source into a new LGR.

        The source is either a file-like object (anything with a ``read``
        attribute) or a path that is opened as UTF-8 text.

        :return: The LGR built from the document.
        """
        if not self.filename and isinstance(self.source, str):
            self.filename = os.path.basename(self.source)

        self._lgr = LGR(name=self.filename)

        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        Problems on a line are logged and parsing goes on with the rest.

        :param rule_file: Content of the rule, as a file-like object.
        """
        for line_num, line in enumerate(rule_file, 1):
            line = line.strip()
            if not line or line[0] == '#':
                # Blank line or full-line comment
                continue
            if UNICODE_CODEPOINT_RE.match(line) is None:
                # Line is not starting with a valid unicode code point, skip
                continue

            # Remove comments and split base character from variant(s)
            char_variant = line.split('#')[0].split('|')
            char = char_variant[0]

            # Reset on every line so a failed parse can never leave the
            # previous line's code point (or an unbound name) behind.
            codepoints = None
            try:
                codepoints = parse_char(char)
                self._lgr.add_cp(codepoints)
            except ValueError:
                logger.error("Invalid character '%s' at line %d", char,
                             line_num)
            except LGRException as exc:
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            if codepoints is None:
                # The base character could not be parsed: there is nothing
                # to attach the variants of this line to.
                continue

            # Handle variants, if any
            if len(char_variant) > 1:
                for var in char_variant[1].split(':'):
                    try:
                        var_codepoints = parse_char(var)
                        self._lgr.add_variant(codepoints, var_codepoints)
                    except ValueError:
                        logger.error("Invalid variant '%s' at line %d", var,
                                     line_num)
                    except LGRException as exc:
                        logger.error(
                            "Cannot add variant '%s' to code point '%s' at line %d: %s",
                            format_cp(var_codepoints), format_cp(codepoints),
                            line_num, exc)
Exemplo n.º 24
0
class TestLGRCore(unittest.TestCase):
    """
    Unit tests for the LGR object: adding and deleting code points, ranges
    and variants, tags, references, variant generation and label checks.

    Every test starts from an empty LGR backed by an IDNADatabase('6.3.0').
    """

    def setUp(self):
        unidb = IDNADatabase('6.3.0')
        self.lgr = LGR(unicode_database=unidb)

    # -- Code points ------------------------------------------------------

    def test_add_single_cp_list(self):
        self.lgr.add_cp([0x0061])
        self.assertIn(0x0061, self.lgr.repertoire)

    def test_add_single_cp_int(self):
        self.lgr.add_cp(0x0061)
        self.assertIn(0x0061, self.lgr.repertoire)

    def test_add_cp_sequence(self):
        # A sequence is stored as a whole, not as its individual members
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)

    def test_add_multiple_cp_sequences(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertIn([0x0061, 0x0062, 0x0063], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)
        self.assertNotIn(0x0063, self.lgr.repertoire)

    def test_add_cp_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, [0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, 0x0061)

    def test_add_cp_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_cp, [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_cp_validation_override(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        # 0x0062 is not in the validating repertoire but override is set
        self.lgr.add_cp([0x0062],
                        validating_repertoire=validation_lgr,
                        override_repertoire=True)
        self.assertIn(0x0062, self.lgr.repertoire)

    def test_del_single_cp_list(self):
        self.lgr.add_cp(0x0061)
        self.lgr.del_cp([0x0061])
        self.assertNotIn(0x0061, self.lgr.repertoire)

    def test_del_single_cp_int(self):
        self.lgr.add_cp([0x0061])
        self.lgr.del_cp(0x0061)
        self.assertNotIn(0x0061, self.lgr.repertoire)

    def test_del_cp_sequence(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.del_cp([0x0061, 0x0062])
        self.assertEqual(len(self.lgr.repertoire), 0)

    def test_del_cp_sequence_with_cp(self):
        # Members of a sequence cannot be deleted on their own
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0061)
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0062)
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)

    def test_add_cp_when_not_when(self):
        # Giving both 'when' and 'not_when' is rejected
        self.lgr.add_cp([0x0061], when='w1')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0062], when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0062])

        self.lgr.add_cp([0x0062], not_when='nw2')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0063], when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0063])

    # -- Ranges -----------------------------------------------------------

    def test_add_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIn(cp, self.lgr.repertoire)

    def test_add_range_in_repertoire(self):
        self.lgr.add_range(0x0061, 0x007A)
        self.assertRaises(CharAlreadyExists, self.lgr.add_range, 0x0061,
                          0x007A)

    def test_add_range_validation(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A + 1):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0061,
                           0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range,
                          0x00F8,
                          0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_range_validation_with_range(self):
        # Same as above, but the validating repertoire holds a range
        validation_lgr = LGR()
        validation_lgr.add_range(0x0061, 0x007A)
        self.lgr.add_range(0x0061,
                           0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range,
                          0x00F8,
                          0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_range_validation_override(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0031,
                           0x0032,
                           validating_repertoire=validation_lgr,
                           override_repertoire=True)
        self.assertIn(0x0031, self.lgr.repertoire)

    def test_add_range_when_not_when(self):
        self.lgr.add_range(0x0061, 0x0065, when='w1')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x0066, 0x007A, when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x0066)
        self.assertEqual(the_exception.last_cp, 0x007A)

        self.lgr.add_range(0x0066, 0x007A, not_when='nw2')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x01BD, 0x01C3, when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x01BD)
        self.assertEqual(the_exception.last_cp, 0x01C3)

    def test_expand_ranges(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)
        self.lgr.add_range(0x01BD, 0x01C3)
        for cp in range(0x01BD, 0x01C3 + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        # After expansion every code point is a plain Char
        self.lgr.expand_ranges()
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)
        for cp in range(0x01BD, 0x01C3 + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    def test_expand_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        self.lgr.expand_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    # -- Variants ---------------------------------------------------------

    def test_add_variant_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.assertRaises(VariantAlreadyExists, self.lgr.add_variant, [0x0061],
                          [0x0030])

    def test_add_variant_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])

        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])

        self.assertRaises(NotInRepertoire,
                          self.lgr.add_variant, [0x0061], [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_variant_when_not_when(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], when='w1')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031], when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

        self.lgr.add_variant([0x0061], [0x0030], not_when='nw2')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031], when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

    def test_del_cp_validation_override(self):
        # NOTE: despite its name, this test exercises add_variant() with
        # override_repertoire=True; nothing is deleted here.
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])

        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])

        self.lgr.add_variant([0x0061], [0x0062],
                             validating_repertoire=validation_lgr,
                             override_repertoire=True)
        self.assertIn((0x0062, ), self.lgr.repertoire[0x0061]._variants)

    def test_get_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])

        variants = self.lgr.get_variants([0x0061])
        self.assertIsInstance(variants, types.GeneratorType)

        variant_list = list(variants)

        self.assertEqual(len(variant_list), 1)

    # -- Range checking and bulk insertion --------------------------------

    def test_check_range_no_modification(self):
        self.lgr.check_range(0x0060, 0x007F)

        self.assertEqual(len(self.lgr.repertoire), 0)

    def test_check_range(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x007A])

        codepoints = self.lgr.check_range(0x0060, 0x007F)

        # Each result is indexed as (code point, exception instance or None)
        for result in codepoints:
            cp = result[0]
            prop = result[1]
            if cp == 0x060 or cp >= 0x007B:
                self.assertIsInstance(prop, CharInvalidIdnaProperty)
            elif cp == 0x0061 or cp == 0x007A:
                self.assertIsInstance(prop, CharAlreadyExists)
            else:
                self.assertIsNone(prop)

    def test_add_codepoints(self):
        self.lgr.add_codepoints(list(range(0x0061, 0x007A + 1)) +
                                [0x0107] + [0x0137, 0x0138])

        # Contiguous code points are expected to be grouped into ranges
        expected_output = [
            RangeChar(0x061, 0x0061, 0x007A),
            Char(0x0107),
            RangeChar(0x0137, 0x0137, 0x0138)
        ]

        self.assertEqual(expected_output, list(self.lgr.repertoire))

    # -- Tags -------------------------------------------------------------

    def test_tags_on_codepoint(self):
        self.lgr.add_cp([0x0061], tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0062], tag=['t1', 't1'])

        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    def test_tags_on_codepoint_sequence(self):
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0061, 0x0062], tag=['t1'])

        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.SEQUENCE_NO_TAG)

    def test_tags_on_range(self):
        self.lgr.add_range(0x0061, 0x0062, tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_range(0x0063, 0x0064, tag=['t1', 't1'])

        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    def test_list_types(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], variant_type='BLOCK')
        self.lgr.add_variant([0x0061], [0x0031], variant_type='VALID')
        self.lgr.add_variant([0x0061], [0x0032], variant_type='BLOCK')

        # assertEqual, not the deprecated assertEquals alias (gone in 3.12)
        self.assertEqual(self.lgr.types, {'BLOCK', 'VALID'})

    # -- References -------------------------------------------------------

    def test_del_reference(self):
        ref_id_1 = self.lgr.add_reference("Test - 1")
        ref_id_2 = self.lgr.add_reference("Test - 2")

        self.lgr.add_cp([0x0061], ref=[ref_id_1])
        self.lgr.add_cp([0x0062], ref=[ref_id_1, ref_id_2])

        self.lgr.del_reference(ref_id_1)

        # The reference must also disappear from the code points using it
        self.assertNotIn(ref_id_1, self.lgr.reference_manager)
        self.assertEqual(self.lgr.get_char([0x0061]).references, [])
        self.assertEqual(self.lgr.get_char([0x0062]).references, [ref_id_2])

    def test_add_cp_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_cp([0x0061], ref=[ref_id, ref_id])

        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    def test_add_range_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_range(0x0061, 0x0062, ref=[ref_id, ref_id])

        the_exception = cm.exception
        self.assertEqual(the_exception.cp, 0x0061)

    def test_add_variant_duplicate_reference(self):
        self.lgr.add_cp([0x0061])
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_variant([0x0061], [0x0062], ref=[ref_id, ref_id])

        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    # -- Label variant generation -----------------------------------------

    def test_generate_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])
        self.lgr.add_cp([0x0064])

        self.lgr.add_variant([0x0061], [0x0070], variant_type="type0")
        self.lgr.add_variant([0x0062], [0x0071], variant_type="type1")
        self.lgr.add_variant([0x0062], [0x0072], variant_type="type2")

        # Expected items are (variant label, set of types, boolean flag)
        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([], list(self.lgr._generate_label_variants([0x0063])))
        self.assertEqual(
            [], list(self.lgr._generate_label_variants([0x0063, 0x0064])))
        self.assertEqual(
            {
                ((0x0071, 0x0063), frozenset(['type1']), False),
                ((0x0072, 0x0063), frozenset(['type2']), False),
            },
            set(self.lgr._generate_label_variants([0x0062, 0x0063])))
        self.assertEqual(
            {
                ((0x0061, 0x0062), frozenset(), False),
                ((0x0061, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0071), frozenset(['type0', 'type1']), True),
                ((0x0070, 0x0072), frozenset(['type0', 'type2']), True),
            },
            set(self.lgr._generate_label_variants([0x0061, 0x0062])))
        self.assertEqual(
            {
                ((0x0061, 0x0062, 0x0062), frozenset(), False),
                ((0x0061, 0x0062, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0062, 0x0072), frozenset(['type2']), False),
                ((0x0061, 0x0071, 0x0062), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0072),
                 frozenset(['type1', 'type2']), False),
                ((0x0061, 0x0072, 0x0062), frozenset(['type2']), False),
                ((0x0061, 0x0072, 0x0071),
                 frozenset(['type1', 'type2']), False),
                ((0x0061, 0x0072, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0062, 0x0071),
                 frozenset(['type0', 'type1']), False),
                ((0x0070, 0x0062, 0x0072),
                 frozenset(['type0', 'type2']), False),
                ((0x0070, 0x0071, 0x0062),
                 frozenset(['type0', 'type1']), False),
                ((0x0070, 0x0071, 0x0071),
                 frozenset(['type0', 'type1']), True),
                ((0x0070, 0x0071, 0x0072),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0062),
                 frozenset(['type0', 'type2']), False),
                ((0x0070, 0x0072, 0x0071),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0072),
                 frozenset(['type0', 'type2']), True),
            },
            set(self.lgr._generate_label_variants([0x0061, 0x0062,
                                                   0x0062])))

    def test_generate_variants_reflexive(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])

        # 0x0062 is declared as a variant of itself
        self.lgr.add_variant([0x0062], [0x0062], variant_type="reflexive")
        self.lgr.add_variant([0x0063], [0x0070], variant_type="type")

        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([], list(self.lgr._generate_label_variants([0x0061])))
        self.assertEqual([((0x0062, ), frozenset(['reflexive']), True)],
                         list(self.lgr._generate_label_variants([0x0062])))
        self.assertEqual(
            {
                ((0x0062, 0x0063), frozenset(['reflexive']), False),
                ((0x0062, 0x0070), frozenset(['reflexive', 'type']), True),
            },
            set(self.lgr._generate_label_variants([0x0062, 0x0063])))

    # -- Label eligibility and disposition --------------------------------

    def test_label_simple(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062, 0x0063])
        self.lgr.add_range(0x0064, 0x0068)

        valid_labels = ([0x0061], [0x0062, 0x0063], [0x0064], [0x0068],
                        [0x0061, 0x0064], [0x0061, 0x0062, 0x0063, 0x0064],
                        [0x0062, 0x0063, 0x0068])
        # (label, expected second item, expected third item) of the result
        invalid_labels = (([0x0060], [], [0x0060]), ([0x0069], [], [0x0069]),
                          ([0x0062], [], [0x0062]), ([0x0063], [], [0x0063]),
                          ([0x0061, 0x0062], [0x0061], [0x0062]))

        for label in valid_labels:
            self.assertEqual((True, label, []),
                             self.lgr._test_preliminary_eligibility(label))
        for (label, label_part, not_in_lgr) in invalid_labels:
            self.assertEqual((False, label_part, not_in_lgr),
                             self.lgr._test_preliminary_eligibility(label))

    def test_label_eligibility_multiple_choices(self):
        # 0x0061 exists both alone and as the start of a longer sequence
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.lgr.add_cp([0x0064])

        self.assertEqual(self.lgr._test_preliminary_eligibility([0x0062]),
                         (False, [], [0x0062]))
        self.assertEqual(
            self.lgr._test_preliminary_eligibility(
                [0x0061, 0x0062, 0x0063, 0x0064]),
            (True, [0x0061, 0x0062, 0x0063, 0x0064], []))

    def test_label_delayed_eligibilty(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'block')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'invalid')
        self.lgr.add_cp([0x0063, 0x0064])
        self.lgr.add_variant([0x0063, 0x0064], [0x0063, 0x0064], 'invalid')

        self.assertEqual(self.lgr._test_label_disposition([0x0062]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0063, 0x0064]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0061, 0x0062]),
                         ('invalid', 0))

    def test_label_length(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'disp')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'disp')

        self.assertEqual(PROTOCOL_LABEL_MAX_LENGTH,
                         self.lgr.max_label_length())

        for i in range(80):
            self.lgr.add_variant([0x0062], [0x074D + i], 'disp')

        # 41: mean number of variants per character
        self.assertEqual(int(math.log(MAX_NUMBER_GENERATED_VARIANTS, 41)),
                         self.lgr.max_label_length())
 def setUp(self):
     """Give each test an LGR created with no arguments and a bare 'lgr' root element."""
     self.lgr = LGR()
     self.root = etree.Element('lgr', nsmap=NSMAP)
Exemplo n.º 26
0
 def setUp(self):
     """Give each test an LGR backed by the IDNADatabase for '6.3.0'."""
     self.lgr = LGR(unicode_database=IDNADatabase('6.3.0'))
Exemplo n.º 27
0
class XMLParser(LGRParser):
    # Keep content intact, so do not strip CDATA section
    # (used in the <meta>/<description> element).
    # Do not resolve entities.
    # Skip comment, as we do not care.
    PARSER_OPTIONS = {
        'resolve_entities': False,
        'strip_cdata': False,
        'remove_comments': True
    }

    def __init__(self, *args, **kwargs):
        """
        Create the parser.

        The optional ``force_mode`` keyword (default: True) is consumed
        here; every other argument is forwarded to the parent constructor.
        """
        # pop() reads the option and hides it from the parent class at once
        force_mode = kwargs.pop('force_mode', True)

        super(XMLParser, self).__init__(*args, **kwargs)
        self.force_mode = force_mode
        self.rfc7940_checks = LGRFormatTestResults()
        self.rfc7940_checks = LGRFormatTestResults()

    def validate_document(self, rng_schema_path):
        """
        Validate the XML source against a RelaxNG schema.

        The outcome is also recorded in ``self.rfc7940_checks`` under the
        'schema' test.

        :param rng_schema_path: Path of the RelaxNG schema file.
        :return: None if the document is valid, otherwise the schema error
                 log (or the string "CANNOT VALIDATE XML" when that log is
                 empty).
        """
        # Construct the RelaxNG validator
        schema = etree.RelaxNG(file=rng_schema_path)

        # Parse the XML file
        parser = etree.XMLParser(**self.PARSER_OPTIONS)
        doc = etree.parse(self.source, parser=parser)

        # FD is now potentially at the end of the documents,
        # set it back to start so the source can be parsed again
        if hasattr(self.source, "seek"):
            self.source.seek(0)

        logger.debug("Validating document '%s' with RNG '%s'", self.source,
                     rng_schema_path)

        error_log = None
        if not schema.validate(doc):
            logger.warning("Validation of document '%s' failed", self.source)
            self.rfc7940_checks.error('schema')
            error_log = schema.error_log
            if len(error_log) == 0:
                # Bug in LXML, see https://bugs.launchpad.net/lxml/+bug/1526522
                error_log = "CANNOT VALIDATE XML"

        self.rfc7940_checks.tested('schema')
        return error_log

    def unicode_version(self):
        """
        Read the Unicode version from the document's metadata.

        Only the "meta" element is parsed. The LGR left in ``self._lgr`` by
        that partial parse is dropped again before returning.

        :return: The ``unicode_version`` attribute of the parsed metadata.
        """
        logger.debug("Get unicode version from meta")

        # Restrict the event stream to the "meta" element; comments are
        # skipped through PARSER_OPTIONS.
        meta_events = etree.iterparse(self.source,
                                      tag=META_TAG,
                                      **self.PARSER_OPTIONS)
        self._fast_iter(meta_events)

        version = self._lgr.metadata.unicode_version
        self._lgr = None

        # A file-like source may have been read up to its end: rewind it
        source = self.source
        if hasattr(source, "seek"):
            source.seek(0)

        return version

    def parse_document(self):
        """
        Parse the whole XML source.

        The source is processed with iterparse using PARSER_OPTIONS: CDATA
        sections are kept, entities are not resolved, comments are dropped.

        :return: The content of ``self._lgr`` once parsing is over.
        """
        logger.debug('Start parsing of file: %s', self.filename)

        events = etree.iterparse(self.source, **self.PARSER_OPTIONS)
        self._fast_iter(events)

        # A file-like source may have been read up to its end: rewind it
        source = self.source
        if hasattr(source, "seek"):
            source.seek(0)

        self.rfc7940_checks.tested('parse_xml')
        return self._lgr

    def _process_meta(self, elem):
        """
        Process the <meta> element of an LGR XML file.

        Builds a Metadata object and a ReferenceManager from the children
        of ``elem``, records whether an explicit Unicode version was found
        and stores a new LGR made from them in ``self._lgr``.

        :param elem: The <meta> element.
        """
        metadata = Metadata(self.rfc7940_checks)
        reference_manager = ReferenceManager()
        # Tags whose text is handed as-is to a single metadata setter
        MAPPER = {
            DATE_TAG:
            lambda d: metadata.set_date(d, force=self.force_mode),
            VALIDITY_START_TAG:
            lambda d: metadata.set_validity_start(d, force=self.force_mode),
            VALIDITY_END_TAG:
            lambda d: metadata.set_validity_end(d, force=self.force_mode),
            UNICODE_VERSION_TAG:
            lambda d: metadata.set_unicode_version(d, force=self.force_mode),
        }
        unicode_version_tag_found = False
        for child in elem:
            tag = child.tag
            logger.debug("Got '%s' element", tag)
            if tag in MAPPER:
                MAPPER[tag](child.text)
                if tag == UNICODE_VERSION_TAG:
                    unicode_version_tag_found = True
            elif tag == VERSION_TAG:
                metadata.version = Version(child.text,
                                           child.get('comment', None))
            elif tag == LANGUAGE_TAG:
                metadata.add_language(child.text, force=self.force_mode)
            elif tag == SCOPE_TAG:
                metadata.scopes.append(
                    Scope(child.text, child.get('type', None)))
            elif tag == DESCRIPTION_TAG:
                # Seems to be an issue with CDATA/iterparse: https://bugs.launchpad.net/lxml/+bug/1788449
                # For now, manually replace CRLF with LF
                description = child.text
                if description is not None:
                    # An empty element has no text (None): nothing to
                    # normalise, and calling replace() on it would fail.
                    description = description.replace('\r\n', '\n')
                metadata.description = Description(description,
                                                   child.get('type', None))
            elif tag == REFERENCES_TAG:
                for reference in child:
                    value = reference.text
                    # Don't convert it to an int since ref_id may be a string
                    ref_id = reference.get('id')
                    comment = reference.get('comment', None)
                    reference_manager.add_reference(value,
                                                    comment=comment,
                                                    ref_id=ref_id)
                # The processed <reference> elements are released by the
                # clear() call shared by all branches below.
            else:
                logger.warning("Unhandled '%s' element in <meta> section", tag)
                self.rfc7940_checks.error('parse_xml')
            # Free the memory held by the element once it has been handled
            child.clear()

        self.rfc7940_checks.add_test_result('explicit_unicode_version',
                                            unicode_version_tag_found)
        self._lgr = LGR(name=self.filename,
                        metadata=metadata,
                        reference_manager=reference_manager,
                        unicode_database=self._unicode_database)

    def _process_data(self, elem):
        """
        Process the <data> element of an LGR XML file.
        """

        # It is RECOMMENDED to list all "char" elements in ascending order of
        # the "cp" attribute. The below variable is used when verifying that.
        previous_codepoint = []

        for child in elem:
            comment = child.get('comment', None)
            when = child.get('when', None)
            not_when = child.get('not-when', None)

            # Handle references
            ref = string_to_list(child.get('ref', ''))

            # Handle tags
            tag = string_to_list(child.get('tag', ''))

            if child.tag == CHAR_TAG:
                codepoint = [int(c, 16) for c in child.get('cp').split()]

                if codepoint <= previous_codepoint:
                    if previous_codepoint[0:len(codepoint)] == codepoint:
                        # Not clear what order is to be recommended here
                        self.rfc7940_checks.error(
                            'char_strict_ascending_order')
                    else:
                        logger.warning(
                            "cp attribute not in ascending order: '%s'",
                            child.get('cp'))
                        self.rfc7940_checks.error('char_ascending_order')
                previous_codepoint = codepoint

                try:
                    self._lgr.add_cp(codepoint,
                                     comment=comment,
                                     ref=ref,
                                     tag=tag,
                                     when=when,
                                     not_when=not_when,
                                     force=self.force_mode)
                except LGRException as exc:
                    logger.error("Cannot add code point '%s': %s",
                                 format_cp(codepoint), exc)
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    if not self.force_mode:
                        raise

                # Variants of char
                for variant in child.iter(VARIANT_TAG):
                    var_codepoint = [
                        int(c, 16) for c in variant.get('cp').split()
                    ]
                    when = variant.get('when', None)
                    not_when = variant.get('not-when', None)
                    variant_type = variant.get('type', None)
                    comment = variant.get('comment', None)

                    # Handle references
                    ref = string_to_list(variant.get('ref', ''))

                    try:
                        self._lgr.add_variant(codepoint,
                                              var_codepoint,
                                              variant_type=variant_type,
                                              when=when,
                                              not_when=not_when,
                                              comment=comment,
                                              ref=ref,
                                              force=self.force_mode)
                    except LGRException as exc:
                        logger.error(
                            "Cannot add variant '%s' "
                            "to code point '%s': %s", format_cp(var_codepoint),
                            format_cp(codepoint), exc)
                        self.rfc7940_checks.error('parse_xml')
                        self.rfc7940_checks.error('codepoint_valid')
                        if not self.force_mode:
                            raise
            elif child.tag == RANGE_TAG:
                first_cp = int(child.get('first-cp'), 16)
                last_cp = int(child.get('last-cp'), 16)

                try:
                    self._lgr.add_range(first_cp,
                                        last_cp,
                                        comment=comment,
                                        ref=ref,
                                        tag=tag,
                                        when=when,
                                        not_when=not_when,
                                        force=self.force_mode)
                except LGRException as exc:
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    logger.error("Cannot add range '%s-%s': %s",
                                 format_cp(first_cp), format_cp(last_cp), exc)
                    if not self.force_mode:
                        raise

            child.clear()

        self.rfc7940_checks.tested('char_ascending_order')
        self.rfc7940_checks.tested('char_strict_ascending_order')

    def _process_rules(self, elem):
        """
        Handle the <rules> section of an LGR XML document.

        Every direct child is parsed into a class, a rule or an action and
        registered on the LGR being built. The namespace-stripped XML text of
        the child is stored next to it (``classes_xml``, ``rules_xml`` or
        ``actions_xml``). Children with any other tag are logged and reported
        as a 'parse_xml' error.

        :param elem: The <rules> element.
        """
        for node in elem:
            node_tag = node.tag

            if node_tag == CLASS_TAG or node_tag in COMBINATOR_TAGS:
                parsed_class = self._parse_class(node)
                self._lgr.add_class(parsed_class, force=self.force_mode)
                xml_store = 'classes_xml'
            elif node_tag == RULE_TAG:
                parsed_rule = self._parse_rule(node)
                self._lgr.add_rule(parsed_rule, force=self.force_mode)
                xml_store = 'rules_xml'
            elif node_tag == ACTION_TAG:
                parsed_action = self._parse_action(node)
                self._lgr.add_action(parsed_action, force=self.force_mode)
                xml_store = 'actions_xml'
            else:
                logger.warning("Unhandled '%s' element in <rules> section",
                               node_tag)
                self.rfc7940_checks.error("parse_xml")
                node.clear()
                continue

            # Keep the textual form of the element, without its namespace.
            stripped = drop_ns(node)
            getattr(self._lgr, xml_store).append(
                etree.tostring(stripped, encoding=text_type))
            # The element returned by drop_ns is the one that gets cleared.
            stripped.clear()

    def _parse_rule(self, elem):
        """
        Build a Rule object from a <rule> element.

        The rule attributes are read from the element, then every child
        element is handed to :meth:`_parse_rule_helper` to be appended to
        the rule.

        :param elem: The <rule> element.
        :return: The rule object created.
        """
        rule_name = elem.get('name')
        rule_comment = elem.get('comment')
        rule_refs = string_to_list(elem.get('ref', ''))
        rule_by_ref = elem.get('by-ref')

        parsed = Rule(name=rule_name,
                      comment=rule_comment,
                      ref=rule_refs,
                      by_ref=rule_by_ref)

        for sub_elem in elem:
            self._parse_rule_helper(sub_elem, parsed)

        return parsed

    def _parse_rule_helper(self, child, rule):
        """
        Turn one child of a <rule> into a matcher and attach it to its parent.

        Container elements (choice, look-ahead, look-behind) are filled
        recursively with their own children before being attached. An
        element with an unknown tag is logged, reported as a 'parse_xml'
        error and not attached.

        :param child: Child element of a <rule> (or of a container matcher).
        :param rule: The object the resulting matcher is added to.
        """
        kind = child.tag
        note = child.get('comment', None)
        repeat = child.get('count', None)

        if kind == ANCHOR_TAG:
            matcher = AnchorMatcher(comment=note)
        elif kind == ANY_TAG:
            matcher = AnyMatcher(comment=note, count=repeat)
        elif kind == CHAR_TAG:
            matcher = CharMatcher(cp_or_sequence_from_class(child),
                                  comment=note,
                                  count=repeat)
        elif kind == CHOICE_TAG:
            matcher = ChoiceMatcher(comment=note, count=repeat)
            for alternative in child:
                self._parse_rule_helper(alternative, matcher)
        elif kind == END_TAG:
            matcher = EndMatcher(comment=note)
        elif kind == LOOKAHEAD_TAG:
            matcher = LookAheadMatcher(comment=note)
            for inner in child:
                self._parse_rule_helper(inner, matcher)
        elif kind == LOOKBEHIND_TAG:
            matcher = LookBehindMatcher(comment=note)
            for inner in child:
                self._parse_rule_helper(inner, matcher)
        elif kind == START_TAG:
            matcher = StartMatcher(comment=note)
        elif kind == RULE_TAG:
            nested_rule = self._parse_rule(child)
            matcher = RuleMatcher(nested_rule, comment=note, count=repeat)
        elif kind == CLASS_TAG or kind in COMBINATOR_TAGS:
            matcher = ClassMatcher(self._parse_class(child),
                                   comment=note,
                                   count=repeat)
        else:
            logger.warning("Unhandled '%s' element in <rule> object", kind)
            self.rfc7940_checks.error('parse_xml')
            return

        rule.add_child(matcher)

    def _parse_action(self, elem):
        """
        Build an Action object from an <action> element.

        The ``any-variant``, ``all-variants``, ``only-variants`` and ``ref``
        attributes are converted with ``string_to_list``; the other
        attributes are passed through as read.

        :param elem: The <action> element.
        :return: The action object created.
        """
        disposition = elem.get('disp')
        note = elem.get('comment', None)
        match_rule = elem.get('match', None)
        not_match_rule = elem.get('not-match', None)

        # Keyword name -> XML attribute name for the variant-set triggers.
        variant_sets = {
            keyword: string_to_list(elem.get(attribute, ''))
            for keyword, attribute in (('any_variant', 'any-variant'),
                                       ('all_variants', 'all-variants'),
                                       ('only_variants', 'only-variants'))
        }

        return Action(disposition,
                      comment=note,
                      ref=string_to_list(elem.get('ref', '')),
                      match=match_rule,
                      not_match=not_match_rule,
                      **variant_sets)

    def _parse_class(self, elem):
        """
        Parse a <class> element or a set-operator (combinator) element.

        A <class> element gets its code points either from its text (when it
        has no child) or from its children. A combinator element is built
        recursively from its children.

        An element that is neither a <class> nor a combinator is logged and
        reported as a 'parse_xml' error; None is returned for it. Inside a
        combinator such children are skipped, like unknown children of a
        <rule> are.

        :param elem: The element to parse.
        :return: The Class object created, or None for an unhandled element.
        """
        tag = elem.tag
        name = elem.get('name', None)
        comment = elem.get('comment', None)

        if tag == CLASS_TAG:
            cls = Class(name=name,
                        comment=comment,
                        ref=string_to_list(elem.get('ref', '')),
                        from_tag=elem.get('from-tag', None),
                        unicode_property=elem.get('property', None),
                        by_ref=elem.get('by-ref', None))
            if len(elem) == 0 and elem.text:
                # No child, code point(s) defined in text
                cls.add_codepoint(cp_or_sequence_from_class(elem))
            for child in elem:
                cls.add_codepoint(cp_or_sequence_from_class(child))
            return cls

        if tag in COMBINATOR_TAGS:
            MAPPING = {
                UNION_TAG: UnionClass,
                COMPLEMENT_TAG: ComplementClass,
                INTERSECTION_TAG: IntersectionClass,
                DIFFERENCE_TAG: DifferenceClass,
                SYM_DIFFERENCE_TAG: SymmetricDifferenceClass
            }
            cls = MAPPING[tag](name=name, comment=comment)
            # TODO: ensure number of children
            for child in elem:
                child_cls = self._parse_class(child)
                # An unhandled child has already been logged and reported;
                # do not add a None operand to the combinator.
                if child_cls is not None:
                    cls.add_child(child_cls)
            return cls

        # Previously this path fell through to `return cls` with `cls`
        # unbound, raising UnboundLocalError instead of reporting the error.
        logger.warning("Unhandled '%s' element in <class> object", tag)
        self.rfc7940_checks.error('parse_xml')
        return None

    def _fast_iter(self, context):
        """
        Iterator used to incrementally parse the XML file.

        Dispatches the <meta>, <data> and <rules> elements to their
        processing methods and clears each processed element to free memory.
        Elements with any other tag are ignored.

        If <data> is reached and no <meta> element has been processed,
        :meth:`_process_meta` is first called with an empty container so
        that the metadata processing still takes place.

        :param context: Iterable of (event, element) pairs.
        """
        metadata_added = False
        for _, elem in context:
            tag = elem.tag
            if not metadata_added and tag == DATA_TAG:
                # The optional "meta" element is not present since it must
                # precede the required data element.
                # However, we still have to call _process_meta
                self._process_meta({})
                metadata_added = True

            if tag == META_TAG:
                logger.debug("Got 'meta' element")
                self._process_meta(elem)
                # Remember it so that <data> does not trigger the fallback.
                metadata_added = True
            elif tag == DATA_TAG:
                logger.debug("Got 'data' element")
                self._process_data(elem)
            elif tag == RULES_TAG:
                logger.debug("Got 'rules' element")
                self._process_rules(elem)
            else:
                continue
            # Clean-up memory
            elem.clear()
        del context
Exemplo n.º 28
0
def union_lgrs(lgr1, lgr2):
    """
    Build a new LGR that is the union of two LGRs.

    ``expand_ranges()`` is called on both inputs before their repertoires
    are read. Code points present in both inputs are combined with
    ``union_char``; code points present in only one input are added with
    their comment, tags and when/not-when rules (their references are not
    forwarded).

    :param lgr1: First LGR.
    :param lgr2: Second LGR.
    :return: New LGR: union of two inputs.
    """
    name = 'Union of %s and %s' % (lgr1.name, lgr2.name)

    logger.debug("Union of %s", name)

    for source in (lgr1, lgr2):
        source.expand_ranges()

    # Deep copies are taken where the result could otherwise keep
    # references to objects of the input LGRs.
    merged_metadata = copy.deepcopy(
        union_metadata(lgr1.metadata, lgr2.metadata))
    merged = LGR(name=name, metadata=merged_metadata)

    # The reference manager is a new object: no copy needed.
    merged.reference_manager = union_reference_manager(
        lgr1.reference_manager, lgr2.reference_manager)

    first_cps = set(c.cp for c in lgr1.repertoire)
    second_cps = set(c.cp for c in lgr2.repertoire)

    # Code points defined in both LGRs: merge the two definitions.
    for cp in first_cps.intersection(second_cps):
        union_char(merged, lgr1.get_char(cp), lgr2.get_char(cp))

    # Code points defined in a single LGR: take them from their owner.
    exclusive = ((lgr1, first_cps.difference(second_cps)),
                 (lgr2, second_cps.difference(first_cps)))
    for owner, owned_cps in exclusive:
        for cp in owned_cps:
            char = owner.get_char(cp)
            # NOTE: char.references is deliberately not passed as `ref`.
            merged.add_cp(char.cp,
                          comment=char.comment,
                          tag=char.tags,
                          when=char.when,
                          not_when=char.not_when)

    # Actions, rules and classes: deep-copied objects plus their XML text.
    for attribute, combine in (('actions', union_actions),
                               ('rules', union_rules),
                               ('classes', union_classes)):
        objects, objects_xml = combine(lgr1, lgr2)
        setattr(merged, attribute, copy.deepcopy(objects))
        setattr(merged, attribute + '_xml', objects_xml)

    return merged
Exemplo n.º 29
0
class TestPopulate(unittest.TestCase):
    """Tests of populate_lgr, checked through its log output and the
    variants found on the LGR afterwards."""

    def setUp(self):
        """Create an empty LGR and capture 'lgr.populate' logs in a buffer."""
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.INFO)
        logger = logging.getLogger('lgr.populate')
        logger.addHandler(ch)
        # Detach the handler once the test is over; otherwise every test
        # leaves one more handler attached to the shared logger.
        self.addCleanup(logger.removeHandler, ch)
        logger.setLevel(logging.INFO)

    def test_no_symmetric_in_repertoire(self):
        """A variant missing from the repertoire is added, with the
        symmetric variant."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add missing code point 'U+0062' in LGR as it is a variant of 'U+0061'\n"
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n",
            log_content)
        self.assertIn(0x0062, self.lgr.repertoire)
        new_variant = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, )],
                         [c.cp for c in new_variant.get_variants()])

    def test_no_symmetric_in_repertoire_twice(self):
        """A code point that is a variant of two others is added once and
        gets both symmetric variants."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_variant([0x0061], [0x0063])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0061])
        self.lgr.add_variant([0x0062], [0x0063])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add missing code point 'U+0063' in LGR as it is a variant of 'U+0061'\n"
            "Add code point 'U+0061' as variant of 'U+0063' for symmetry\n"
            "Add code point 'U+0062' as variant of 'U+0063' for symmetry\n",
            log_content)
        self.assertIn(0x0063, self.lgr.repertoire)
        new_variant = self.lgr.get_char([0x0063])
        self.assertEqual([(0x0061, ), (0x0062, )],
                         [c.cp for c in new_variant.get_variants()])

    def test_no_symmetric_in_variants(self):
        """A variant already in the repertoire gets the missing symmetric
        variant."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n",
            log_content)
        cp = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, )], [c.cp for c in cp.get_variants()])

    def test_no_transitivity(self):
        """A chain a -> b -> c is completed for symmetry and transitivity."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0063])
        self.lgr.add_cp([0x0063])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n"
            "Add code point 'U+0062' as variant of 'U+0063' for symmetry\n"
            "Add code point 'U+0063' as variant of 'U+0061' for transitivity with 'U+0062'\n"
            "Add code point 'U+0061' as variant of 'U+0063' for transitivity with 'U+0062'\n",
            log_content)
        cp = self.lgr.get_char([0x0061])
        self.assertEqual([(0x0062, ), (0x0063, )],
                         [c.cp for c in cp.get_variants()])
        cp = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, ), (0x0063, )],
                         [c.cp for c in cp.get_variants()])
        cp = self.lgr.get_char([0x0063])
        self.assertEqual([(0x0061, ), (0x0062, )],
                         [c.cp for c in cp.get_variants()])
Exemplo n.º 30
0
    def test_merge_rules(self):
        """
        Check how merge_rules names the rules it adds to the merged LGR.

        The assertions below expect that:

        - a rule named 'rule-name' merged with 'fr' appears as
          'fr-rule-name';
        - repeating the same merge adds nothing;
        - merging the same rule with 'en' adds 'en-rule-name';
        - the 'leading-combining-mark' rule appears as
          'Common-leading-combining-mark', and repeating that merge adds
          nothing either.
        """
        merged_lgr = LGR()

        # Source LGR holding one rule made of an anonymous union class,
        # together with its XML text.
        lgr = LGR()
        rule = Rule(name='rule-name')
        anonymous_class = UnionClass()
        anonymous_class.add_child(Class(codepoints=[0x0061]))
        anonymous_class.add_child(Class(codepoints=[0x0062]))
        rule.add_child(ClassMatcher(anonymous_class))
        rule_xml = """
<rule name="rule-name">
    <union>
        <class>0x0061</class>
        <class>0x0062</class>
    </union>
</rule>
"""
        lgr.add_rule(rule)
        lgr.rules_xml.append(rule_xml)

        # First merge: the rule name is prefixed with the second argument.
        merge_rules(lgr, 'fr', merged_lgr, {})

        self.assertEqual(len(merged_lgr.rules), 1)
        self.assertEqual(len(merged_lgr.rules_xml), 1)
        self.assertEqual(merged_lgr.rules[0], 'fr-rule-name')

        # Merging is idempotent
        merge_rules(lgr, 'fr', merged_lgr, {})
        self.assertEqual(len(merged_lgr.rules), 1)
        self.assertEqual(len(merged_lgr.rules_xml), 1)
        self.assertEqual(merged_lgr.rules[0], 'fr-rule-name')

        # Not with different script
        merge_rules(lgr, 'en', merged_lgr, {})
        self.assertEqual(len(merged_lgr.rules), 2)
        self.assertEqual(len(merged_lgr.rules_xml), 2)
        self.assertEqual(merged_lgr.rules[1], 'en-rule-name')

        # Nor with MSR2
        # (presumably 'leading-combining-mark' is one of the default MSR-2
        # rules, hence the 'Common-' prefix expected below -- TODO confirm)
        lgr = LGR()
        rule = Rule(name='leading-combining-mark')
        rule.add_child(StartMatcher())
        # NOTE(review): unlike in the first part of this test, this union
        # class is built but never attached to the rule with
        # rule.add_child(ClassMatcher(...)) -- confirm whether intended.
        anonymous_class = UnionClass()
        anonymous_class.add_child(Class(unicode_property="gc:Mn"))
        anonymous_class.add_child(Class(unicode_property="gc:Mc"))
        lgr.add_rule(rule)
        lgr.rules_xml.append("""
<rule name="leading-combining-mark" comment="WLE Rule1: default WLE rule matching labels with leading combining marks ⍟">
    <start />
    <union>
        <class property="gc:Mn" />
        <class property="gc:Mc" />
    </union>
</rule>
""")

        merge_rules(lgr, 'fr', merged_lgr, {})
        self.assertEqual(len(merged_lgr.rules), 3)
        self.assertEqual(len(merged_lgr.rules_xml), 3)
        self.assertEqual(merged_lgr.rules[2], 'Common-leading-combining-mark')

        # Repeating the merge does not add the rule a second time.
        merge_rules(lgr, 'fr', merged_lgr, {})
        self.assertEqual(len(merged_lgr.rules), 3)
        self.assertEqual(len(merged_lgr.rules_xml), 3)
        self.assertEqual(merged_lgr.rules[2], 'Common-leading-combining-mark')