def _get_variants(lgr, label_cplist, threshold_include_vars, idna_encoder, lgr_actions):
    """
    Compute the variants of a label together with their display information.

    :param lgr: The LGR used to compute the label dispositions.
    :param label_cplist: The label, as a sequence of code points.
    :param threshold_include_vars: Maximum number of variants for which details
                                   are generated; a negative value disables
                                   the limit.
    :param idna_encoder: Callable converting a U-label to an A-label.
    :param lgr_actions: Sequence of actions, indexed by the action index
                        reported for each variant.
    :return: A dictionary with the keys `summary`, `num_variants`,
             `threshold_include_vars` and `variants` (list of per-variant
             dictionaries, empty when the threshold is exceeded).
    """

    def format_cphex(c, invalid_codepoints):
        # Code points flagged as invalid are highlighted in the HTML output.
        if c in invalid_codepoints:
            return u'<span class="text-danger not-in-rep">U+{:04X} (&#{};)</span>'.format(c, c)
        return u"U+{:04X} (&#{};)".format(c, c)

    summary, label_dispositions = lgr.compute_label_disposition_summary(
        label_cplist, include_invalid=True)

    res = {
        'summary': summary,
        'num_variants': len(label_dispositions),
        'threshold_include_vars': threshold_include_vars,
    }

    var_results = []
    if threshold_include_vars < 0 or len(label_dispositions) <= threshold_include_vars:
        for (variant_cp, var_disp, var_invalid_parts,
             action_idx, disp_set, logs) in label_dispositions:
            invalid_codepoints = {c for c, _ in var_invalid_parts or []}

            variant_u = cp_to_ulabel(variant_cp)
            variant_display_html = mark_safe(u' '.join(
                format_cphex(c, invalid_codepoints) for c in variant_cp))
            # Plain-text rendering, used both for display and as re-usable input.
            variant_display = u' '.join(u"U+{:04X}".format(cp) for cp in variant_cp)

            # A variant that cannot be IDNA-encoded must not abort the whole
            # computation: report the error in place of the A-label, as done
            # for the input label itself.
            try:
                variant_a = idna_encoder(variant_u)
            except UnicodeError as e:
                variant_a = '!ERROR - {}!'.format(e)

            var_results.append({
                'u_label': variant_u,
                'a_label': variant_a,
                'cp_display_html': variant_display_html,
                'cp_display': variant_display,
                'cp_input': variant_display,
                'disposition': var_disp,
                'label_invalid_parts': var_invalid_parts,
                'action_idx': action_idx,
                'action': lgr_actions[action_idx] if action_idx >= 0 else None,
                'disp_set': disp_set,
                'logs': logs,
            })

    res['variants'] = var_results
    return res
def validate_label_task(lgr_json, label, email_address, storage_path):
    """
    Launch the label validation tool for one label of an LGR.

    The actual work (running the callback, storing the output and notifying
    the user) is delegated to `_lgr_tool_task`.

    :param lgr_json: The LGRInfo as a JSON object.
    :param label: The label to validate, as a list of code points.
    :param email_address: The e-mail address where the results will be sent
    :param storage_path: The place where results will be stored
    """
    info = LGRInfo.from_dict(lgr_json)
    unicode_db = get_db_by_version(info.lgr.metadata.unicode_version)

    logger.info("Starting task 'validate label' for %s, for input label '%s'",
                info.name, label)

    # Beginning of the notification e-mail; the sentence is left open here.
    body_template = "Hi,\nThe processing of label validation for label '{label}' in LGR '{lgr}' has"
    email_body = body_template.format(label=cp_to_ulabel(label), lgr=info.name)

    _lgr_tool_task(
        storage_path,
        base_filename='label_validation_{0}'.format(info.name),
        email_subject='LGR Toolset label validation result',
        email_body=email_body,
        email_address=email_address,
        cb=lgr_validate_label,
        lgr=info.lgr,
        label=label,
        udata=unicode_db,
    )
def find_variants_to_block(lgr, label_ref, label):
    """
    Report the variants of `label` that are also variants of `label_ref`.

    :param lgr: The LGR used to generate variants and dispositions.
    :param label_ref: The reference label, as a sequence of code points.
    :param label: The label to inspect, as a sequence of code points.
    """
    # Collect the code point sequences of every variant of the reference label.
    reference_variants = []
    for (ref_variant, _, _) in lgr._generate_label_variants(label_ref):
        reference_variants.append(ref_variant)

    message = ("Variant '%s' [%s] with disposition set '%s' "
               "should be blocked (current disposition :%s)")
    for entry in lgr.compute_label_disposition(label):
        (variant_cp, disp, _, _, disp_set, _) = entry
        if variant_cp not in reference_variants:
            continue
        write_output(message % (cp_to_ulabel(variant_cp),
                                format_cp(variant_cp),
                                disp_set,
                                disp))
def cross_script_variants(lgr, labels_input):
    """
    Compute cross-script variants of labels.

    This is a generator: output lines are yielded one by one, and the
    precondition checks below only run when iteration starts.

    :param lgr: The LGR to use for variant generation.
    :param labels_input: The file containing the labels
    :return: Generator of text fragments describing, for each eligible input
             label, its cross-script variants.
    :raises Exception: If the LGR has no metadata or no unicode database
                       attached.
    """
    if lgr.metadata is None:
        msg = "Cannot generate cross-scripts variants for LGR without metadata"
        logger.error(msg)
        # Same exception type as before, now carrying the reason.
        raise Exception(msg)
    if lgr.unicode_database is None:
        msg = ("Cannot generate cross-scripts variants "
               "for LGR without unicode database attached")
        logger.error(msg)
        raise Exception(msg)

    found = False
    for label, valid, error in read_labels(labels_input, lgr.unicode_database):
        if not valid:
            yield "Input label {}: {}\n".format(label, error)
            continue

        label_cp = tuple(ord(c) for c in label)
        # Only the eligibility flag is needed here.
        if not lgr.test_label_eligible(label_cp)[0]:
            continue

        label_displayed = False
        for variant, disp, script_mapping in _generate_variants(lgr, label_cp):
            if not label_displayed:
                # Only display input label if it has x-variants
                yield "Input label {} ({}) has cross-script variants:\n".format(
                    format_cp(label_cp), label)
                label_displayed = True
                found = True
            yield "\t- Cross-variant {} ({}), disposition {}:\n".format(
                format_cp(variant), cp_to_ulabel(variant), disp)
            yield '\t\t+ ' + '\t\t+ '.join(
                "{} ({}): {}\n".format(format_cp(c), cp_to_ulabel(c), s)
                for c, s in script_mapping.items())

    if not found:
        yield 'No cross-script variants for input!'
def check_label(lgr, label, invalid, test):
    """
    Validate a label against an LGR and report the disposition of its variants.

    Details are sent to the module logger; the results themselves are emitted
    through `write_output`.

    :param lgr: The LGR used for the validation.
    :param label: The label to check, as a unicode string.
    :param invalid: Forwarded as `include_invalid` when computing the variants.
    :param test: Forwarded as second argument to `write_output`.
    """
    from lgr.utils import format_cp

    def action_reference(idx):
        """Translate an action index into a (name, index) pair for display."""
        nb_actions = len(lgr.actions)
        # Indexes below nb_actions address the LGR's own actions; the first
        # default action sits at index nb_actions, hence `>=` and not `>`.
        if idx >= nb_actions:
            return "DefaultAction", idx - nb_actions
        return "Action", idx

    def describe_invalid_parts(parts):
        """Render (code point, rules) pairs; `None` rules means not in repertoire."""
        return ' '.join(
            "{:04X} ({})".format(cp, "not in repertoire" if rules is None else ','.join(rules))
            for cp, rules in parts)

    label_cp = tuple(ord(c) for c in label)
    label_display = ' '.join("{:04X}".format(cp) for cp in label_cp)
    logger.info("- Code points: %s", label_display)

    (eligible, label_parts, label_invalid_parts,
     disp, action_idx, logs) = lgr.test_label_eligible(label_cp)
    logger.info("- Eligible: %s", eligible)
    logger.info("- Disposition: %s", disp)

    action_name, actual_index = action_reference(action_idx)
    logger.info("- Action triggered: %s[%d]", action_name, actual_index)
    logger.info("- Logs: %s", logs)

    write_output("Validation: {} ({}): Result: {}".format(
        label, label_display, "valid" if eligible else "INVALID"), test)

    if not eligible:
        logger.info("- Valid code points from label: %s",
                    ' '.join("{:04X}".format(cp) for cp in label_parts))
        logger.info("- Invalid code points from label: %s",
                    describe_invalid_parts(label_invalid_parts))
        return

    write_output("Disposition: {} ({}): Result: {} due to {}[{}]".format(
        label, label_display, disp, action_name, actual_index), test)

    summary, labels = lgr.compute_label_disposition_summary(label_cp, include_invalid=invalid)
    logger.info("Summary: %s", summary)

    for (variant_cp, var_disp, variant_invalid_parts,
         action_idx, disp_set, logs) in labels:
        variant_u = cp_to_ulabel(variant_cp)
        variant_display = ' '.join("{:04X}".format(cp) for cp in variant_cp)
        logger.info("\tVariant '%s'", variant_u)
        logger.info("\t- Code points: %s", format_cp(variant_cp))
        logger.info("\t- Disposition: '%s'", var_disp)
        if variant_invalid_parts:
            logger.info("\t- Invalid code points from variant: %s",
                        describe_invalid_parts(variant_invalid_parts))

        action_name, actual_index = action_reference(action_idx)
        logger.info("\t- Action triggered: %s[%d]", action_name, actual_index)

        disp_set_display = '{%s}' % ','.join(disp_set)
        write_output("Variant: ({}): [{}] ==> {} due to {}[{}]".format(
            variant_display, disp_set_display, var_disp, action_name, actual_index), test)
        logger.info("\t- Logs: %s", logs)
def parse_label_input(s, idna_decoder=lambda x: x.encode('utf-8').decode('idna'), as_cp=True):
    """
    Parses a label from user input, applying a bit of auto-detection smarts

    :param s: input string in A-label, U-label or space-separated hex sequences.
    :param idna_decoder: IDNA decode function.
    :param as_cp: If True, returns a list of code points. Otherwise, unicode string.
    :return: list of code points if `as_cp` is True, unicode string otherwise
    :raises ValueError: if the input contains spaces but cannot be parsed as
                        a sequence of code points.

    >>> parse_label_input('0061')  # treated as U-label - probably the only confusing result
    [48, 48, 54, 49]
    >>> parse_label_input('U+0061')  # this is how to signal that you want hex
    [97]
    >>> parse_label_input('abc')
    [97, 98, 99]
    >>> parse_label_input('a b c')
    [97, 98, 99]
    >>> parse_label_input('xn--m-0ga')  # "öm"
    [246, 109]
    """
    lowered = s.lower()
    if lowered.startswith('xn--'):
        # A-label: decode it (once) to get the U-label
        u_label = idna_decoder(lowered)
        return [ord(c) for c in u_label] if as_cp else u_label

    if ' ' in s or 'U+' in s.upper():
        try:
            label_cp = parse_codepoint_input(s)
        except Exception:
            # Do not trap KeyboardInterrupt/SystemExit like a bare except would.
            if ' ' in s:
                raise ValueError("Label '{}' contains spaces "
                                 "that are not PVALID for IDNA2008".format(s))
            raise
        return label_cp if as_cp else cp_to_ulabel(label_cp)

    # treat as unicode
    return [ord(c) for c in s] if as_cp else s
def lgr_set_validate_label_task(lgr_json, script_lgr_json, label, email_address, storage_path):
    """
    Launch the label validation tool for one label of an LGR set.

    The actual work is delegated to `_lgr_tool_task`, with
    `lgr_set_validate_label` as callback.

    :param lgr_json: The LGRInfo as a JSON object.
    :param script_lgr_json: The LGRInfo for the script used to check label validity as a JSON object.
    :param label: The label to validate, as a list of code points.
    :param email_address: The e-mail address where the results will be sent
    :param storage_path: The place where results will be stored
    """
    info = LGRInfo.from_dict(lgr_json)
    unicode_db = get_db_by_version(info.lgr.metadata.unicode_version)
    script_lgr = LGRInfo.from_dict(script_lgr_json).lgr

    # Fall back on an empty set of labels when the LGR set carries none.
    labels_info = info.set_labels_info
    if labels_info is None:
        labels_info = LabelInfo(name='None', labels=[])

    logger.info("Starting task 'validate label' for %s, for input label '%s'",
                info.name, label)

    # Beginning of the notification e-mail; the sentence is left open here.
    body_template = ("Hi,\nThe processing of label validation for label '{label}'"
                     " in LGR set '{lgr}' with script '{script}' has")
    email_body = body_template.format(label=cp_to_ulabel(label),
                                      lgr=info.lgr.name,
                                      script=script_lgr.name)

    _lgr_tool_task(
        storage_path,
        base_filename='label_validation_{0}'.format(info.name),
        email_subject='LGR Toolset label validation result',
        email_body=email_body,
        email_address=email_address,
        cb=lgr_set_validate_label,
        lgr=info.lgr,
        script_lgr=script_lgr,
        set_labels=labels_info.labels,
        label=label,
        udata=unicode_db,
    )
def collision(lgr, labels_input, show_dump=False, quiet=False):
    """
    Report the collisions found in a list of labels for a given LGR.

    This is a generator yielding the report as successive text fragments.

    :param lgr: The LGR object.
    :param labels_input: The file containing the labels
    :param show_dump: Generate a full dump
    :param quiet: Do not print rules
    """
    from lgr.tools.utils import read_labels

    valid_labels = set()
    for entry in read_labels(labels_input, lgr.unicode_database):
        label, is_valid, error = entry
        if not is_valid:
            # Invalid labels are reported right away and left out.
            yield "Label {}: {}\n".format(label, error)
            continue
        valid_labels.add(label)

    # Labels without collision are only kept when a full dump is requested.
    label_indexes, rejected = _generate_indexes(lgr, valid_labels,
                                                keep=show_dump, quiet=quiet)

    if rejected:
        yield "\n# Labels not in LGR #\n\n"
        for rejected_cp in rejected:
            yield "Label {}\n".format(cp_to_ulabel(rejected_cp))

    # Collisions section
    yield "\n# Collisions #\n\n"
    for fragment in _write_complete_output(label_indexes):
        yield fragment

    if show_dump:
        yield "\n# Summary #\n\n"
        for fragment in _full_dump(label_indexes):
            yield fragment
def _get_validity(lgr, label_cplist, idna_encoder):
    """
    Test the eligibility of a label and build its display information.

    :param lgr: The LGR used to test the label.
    :param label_cplist: The label, as a sequence of code points.
    :param idna_encoder: Callable converting a U-label to an A-label.
    :return: A 2-tuple (result, lgr_actions), where `result` is a dictionary
             describing the label and `lgr_actions` is the value of
             `lgr.effective_actions_xml` used to resolve the action.
    """
    label_u = cp_to_ulabel(label_cplist)
    try:
        label_a = idna_encoder(label_u)
    except UnicodeError as e:
        # Report the encoding error in place of the A-label.
        label_a = '!ERROR - {}!'.format(e)

    (eligible, label_valid_parts, label_invalid_parts,
     disp, action_idx, logs) = lgr.test_label_eligible(label_cplist)

    # Guard against a missing list of invalid parts, like the variants code does.
    invalid_codepoints = {c for c, _ in label_invalid_parts or []}

    def format_cphex(c, want_html=True):
        # Code points flagged as invalid are highlighted in the HTML output.
        if want_html and c in invalid_codepoints:
            return u'<span class="text-danger not-in-rep">U+{:04X} (&#{};)</span>'.format(c, c)
        return u"U+{:04X} (&#{};)".format(c, c)

    label_display_html = mark_safe(u' '.join(format_cphex(c) for c in label_cplist))
    label_display_text = u' '.join(u"U+{:04X}".format(cp) for cp in label_cplist)

    # save it once (since `lgr.effective_actions` is dynamically computed)
    lgr_actions = lgr.effective_actions_xml

    return {
        'u_label': label_u,
        'a_label': label_a,
        'cp_display_html': label_display_html,
        'cp_display': label_display_text,
        'eligible': eligible,
        'disposition': disp,
        'label_invalid_parts': label_invalid_parts,
        'action_idx': action_idx,
        'action': lgr_actions[action_idx] if action_idx >= 0 else None,
        'logs': logs
    }, lgr_actions
def __unicode__(self):
    """Return the result of `cp_to_ulabel` applied to this object's `cp` attribute."""
    code_points = self.cp
    return cp_to_ulabel(code_points)
def main():
    """
    Command-line entry point: check labels read from standard input for
    collisions with a set of reference labels, using the given LGR.

    The `--libs` argument is split on '#' into the three values handed to
    `UnicodeDataVersionManager.register`.
    """
    parser = argparse.ArgumentParser(description='LGR Collision')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='be verbose')
    parser.add_argument('-g', '--generate', action='store_true',
                        help='Generate variants')
    parser.add_argument('-l', '--libs', metavar='LIBS',
                        help='ICU libraries', required=True)
    parser.add_argument('-s', '--set', metavar='SET FILE',
                        help='Filepath to the set of reference labels',
                        required=True)
    parser.add_argument('xml', metavar='XML')

    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level)

    lgr_parser = XMLParser(args.xml)

    try:
        libpath, i18n_libpath, libver = args.libs.split('#')
    except ValueError:
        # Wrong number of '#'-separated fields: exit with a usage message
        # rather than an unpacking traceback.
        parser.error("LIBS must be made of three '#'-separated values")

    manager = UnicodeDataVersionManager()
    unidb = manager.register(None, libpath, i18n_libpath, libver)
    lgr_parser.unicode_database = unidb

    lgr = lgr_parser.parse_document()
    if lgr is None:
        logger.error("Error while parsing LGR file.")
        logger.error("Please check compliance with RNG.")
        return

    # Compute index label for set or reference labels
    ref_label_indexes = {}
    with io.open(args.set, 'r', encoding='utf-8') as ref_set:
        for ref_label in ref_set:
            label_cp = tuple(ord(c) for c in ref_label.strip())
            try:
                label_index = compute_label_index(lgr, label_cp)
            except NotInLGR:
                continue
            ref_label_indexes[label_index] = label_cp

    # Deal with input
    for label in get_stdin().read().splitlines():
        write_output("Check label '%s'" % label)
        label_cp = tuple(ord(c) for c in label)
        label_disp = format_cp(label_cp)
        try:
            label_index = compute_label_index(lgr, label_cp)
        except NotInLGR:
            # An ineligible input label must not abort the remaining checks.
            write_output("Label '%s' [%s] is not in LGR" % (label, label_disp))
            continue

        if label_index not in ref_label_indexes:
            write_output("No collision for label %s [%s]" % (label, label_disp))
            continue

        ref_label_cp = ref_label_indexes[label_index]
        ref_label_disp = format_cp(ref_label_cp)
        ref_label_u = cp_to_ulabel(ref_label_cp)
        write_output("Collision for label '%s' [%s] with '%s' [%s]" %
                     (label, label_disp, ref_label_u, ref_label_disp))
        if args.generate:
            find_variants_to_block(lgr, ref_label_cp, label_cp)
def matches(self, label, rules_lookup, classes_lookup, unicode_database, anchor=None, index=0):
    """
    Check whether this rule matches a label.

    :param label: Label to test, as a sequence of code points.
    :param rules_lookup: Dictionary of defined rules in the LGR
                         to use for by-ref rules.
    :param classes_lookup: Dictionary of defined classes in the LGR
                           to use for by-ref classes.
    :param unicode_database: The Unicode Database.
    :param anchor: Optional anchor to use for look-around rules.
    :param index: If anchor is used, its index (0-based).
    :return: True if label is matched by the rule, False otherwise.
    :raises RuleError: If the pattern cannot be retrieved or compiled.
    """
    rule_logger.debug(
        "Test match on %s for label '%s' with anchor '%s' (%d)",
        self, format_cp(label), format_cp(anchor) if anchor else anchor, index)

    try:
        pattern = self.get_pattern(rules_lookup, classes_lookup, unicode_database)
    except (re.error, PICUException) as pattern_exc:
        rule_logger.error('Cannot get pattern for rule %s: %s', self, pattern_exc)
        raise RuleError(self.name, pattern_exc)

    if len(pattern) == 0:
        # An empty pattern cannot match anything
        rule_logger.debug('Empty pattern')
        return False

    has_anchor = anchor is not None
    if has_anchor:
        if '%(anchor)s' not in pattern:
            # Not a parameterized context-rule: search from the label start
            rule_logger.debug('Not a parameterized context rule')
            index = 0

        # The anchor may be a sequence of code points.
        # Old-style formatting is used, see note in matcher.AnchorMatcher
        escaped_anchor = ''.join('\\x{{{:X}}}'.format(cp) for cp in anchor)
        pattern = pattern % {'anchor': escaped_anchor}

    rule_logger.debug("Pattern for rule %s: '%s'", self, pattern)

    try:
        regex = unicode_database.compile_regex(pattern)
    except (re.error, PICUException) as compile_exc:
        rule_logger.error('Cannot compile regex: %s', compile_exc)
        raise RuleError(self.name, compile_exc)

    rule_logger.debug("Index: %d", index)

    # The regex works on the U-label form of the label
    label_u = cp_to_ulabel(label)

    # "search" is required rather than "match", as a rule may match
    # somewhere else than at the beginning of the label.
    found = regex.search(label_u, index=index)
    rule_logger.debug("Result of match: %s", found)

    if found is None:
        return False
    if not has_anchor:
        return True

    match_index = found.start()
    rule_logger.debug('Match index: %d - Index: %d', match_index, index)
    if match_index > index:
        rule_logger.debug('Match found after index, invalid')
        return False
    return True
def check_label(lgr, label, generate_variants=False, merged_lgr=None, set_labels=None):
    """
    Check a label against an LGR and, optionally, against an LGR set.

    Results are emitted through `write_output`; inconsistencies in the LGR set
    labels are reported on the module logger and stop the check.

    :param lgr: The LGR used to test the label eligibility and variants.
    :param label: The label to check, as a unicode string.
    :param generate_variants: If True, also output the label variants.
    :param merged_lgr: Optional LGR handed to `get_collisions` to look for
                       collisions with the set labels.
    :param set_labels: Optional list of the LGR set labels.
    """
    from lgr.utils import format_cp

    label_cp = tuple(ord(c) for c in label)
    write_output("\nLabel: %s [%s]" % (label, format_cp(label_cp)))

    (eligible, label_parts, label_invalid_parts, disp, _, _) = lgr.test_label_eligible(label_cp)
    write_output("\tEligible: %s" % eligible)
    write_output("\tDisposition: %s" % disp)

    if not eligible:
        write_output("- Valid code points from label: %s" %
                     u' '.join(u"{:04X}".format(cp) for cp in label_parts))
        if label_invalid_parts:
            write_output("- Invalid code points from label: {}".format(
                ' '.join("{:04X} ({})".format(
                    cp, "not in repertoire" if rules is None else ','.join(rules))
                    for cp, rules in label_invalid_parts)))
        return

    if merged_lgr and set_labels:
        write_output("Collisions:")
        if label in set_labels:
            write_output("Labels is in the LGR set labels")
        else:
            indexes = get_collisions(merged_lgr, set_labels + [label], quiet=True)
            if len(indexes) > 1:
                # there should be one collision except if set labels are not checked
                logger.error('More than one collision, please check your LGR set labels')
                return
            elif len(indexes) > 0:
                collisions = next(iter(indexes.values()))
                collision = None
                collide_with = []
                # retrieve label in collision list
                for col in collisions:
                    if col['label'] == label:
                        collision = col
                    if col['label'] in set_labels:
                        collide_with.append(col)

                if not collision:
                    # this should not happen except if set labels are not checked
                    logger.error('Cannot retrieve label in collisions, please check your LGR set labels')
                    return

                if len(collide_with) != 1:
                    logger.error('Collision with more than one label in the LGR set labels, '
                                 'please check your LGR set labels')
                    return

                write_output("Label collides with LGR set label '%s'" % collide_with[0]['label'])
            else:
                write_output('\tNone')

    if generate_variants:
        write_output("Variants:")
        summary, labels = lgr.compute_label_disposition_summary(label_cp)
        for variant in labels:
            # Only the first two fields are needed; slicing avoids depending
            # on the exact number of fields in each entry.
            variant_cp, var_disp = variant[:2]
            variant_u = cp_to_ulabel(variant_cp)
            write_output("\tVariant %s [%s]" % (variant_u, format_cp(variant_cp)))
            write_output("\t- Disposition: '%s'" % var_disp)
def _get_collisions(lgr, label_cplist, set_labels, idna_encoder, lgr_actions):
    """
    Look for a collision between a label and the labels of an LGR set.

    :param lgr: The LGR handed to `get_collisions`.
    :param label_cplist: The label, as a sequence of code points.
    :param set_labels: Iterable of the LGR set labels (surrounding
                       whitespace is stripped).
    :param idna_encoder: Callable converting a U-label to an A-label.
    :param lgr_actions: Sequence of actions, indexed by action index.
    :return: A dictionary which is empty when there is no collision, contains
             `collisions_error` when the check cannot be completed, or
             `collision` with the description of the colliding set label.
    """
    res = {}
    label_u = cp_to_ulabel(label_cplist)
    set_labels = [set_label.strip() for set_label in set_labels]

    # if label is in the LGR set labels skip
    if label_u in set_labels:
        res['collisions_error'] = _('The label is in the LGR set labels.')
        return res

    # check for collisions
    indexes = get_collisions(lgr, set_labels + [label_u], quiet=False)
    if len(indexes) > 1:
        # there should be one collision as set labels are checked, this error should not happen
        res['collisions_error'] = _('ERROR more than one collision, please check your LGR set labels')
        return res

    if len(indexes) == 0:
        return res

    collisions = next(iter(indexes.values()))
    collision = None
    collide_with = []
    # retrieve label in collision list
    for col in collisions:
        if col['label'] == label_u:
            collision = col
        if col['label'] in set_labels:
            collide_with.append(col)

    if not collision:
        # this should not happen
        res['collisions_error'] = _('ERROR cannot retrieve label in collisions, please check your LGR set labels')
        return res

    if len(collide_with) != 1:
        res['collisions_error'] = _('ERROR collision with more than one label in the LGR set labels,'
                                    'please check your LGR set labels')
        return res

    collide_with = collide_with[0]
    collided_label = collide_with['label']

    # The collision entry already holds the U-label: it must not go through
    # the IDNA encoder to be displayed as such.
    variant_u = collided_label
    # Characters are emitted as numeric character references so that the
    # string marked as safe never embeds raw label characters.
    variant_display_html = mark_safe(u' '.join(
        u"U+{:04X} (&#{};)".format(cp, cp) for cp in collide_with['cp']))
    variant_display = u' '.join(u"U+{:04X}".format(cp) for cp in collide_with['cp'])
    try:
        variant_a = idna_encoder(variant_u)
    except UnicodeError as e:
        variant_a = '!ERROR - {}!'.format(e)

    # XXX Collided variants info may be retrieved in script LGR rather than in merged LGR
    action_idx = collision['action_idx'][collided_label]
    res['collision'] = {
        'input': collided_label,
        'u_label': variant_u,
        'a_label': variant_a,
        'cp_display_html': variant_display_html,
        'cp_display': variant_display,
        'disposition': collision['disp'][collided_label],
        'action_idx': action_idx,
        'action': lgr_actions[action_idx] if action_idx >= 0 else None,
        'rules': collision['rules'][collided_label]
    }
    return res
def diff(lgr_1, lgr_2, labels_input, show_collision=True, show_dump=False, quiet=False):
    """
    Show diff for a list of labels between 2 LGR

    This is a generator: the report is yielded as successive text fragments.

    :param lgr_1: The first LGR info object.
    :param lgr_2: The second LGR info object.
    :param labels_input: The file containing the labels
    :param show_collision: Output collisions
    :param show_dump: Generate a full dump
    :param quiet: Do not print rules
    """
    from lgr.tools.utils import read_labels
    labels = set()
    # Labels are read (and validated) against the unicode database of the
    # first LGR only; invalid ones are reported and left out.
    for label, valid, error in read_labels(labels_input, lgr_1.unicode_database):
        if valid:
            labels.add(label)
        else:
            yield "Label {}: {}\n".format(label, error)

    # get diff between labels and variants for the two LGR
    # keep label without collision as we need to compare
    label1_indexes, not_in_lgr_1 = _generate_indexes(lgr_1, labels, keep=True, quiet=quiet)
    label2_indexes, not_in_lgr_2 = _generate_indexes(lgr_2, labels, keep=True, quiet=quiet)

    if not_in_lgr_1 or not_in_lgr_2:
        # One section per LGR, numbered 1 and 2
        for index, not_in_lgr in enumerate([not_in_lgr_1, not_in_lgr_2], 1):
            yield "# Labels not in LGR {} #\n\n".format(index)
            for label_cp in not_in_lgr:
                yield "Label {}\n".format(cp_to_ulabel(label_cp))
            yield '\n'

    # generate a dictionary of indexes per label
    labels_dic = {}
    yield "\n# LGR comparison #\n"
    for label in labels:
        label_cp = tuple([ord(c) for c in label])
        # A label is only compared when an index can be generated in both LGRs
        try:
            index1 = lgr_1.generate_index_label(label_cp)
        except NotInLGR:
            yield "Label {} not in LGR {}\n".format(label, lgr_1)
            continue

        try:
            index2 = lgr_2.generate_index_label(label_cp)
        except NotInLGR:
            yield "Label {} not in LGR {}\n".format(label, lgr_2)
            continue

        labels_dic[label] = (index1, index2)

    for output in _compare(labels_dic, label1_indexes, label2_indexes):
        yield output

    # output collisions
    if show_collision:
        yield "\n\n# Collisions for LGR1 #\n"
        for output in _write_complete_output(label1_indexes):
            yield output
        if show_dump:
            yield "\n# Summary for LGR1 #\n"
            for output in _full_dump(label1_indexes):
                yield output
        yield "\n\n# Collisions for LGR2 #\n"
        for output in _write_complete_output(label2_indexes):
            yield output
        if show_dump:
            yield "\n# Summary for LGR2 #\n\n"
            for output in _full_dump(label2_indexes):
                yield output
def _generate_indexes(lgr, labels, keep=False, quiet=False):
    """
    Group the given labels, and their variants, by index label.

    :param lgr: The current LGR
    :param labels: The list of labels, as a list of U-Labels.
    :param keep: Do we keep labels without collision in the output
    :param quiet: If True, do not collect rule log.
    :return: (label_indexes, not_in_lgr), with:
              - label_indexes: the dictionary containing the primary labels
                               and their variants (with various information) for each index.
              - not_in_lgr: List of labels that do not pass preliminary eligibility testing.
    """

    def make_entry(text, category, code_points, code_points_out,
                   primary_label, disp, rules, action):
        # Per-primary data (disposition, rules, action) is keyed by the
        # primary label the entry relates to.
        return {
            'label': text,
            'bidi': "%s'%s'%s" % (LRI, text, PDI),
            'cat': category,
            'cp': code_points,
            'cp_out': code_points_out,
            'disp': {primary_label: disp},
            'rules': {primary_label: rules},
            'action_idx': {primary_label: action}
        }

    label_indexes = {}
    not_in_lgr = []

    # First pass: register every primary label under its index
    for label in labels:
        label_cp = tuple([ord(c) for c in label])
        try:
            label_index = lgr.generate_index_label(label_cp)
        except NotInLGR:
            not_in_lgr.append(label_cp)
            continue

        label_cp_out = format_cp(label_cp)
        label_indexes.setdefault(label_index, []).append(
            make_entry(label, PRIMARY, label_cp, label_cp_out, label, '-', '-', '-'))

    # Second pass: iterate on a copy, as label_indexes is modified on the way
    for (label_index, primaries) in deepcopy(label_indexes).items():
        # only get variants for collided labels (if not keep)
        if not keep and len(primaries) < 2:
            del label_indexes[label_index]
            continue

        entries = label_indexes[label_index]
        for primary in primaries:
            primary_cp = primary['cp']
            primary_label = primary['label']
            dispositions = lgr.compute_label_disposition(primary_cp,
                                                         include_invalid=True,
                                                         collect_log=not quiet)
            for (variant_cp, variant_disp, variant_invalid_parts,
                 action_idx, _, log) in dispositions:
                variant = cp_to_ulabel(variant_cp)
                log = log.strip()
                if quiet:
                    log = ''
                variant_cp_out = format_cp(variant_cp)

                # Either update the entry already known for this variant,
                # or add a new one
                known = [entry for entry in entries if entry['label'] == variant]
                if not known:
                    entries.append(
                        make_entry(variant, VARIANT, variant_cp, variant_cp_out,
                                   primary_label, variant_disp, log, action_idx))
                    continue

                assert len(known) == 1
                target = known[0]
                target['disp'][primary_label] = variant_disp
                target['rules'][primary_label] = log
                target['action_idx'][primary_label] = action_idx

    return label_indexes, not_in_lgr