def snap_property_set(stream):
    """Try to read a backslash-introduced character set expression.

    The next two characters of 'stream' select the handling:

      '\\P'  rewind, then delegate to property.do(stream)
      '\\N'  rewind, then delegate to property.do_shortcut(stream, "N", "na")
      '\\G'  rewind, then delegate to property.do_shortcut(stream, "G", "gc")
      '\\E'  read '{ encoding-name }' and look the name up through
             codec_db.get_supported_unicode_character_set()

    Any other two characters: the stream is rewound to where it was on
    entry and None is returned.

    NOTE(review): 'property' is presumably a project module that shadows
    the builtin of the same name -- confirm against the file's imports.
    """
    start = stream.tell()
    prefix = stream.read(2)

    if prefix == "\\P":
        stream.seek(start)
        return property.do(stream)

    # prefix -> (property letter, property alias)
    shortcuts = {
        "\\N": ("N", "na"),  # UCS Property: Name
        "\\G": ("G", "gc"),  # UCS Property: General_Category
    }
    if prefix in shortcuts:
        letter, alias = shortcuts[prefix]
        stream.seek(start)
        return property.do_shortcut(stream, letter, alias)

    if prefix != "\\E":
        # Not one of the handled expressions: leave the stream untouched.
        stream.seek(start)
        return None

    # '\E { encoding-name }' -- the stream is NOT rewound on this path.
    skip_whitespace(stream)
    if check(stream, "{") == False:
        error.log("Missing '{' after '\\E'.", stream)
    encoding_name = __snap_until(stream, "}").strip()

    character_set = codec_db.get_supported_unicode_character_set(encoding_name)
    if character_set is None:
        error.log("Error occured at this place.", stream)
    return character_set
def __parse_property_expression(stream, PropertyLetter, EqualConditionPossibleF=True):
    r"""Parse an expression of the form '\? { X [ = Y ] }' where ? = PropertyLetter.

    stream                  -- text stream positioned at the leading backslash.
    PropertyLetter          -- the single letter expected after the backslash.
    EqualConditionPossibleF -- if False, an expression containing '=' is
                               rejected.

    Returns a list of whitespace-stripped fields. If the '=' operator is
    present two fields are returned: first = left hand side, second = right
    hand side. Otherwise a single-element list is returned.

    Raises RegularExpressionException if
      -- the stream does not start with '\' + PropertyLetter,
      -- no '{' follows (after optional whitespace),
      -- the content between the braces is empty or only whitespace,
      -- more than one '=' appears, or
      -- an '=' appears although EqualConditionPossibleF is False.
    """
    assert len(PropertyLetter) == 1
    assert isinstance(PropertyLetter, str)
    assert isinstance(EqualConditionPossibleF, bool)

    # verify '\?'
    x = stream.read(2)
    if x != "\\" + PropertyLetter:
        # Two placeholders need two values; formatting with 'x' alone raised
        # a TypeError instead of the intended exception.
        raise RegularExpressionException(
            "Unicode property letter '\\%s' expected, received '%s'."
            % (PropertyLetter, x))

    skip_whitespace(stream)

    x = stream.read(1)
    if x != "{":
        raise RegularExpressionException(
            "Unicode property '\\%s' not followed by '{'." % PropertyLetter)

    content = __snap_until(stream, "}")

    # str.split() never returns an empty list, so emptiness has to be checked
    # on the content itself.
    if not content.strip():
        raise RegularExpressionException(
            "Unicode property expression '\\%s{}' cannot have no content."
            % PropertyLetter)

    fields = content.split("=")

    if len(fields) > 2:
        raise RegularExpressionException(
            "Unicode property expression '\\%s' can have at maximum one '='."
            % PropertyLetter)

    if not EqualConditionPossibleF and len(fields) == 2:
        raise RegularExpressionException(
            "Unicode property expression '\\%s' does not allow '=' conditions"
            % PropertyLetter)

    # A real list (not a lazy map object) so callers can index and len() it.
    return [field.strip() for field in fields]
def __snap_repetition_range(the_state_machine, stream):
    """Snaps a string that represents a repetition range and applies it.

    The following syntaxes are supported:

          '?'      one or none repetition
          '+'      one or arbitrary repetition
          '*'      arbitrary repetition (even zero)
          '{n}'    exactly 'n' repetitions
          '{m,n}'  from 'm' to 'n' repetitions
          '{n,}'   arbitrary, but at least 'n' repetitions

    the_state_machine -- object whose class is named 'StateMachine'.
    stream            -- text stream positioned right behind the expression
                         that may be repeated.

    Returns the result of repeat.do(...) for the recognized range. If no
    repetition operator follows (including a '{...}' whose content does not
    start with a digit), the stream is restored to its position on entry and
    'the_state_machine' itself is returned.

    Raises RegularExpressionException if a '{...}' range is empty or starts
    with a digit but cannot be turned into a repetition (e.g. '{2,x}' or
    '{2,3,4}').
    """
    assert the_state_machine.__class__.__name__ == "StateMachine", \
           "received object of type '%s'" % the_state_machine.__class__.__name__ + "\n" + \
           repr(the_state_machine)

    position_0 = stream.tell()
    x = stream.read(1)

    if x == "+":
        return repeat.do(the_state_machine, 1)
    if x == "*":
        return repeat.do(the_state_machine)
    if x == "?":
        return repeat.do(the_state_machine, 0, 1)

    if x != "{":
        # no repetition range, so everything remains as it is
        stream.seek(position_0)
        return the_state_machine

    repetition_range_str = __snap_until(stream, "}")
    if len(repetition_range_str) and not repetition_range_str[0].isdigit():
        # no repetition range, so everything remains as it is
        stream.seek(position_0)
        return the_state_machine

    try:
        # partition() yields plain strings (indexing a lazy map object fails
        # on Python 3) and leaves any surplus ',' in the upper bound, where
        # int() rejects it instead of it being silently dropped.
        lower_str, comma, upper_str = repetition_range_str.partition(",")
        number_1 = int(lower_str)
        if not comma:
            number_2 = number_1          # e.g. {4} = exactly four repetitions
        elif upper_str.strip() == "":
            number_2 = -1                # e.g. {2,}
        else:
            number_2 = int(upper_str)    # e.g. {2,5}

        # repeat.do() deliberately stays inside the 'try': as before, any
        # failure while building the repetition is reported to the caller as
        # a RegularExpressionException.
        return repeat.do(the_state_machine, number_1, number_2)
    except Exception:
        # 'Exception' rather than a bare 'except', so that KeyboardInterrupt
        # and SystemExit are no longer disguised as parse errors.
        raise RegularExpressionException("error while parsing repetition range expression '%s'" \
                                         % repetition_range_str)
def snap_property_set(stream):
    """Try to read a backslash-introduced character set expression.

    Dispatches on the next two characters of 'stream':

      '\\P'  rewind, then return property.do(stream)
      '\\N'  rewind, then return property.do_shortcut(stream, "N", "na")
      '\\G'  rewind, then return property.do_shortcut(stream, "G", "gc")
      '\\E'  read '{ encoding-name }' and return whatever
             codec_db.get_supported_unicode_character_set() delivers for
             the stripped name (the stream is handed over as 'FH')

    For anything else the stream is put back to its position on entry and
    None is returned.

    NOTE(review): 'property' is presumably a project module that shadows
    the builtin of the same name -- confirm against the file's imports.
    """
    entry_position = stream.tell()
    marker = stream.read(2)

    if marker == "\\P":
        stream.seek(entry_position)
        return property.do(stream)

    # marker -> (property letter, property alias)
    shortcut_db = {
        "\\N": ("N", "na"),  # UCS Property: Name
        "\\G": ("G", "gc"),  # UCS Property: General_Category
    }
    if marker in shortcut_db:
        letter, alias = shortcut_db[marker]
        stream.seek(entry_position)
        return property.do_shortcut(stream, letter, alias)

    if marker != "\\E":
        # Nothing this function handles: restore the stream.
        stream.seek(entry_position)
        return None

    # '\E { encoding-name }' -- no rewind on this path.
    skip_whitespace(stream)
    if check(stream, "{") == False:
        error_msg("Missing '{' after '\\E'.", stream)
    encoding_name = __snap_until(stream, "}").strip()
    return codec_db.get_supported_unicode_character_set(encoding_name, FH=stream)