def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence):
    """Run a transformation over the combined state machine of many scripts.

    One 'X' object is built per script name. Their state machines are
    combined, the combination is passed through 'transform(Trafo, ...)', and
    every 'X' object checks itself against the transformed machine.
    Afterwards the complement of all character sets (relative to
    'Interval(0, 0x110000)') is handed to 'check_negative'.

    Trafo                           -- forwarded unchanged to 'transform()'.
    unicode_to_transformed_sequence -- forwarded unchanged to 'X.check()' and
                                       'check_negative()'.
    """
    script_names = [
        "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille",
        "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Common",
        "Cuneiform", "Cypriot", "Deseret", "Gothic", "Greek", "Hanunoo",
        "Hebrew", "Hiragana", "Inherited", "Kannada", "Han", "Katakana",
        "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B",
        "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya",
        "Ogham", "Old_Italic", "Old_Persian", "Phoenician", "Shavian",
        "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil",
        "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi",
    ]
    samples = [X(script_name) for script_name in script_names]

    combined_sm = get_combined_state_machine([sample.sm for sample in samples])
    # 'transform' returns a pair; only the transformed machine is used here.
    _, transformed_sm = transform(Trafo, combined_sm)
    # DEBUG: print(transformed_sm.get_graphviz_string(Option="hex"))

    # Positive check: each sample verifies itself against the result.
    for sample in samples:
        sample.check(transformed_sm, unicode_to_transformed_sequence)
    print("Translated %i groups without abortion on error (OK)" % len(samples))

    # Negative check: everything that belongs to none of the samples.
    covered = NumberSet()
    for sample in samples:
        covered.unite_with(sample.charset)

    uncovered = NumberSet(Interval(0, 0x110000))
    uncovered.subtract(covered)
    # DEBUG: print(uncovered.get_string(Option="hex"))

    check_negative(transformed_sm,
                   uncovered.get_intervals(PromiseToTreatWellF=True),
                   unicode_to_transformed_sequence)
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence):
    """Check a transformation against a fixed collection of script sets.

    Steps, in order:
      1. Create an 'X' object for each script name listed below.
      2. Combine the '.sm' members of those objects into one state machine
         and transform it via 'transform(Trafo, ...)'.
      3. Let each 'X' object '.check()' the transformed state machine.
      4. Unite all '.charset' members, subtract that union from
         'NumberSet(Interval(0, 0x110000))' and pass the remaining intervals
         to 'check_negative()'.

    Both arguments are only passed through to the helpers named above.
    """
    group_list = [
        X(script) for script in (
            "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo",
            "Braille", "Buginese", "Buhid", "Canadian_Aboriginal",
            "Cherokee", "Common", "Cuneiform", "Cypriot", "Deseret",
            "Gothic", "Greek", "Hanunoo", "Hebrew", "Hiragana", "Inherited",
            "Kannada", "Han", "Katakana", "Kharoshthi", "Khmer", "Lao",
            "Latin", "Limbu", "Linear_B", "Malayalam", "Mongolian",
            "Myanmar", "New_Tai_Lue", "Nko", "Osmanya", "Ogham",
            "Old_Italic", "Old_Persian", "Phoenician", "Shavian",
            "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le",
            "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh",
            "Ugaritic", "Yi",
        )
    ]
    group_n = len(group_list)

    source_sm = get_combined_state_machine([group.sm for group in group_list])
    # First element of the returned pair is not needed by this test.
    _, target_sm = transform(Trafo, source_sm)
    # DEBUG: print(target_sm.get_graphviz_string(Option="hex"))

    for group in group_list:
        group.check(target_sm, unicode_to_transformed_sequence)
    print("Translated %i groups without abortion on error (OK)" % group_n)

    # Collect every character that appears in at least one group ...
    all_characters = NumberSet()
    for group in group_list:
        all_characters.unite_with(group.charset)

    # ... and derive what is left over for the negative check.
    remainder = NumberSet(Interval(0, 0x110000))
    remainder.subtract(all_characters)
    # DEBUG: print(remainder.get_string(Option="hex"))

    impossible_intervals = remainder.get_intervals(PromiseToTreatWellF=True)
    check_negative(target_sm, impossible_intervals,
                   unicode_to_transformed_sequence)
def combined(appendix_sm_db, SmList):
    """Return the id of the combined state machine for 'SmList', with caching.

    appendix_sm_db -- mapping used as cache; it is read with '.get()' and
                      written by item assignment. Keys are ascending tuples
                      of distinct state machine ids.
    SmList         -- state machines to combine; each must offer 'get_id()'.

    The combination is built only if no entry exists for the key. A new
    combination is stored in 'appendix_sm_db' before its id is returned.
    """
    unique_sm_list = unique(SmList)

    # The key is sorted and free of duplicates, so it does not depend on the
    # order in which the state machines were passed.
    key = tuple(sorted(set(sm.get_id() for sm in unique_sm_list)))

    cached_sm = appendix_sm_db.get(key)
    if cached_sm is not None:
        return cached_sm.get_id()

    new_sm = get_combined_state_machine(unique_sm_list,
                                        AlllowInitStateAcceptF=True)
    appendix_sm_db[key] = new_sm
    return new_sm.get_id()
def prepare(PatternStringList, GetPreContextSM_F=False):
    """Build one normalized state machine from a list of pattern strings.

    PatternStringList -- regular expression strings; each one is parsed
                         with 'regex.do(string, {})'.
    GetPreContextSM_F -- True:  combine the patterns' '.pre_context_sm'.
                         False: combine the patterns' '.sm'.

    Post- and pre-context state machines are mounted on every pattern before
    the state machines are collected.

    RETURNS: 'normalized_clone()' of the combined state machine.
    """
    patterns = [regex.do(pattern_string, {})
                for pattern_string in PatternStringList]

    for pattern in patterns:
        pattern.mount_post_context_sm()
        pattern.mount_pre_context_sm()

    # Name of the pattern member that delivers the state machine of interest.
    member_name = "pre_context_sm" if GetPreContextSM_F else "sm"
    sm_list = [getattr(pattern, member_name) for pattern in patterns]

    combined_sm = get_combined_state_machine(sm_list, False)  # May be 'True' later.
    return combined_sm.normalized_clone()
def prepare(PatternStringList, GetPreContextSM_F=False):
    """Parse pattern strings and combine them into a single state machine.

    Each string of 'PatternStringList' is converted by 'regex.do(s, {})'.
    On each resulting pattern 'mount_post_context_sm()' and then
    'mount_pre_context_sm()' is called. Depending on 'GetPreContextSM_F',
    either the pre-context state machines (True) or the core state machines
    (False, default) are combined.

    RETURNS: Normalized clone of the combination.
    """
    def state_machine_of(pattern):
        # Selects the member requested by the caller of 'prepare()'.
        if GetPreContextSM_F:
            return pattern.pre_context_sm
        return pattern.sm

    parsed = []
    for pattern_string in PatternStringList:
        parsed.append(regex.do(pattern_string, {}))

    # Mounting happens for all patterns before any state machine is read.
    for pattern in parsed:
        pattern.mount_post_context_sm()
        pattern.mount_pre_context_sm()

    candidates = [state_machine_of(pattern) for pattern in parsed]
    return get_combined_state_machine(
        candidates, False  # May be 'True' later.
    ).normalized_clone()
"Phoenician", "Shavian", "Ugaritic", "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Yi", ]) orig = get_combined_state_machine(map(lambda x: x.sm, sets)) print "# Number of states in state machine:" print "# Unicode: %i" % len(orig.states) result = trafo.do(orig) print "# UTF8-Splitted: %i" % len(result.states) # print result.get_graphviz_string(Option="hex") for set in sets: set.check(result) union = NumberSet() for nset in map(lambda set: set.charset, sets): union.unite_with(nset) inverse_union = NumberSet(Interval(0, 0x110000))
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Test script: splits a combined state machine (a set of Greek letters plus
# whitespace) with the UTF8 state split transformation and prints the
# resulting graph in graphviz format with hexadecimal labels.
import sys
import os
sys.path.insert(0, os.environ["QUEX_PATH"])
import quex.input.regular_expression.engine as regex
from quex.engine.misc.interval_handling import NumberSet, Interval
import quex.engine.state_machine.transformation.utf8_state_split as trafo
from quex.engine.state_machine.transformation.utf8_state_split import unicode_to_utf8
import quex.input.regular_expression.engine as regex
from quex.engine.state_machine.engine_state_machine_set import get_combined_state_machine

if "--hwut-info" in sys.argv:
    print("UTF8 State Split: Larger Number Sets")
    # An info query must only report the title. Without this exit the test
    # itself would run and its graph would be appended to the info output.
    sys.exit()

sm1 = regex.do("[ΆΈΉΊΌΎ-Ϋ]+", {}).sm
sm2 = regex.do("[ \\t\\n]", {}).sm

result = trafo.do(get_combined_state_machine([sm1, sm2]))

graph_text = result.get_graphviz_string(NormalizeF=True, Option="hex")
for line in graph_text.splitlines():
    # The original header line is replaced by a fixed one
    # (presumably to keep the output stable across runs -- TODO confirm).
    if "digraph" in line:
        print("digraph state_machine {")
    else:
        print(line)
# -*- coding: utf8 -*-
#
# Test script: builds the state machine for a fixed pattern list and hands
# it to 'help.test()' together with a backward-input-position-detection
# engine object.
import os
import sys
sys.path.insert(0, os.environ["QUEX_PATH"])
import quex.input.regular_expression.engine as regex
from quex.engine.state_machine.engine_state_machine_set import get_combined_state_machine
import quex.engine.analyzer.engine_supply_factory as engine
from quex.blackboard import E_InputActions
import help
from operator import attrgetter

if "--hwut-info" in sys.argv:
    # Info query: report the title only, then leave.
    print("Track Analyzis: Backward Input Position Detection;")
    sys.exit()

# There are no 'special cases'
pattern_list = ['ax']

state_machine_list = [
    regex.do(pattern_string, {}).sm for pattern_string in pattern_list
]

# Second argument: may be 'True' later.
sm = get_combined_state_machine(state_machine_list, False).normalized_clone()

# For DEBUG purposes: specify 'DRAW' on command line (in sys.argv)
help.if_DRAW_in_sys_argv(sm)

help.test(sm, engine.Class_BACKWARD_INPUT_POSITION(0))
# An acceptance state cannot be reached by a unicode value in ImpossibleIntervals for cmd in result.states[s_idx].single_entry: assert not cmd.is_acceptance() print " (OK)" sets = map(lambda name: X(name), ["Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille", "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada", "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Ogham", "Old_Italic", "Old_Persian", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi"]) orig = get_combined_state_machine(map(lambda x: x.sm, sets)) print "Number of states in state machine:" print " Unicode: %i" % len(orig.states) result = trafo.do(orig) print " UTF8-Splitted: %i" % len(result.states) for set in sets: set.check(result) union = NumberSet() for nset in map(lambda set: set.charset, sets): union.unite_with(nset) inverse_union = NumberSet(Interval(0, 0x110000)) inverse_union.subtract(union) # print inverse_union.get_string(Option="hex")