def test_handle_tones(self):
    """Check handle_tones() for "tone", "lexeme" and any other attribute.

    For each case the element returned by handle_tones() is serialized and
    compared with a hand-built expected element.
    """
    # "tone" attribute: expected leading text "L" followed by two <sub>
    # children carrying the subscript characters and their tails
    tone_value = u"LaM1H"
    source = Element("name", att="tone", val=tone_value)
    expected = Element("name", att="tone", val=tone_value)
    expected.text = "L"
    for sub_text, sub_tail in (("a", "M"), ("1", "H")):
        child = SubElement(expected, "sub")
        child.text = sub_text
        child.tail = sub_tail
    self.assertEqual(tostring(handle_tones(source)), tostring(expected))
    # "lexeme" attribute: only the final character is expected as <sub>
    lexeme_value = "aa˩abb˧bcc˥c".decode(encoding=ENCODING)
    source = Element("name", att="lexeme", val=lexeme_value)
    expected = Element("name", att="lexeme", val=lexeme_value)
    expected.text = "aa˩abb˧bcc˥".decode(encoding=ENCODING)
    SubElement(expected, "sub").text = "c"
    self.assertEqual(tostring(handle_tones(source)), tostring(expected))
    # Any other attribute: the element is expected to come back unchanged
    source = Element("name", att="other", val=lexeme_value)
    expected = Element("name", att="other", val=lexeme_value)
    self.assertEqual(tostring(handle_tones(source)), tostring(expected))
def handle_caps(element):
    """Handle small caps.

    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The text is read from element.attrib["val"]. 'xxx' is the run of
    characters following '°' up to (not including) the first whitespace
    or one of '. , ) + / :'. The text preceding the first marker becomes
    element.text, each marker becomes a <span class="sc"> child inserted
    in order, and the text following a marker becomes the tail of its span.
    If the value contains no marker, the element is left untouched.

    Return the same element, modified in place.
    """
    import re
    # Match on text rather than on encoded bytes: in a byte pattern the
    # class [^°] excludes each UTF-8 byte of '°' separately, so unrelated
    # characters sharing one of those bytes would stop the match.
    # DOTALL lets the trailing group keep text located after a newline.
    pattern = re.compile(u"([^\u00b0]*)\u00b0([^\\s\\.,)+/:]*)(.*)",
                         re.DOTALL)
    # Find text to display in small caps
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, sc, after = result.groups()
        # Text before the marker goes to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = sc
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder is the tail of the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_font(element):
    """Replace '{xxx}' by '<span class="ipa">xxx</span>'.

    The text is read from element.attrib["val"]. The text preceding the
    first '{xxx}' becomes element.text, each '{xxx}' becomes a
    <span class="ipa"> child inserted in order, and the text following a
    marker becomes the tail of its span. If the value contains no marker
    (or an unmatched '}' precedes it), the element is left untouched.

    Return the same element, modified in place.
    """
    import re
    # Find text to display in IPA.
    # DOTALL lets the trailing group keep text located after a newline,
    # which would otherwise be silently dropped.
    pattern = re.compile(r"([^{}]*){([^}]*)}(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, ipa, after = result.groups()
        # Text before the marker goes to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "ipa"
        span.text = ipa
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder is the tail of the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_pinyin(element):
    """Replace '@xxx' by '<span class="pinyin">xxx</span>'.

    The text is read from element.attrib["val"]. 'xxx' is the run of word
    characters (regular expression '\\w') following '@'. The text preceding
    the first marker becomes element.text, each marker becomes a
    <span class="pinyin"> child inserted in order, and the text following a
    marker becomes the tail of its span. If the value contains no '@', the
    element is left untouched.

    Return the same element, modified in place.
    """
    import re
    # Find pinyin.
    # DOTALL lets the trailing group keep text located after a newline,
    # which would otherwise be silently dropped.
    pattern = re.compile(r"([^@]*)@(\w*)(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, pinyin, after = result.groups()
        # Text before the marker goes to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "pinyin"
        span.text = pinyin
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder is the tail of the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def test_handle_caps(self):
    """Check that '°xxx' markers are turned into <span class="sc"> children."""
    value = u"°trucs et°astuces"
    # Element under test
    source = Element("name", val=value)
    # Expected element: empty leading text followed by two small-caps spans
    expected = Element("name", val=value)
    expected.text = ""
    for span_text, span_tail in (("trucs", " et"), ("astuces", "")):
        span = SubElement(expected, "span")
        span.attrib["class"] = "sc"
        span.text = span_text
        span.tail = span_tail
    self.assertEqual(tostring(handle_caps(source)), tostring(expected))
def test_handle_pinyin(self):
    """Check that '@xxx' markers are turned into <span class="pinyin"> children."""
    value = "@at1 atA@at2 atB"
    # Element under test
    source = Element("name", val=unicode(value))
    # Expected element: empty leading text followed by two pinyin spans
    expected = Element("name", val=unicode(value))
    expected.text = ""
    for span_text, span_tail in (("at1", " atA"), ("at2", " atB")):
        span = SubElement(expected, "span")
        span.attrib["class"] = "pinyin"
        span.text = span_text
        span.tail = span_tail
    self.assertEqual(tostring(handle_pinyin(source)), tostring(expected))
def test_handle_font(self):
    """Check that '{xxx}' markers are turned into <span class="ipa"> children."""
    value = "blaA{bla1} blaB {bla2}blaC {bla3}"
    # Element under test
    source = Element("name", val=unicode(value))
    # Expected element: leading text followed by three IPA spans
    expected = Element("name", val=unicode(value))
    expected.text = "blaA"
    expected_spans = (
        ("bla1", " blaB "),
        ("bla2", "blaC "),
        ("bla3", ""),
    )
    for span_text, span_tail in expected_spans:
        span = SubElement(expected, "span")
        span.attrib["class"] = "ipa"
        span.text = span_text
        span.tail = span_tail
    self.assertEqual(tostring(handle_font(source)), tostring(expected))
def add_link(object, element):
    """Insert a hyperlink <a href=xxx>xxx</a> in XML.

    When 'options.cross_references' (from pylmflib) is set, the identifier
    returned by object.get_lexical_entry().get_id() is used as the 'href'
    of a new <a> element whose text is element.attrib["targets"]; the link
    is inserted as the first child of 'element'. Nothing is inserted when
    the identifier cannot be retrieved (AttributeError) or is None.

    Return the tuple (object, element).
    """
    # To access options: the global declaration has to come before the
    # import that binds the name (declaring it afterwards is an error in
    # Python 3 and only a warning in Python 2)
    global options
    from pylmflib import options
    if options.cross_references:
        # Retrieve identifier
        try:
            identifier = object.get_lexical_entry().get_id()
        except AttributeError:
            identifier = None
        if identifier is not None:
            # Create link
            a = Element("a")
            a.attrib["href"] = identifier
            a.text = element.attrib["targets"]
            # Insert link in element
            element.insert(0, a)
    return (object, element)
def test_handle_fv(self):
    """Check both 'fv:xxx' and '|fv{xxx}' forms of the vernacular marker."""
    value1 = "fv:something here and fv:there"
    value2 = "|fv{something here} and fv:there"
    # (input value, text of the first span, tail of the first span)
    cases = (
        (value1, "something", " here and "),
        (value2, "something here", " and "),
    )
    for value, first_text, first_tail in cases:
        source = Element("name", val=unicode(value))
        # Expected element: empty leading text and two vernacular spans
        expected = Element("name", val=unicode(value))
        expected.text = ""
        first = SubElement(expected, "span")
        first.attrib["class"] = "vernacular"
        first.text = first_text
        first.tail = first_tail
        second = SubElement(expected, "span")
        second.attrib["class"] = "vernacular"
        second.text = "there"
        second.tail = ""
        self.assertEqual(tostring(handle_fv(source)), tostring(expected))
def test_handle_fn(self):
    """Check both 'fn:xxx' and '|fn{xxx}' forms of the national marker."""
    value1 = "textfn:this fn:but not this"
    value2 = "textfn:this |fn{and this}"
    # (input value, text of the second span, tail of the second span)
    cases = (
        (value1, "but", " not this"),
        (value2, "and this", ""),
    )
    for value, second_text, second_tail in cases:
        source = Element("name", val=unicode(value))
        # Expected element: leading text "text" and two national spans
        expected = Element("name", val=unicode(value))
        expected.text = "text"
        first = SubElement(expected, "span")
        first.attrib["class"] = "national"
        first.text = "this"
        first.tail = " "
        second = SubElement(expected, "span")
        second.attrib["class"] = "national"
        second.text = second_text
        second.tail = second_tail
        self.assertEqual(tostring(handle_fn(source)), tostring(expected))
def handle_fv(element):
    """Replace 'fv:xxx' and '|fv{xxx}' by '<span class="vernacular">xxx</span>'.

    The text is read from element.attrib["val"]. In the 'fv:xxx' form,
    'xxx' runs up to (not including) the first whitespace, '.', ',' or ')';
    in the '|fv{xxx}' form it runs up to the closing brace. In both forms
    the text preceding the marker must not contain ':' or '|'. The text
    preceding the first marker becomes element.text, each marker becomes a
    <span class="vernacular"> child inserted in order, and the text
    following a marker becomes the tail of its span. If nothing matches,
    the element is left untouched.

    Return the same element, modified in place.
    """
    import re
    # Find text to display in vernacular font.
    # DOTALL lets the trailing groups keep text located after a newline,
    # which would otherwise be silently dropped.
    pattern = re.compile(
        r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv{([^}]*)}(.*))",
        re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        if result.group(1) is not None:
            # 'fv:xxx' form
            before, vernacular, after = result.group(2, 3, 4)
        else:
            # '|fv{xxx}' form (the only other alternative of the pattern)
            before, vernacular, after = result.group(6, 7, 8)
        # Text before the marker goes to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "vernacular"
        span.text = vernacular
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder is the tail of the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The behaviour depends on element.attrib["att"]:

    - "tone": every character of element.attrib["val"] among 'abcd123'
      becomes a <sub> child; the other characters are appended to
      element.text (before the first <sub>) or to the tail of the latest
      <sub>.
    - "lexeme": the value is matched against words of 1 to 5 syllables,
      each made of a base, a tone mark group and an optional subscript
      among 'abcd123'. Only the subscript of the last syllable becomes a
      <sub> child; everything else is put in element.text.
    - anything else: the element is returned untouched.

    When the resulting element.text is identical to the value, it is reset
    to None. Return the same element, modified in place.
    """
    import re
    attribute = element.attrib["att"]
    if attribute == "tone":
        value = element.attrib["val"]
        # Build the lookup set once instead of once per character
        subscript_chars = frozenset("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in value:
            if c in subscript_chars:
                # Create sub
                sub = Element("sub")
                sub.text = c
                sub.tail = ""
                # Insert sub in element
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                index += 1
            elif previous_sub is None:
                # No sub yet: plain characters belong to the element text
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    if attribute != "lexeme":
        return element
    value = element.attrib["val"]
    # Tone letters U+02E9, U+02E7 and U+02E5, written as escapes so that
    # they do not depend on the encoding of the source file
    tones = u"\u02e9\u02e7\u02e5"
    # Tone mark group followed by an optional subscript
    ending = u"(#?[" + tones + u"]{1,2}[$#]?)([abcd123]?)"
    # Monosyllabic
    current_pattern = u"([^" + tones + u"#$]+)" + ending
    result = re.match(u"^" + current_pattern + u"$", value)
    if result:
        element.text = result.group(1) + result.group(2)
        subscript = result.group(3)
        if subscript:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            # Insert sub in element
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have
    # at least 2 characters (maximum 5)
    syllable = u"([^" + tones + u"#$]{2,5})" + ending
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match(u"^" + current_pattern + u"$", value)
        if not result:
            continue
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        last = syllable_nb - 1
        for i in range(syllable_nb):
            before = result.group(i * 3 + 1) + result.group(i * 3 + 2)
            subscript = result.group(i * 3 + 3)
            if i != last:
                # Only the last syllable keeps its subscript as <sub>
                before += subscript
                subscript = ""
            # Handle previous sub or element
            if previous_sub is None:
                element.text += before
            else:
                previous_sub.tail += before
            if subscript:
                # Create sub
                sub = Element("sub")
                sub.text = subscript
                sub.tail = ""
                # Insert sub in element
                element.insert(i, sub)
                # Update loop variable
                previous_sub = sub
        if element.text == value:
            # Reset if identical
            element.text = None
        # Each syllable needs its own separate tone group, so a value can
        # match only one syllable count: no need to try longer patterns
        break
    return element
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The behaviour depends on element.attrib["att"]:

    - "tone": every character of element.attrib["val"] among 'abcd123'
      becomes a <sub> child; the other characters are appended to
      element.text (before the first <sub>) or to the tail of the latest
      <sub>.
    - "lexeme": the value is matched against words of 1 to 5 syllables,
      each made of a base, a tone mark group and an optional subscript
      among 'abcd123'. Only the subscript of the last syllable becomes a
      <sub> child; everything else is put in element.text.
    - anything else: the element is returned untouched.

    When the resulting element.text is identical to the value, it is reset
    to None. Return the same element, modified in place.
    """
    import re
    attribute = element.attrib["att"]
    if attribute == "tone":
        value = element.attrib["val"]
        # Build the lookup set once instead of once per character
        subscript_chars = frozenset("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in value:
            if c in subscript_chars:
                # Create sub
                sub = Element("sub")
                sub.text = c
                sub.tail = ""
                # Insert sub in element
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                index += 1
            elif previous_sub is None:
                # No sub yet: plain characters belong to the element text
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    if attribute != "lexeme":
        return element
    value = element.attrib["val"]
    # Tone letters U+02E9, U+02E7 and U+02E5, written as escapes so that
    # they do not depend on the encoding of the source file
    tones = u"\u02e9\u02e7\u02e5"
    # Tone mark group followed by an optional subscript
    ending = u"(#?[" + tones + u"]{1,2}[$#]?)([abcd123]?)"
    # Monosyllabic
    current_pattern = u"([^" + tones + u"#$]+)" + ending
    result = re.match(u"^" + current_pattern + u"$", value)
    if result:
        element.text = result.group(1) + result.group(2)
        subscript = result.group(3)
        if subscript:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            # Insert sub in element
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have
    # at least 2 characters (maximum 5)
    syllable = u"([^" + tones + u"#$]{2,5})" + ending
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match(u"^" + current_pattern + u"$", value)
        if not result:
            continue
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        last = syllable_nb - 1
        for i in range(syllable_nb):
            before = result.group(i * 3 + 1) + result.group(i * 3 + 2)
            subscript = result.group(i * 3 + 3)
            if i != last:
                # Only the last syllable keeps its subscript as <sub>
                before += subscript
                subscript = ""
            # Handle previous sub or element
            if previous_sub is None:
                element.text += before
            else:
                previous_sub.tail += before
            if subscript:
                # Create sub
                sub = Element("sub")
                sub.text = subscript
                sub.tail = ""
                # Insert sub in element
                element.insert(i, sub)
                # Update loop variable
                previous_sub = sub
        if element.text == value:
            # Reset if identical
            element.text = None
        # Each syllable needs its own separate tone group, so a value can
        # match only one syllable count: no need to try longer patterns
        break
    return element