Пример #1
0
 def test_handle_tones(self):
     """Check handle_tones() on "tone", "lexeme" and unrelated attributes."""
     # "tone": each subscript character is expected in its own <sub>
     value = u"LaM1H"
     source = Element("name", att="tone", val=value)
     expected = Element("name", att="tone", val=value)
     expected.text = "L"
     for text, tail in (("a", "M"), ("1", "H")):
         sub = SubElement(expected, "sub")
         sub.text = text
         sub.tail = tail
     self.assertEqual(tostring(handle_tones(source)), tostring(expected))
     # "lexeme": only the trailing subscript is expected in a <sub>
     value = "aa˩abb˧bcc˥c".decode(encoding=ENCODING)
     source = Element("name", att="lexeme", val=value)
     expected = Element("name", att="lexeme", val=value)
     expected.text = "aa˩abb˧bcc˥".decode(encoding=ENCODING)
     SubElement(expected, "sub").text = "c"
     self.assertEqual(tostring(handle_tones(source)), tostring(expected))
     # Any other attribute: the element is expected to come back unchanged
     source = Element("name", att="other", val=value)
     expected = Element("name", att="other", val=value)
     self.assertEqual(tostring(handle_tones(source)), tostring(expected))
Пример #2
0
def handle_caps(element):
    """Handle small caps.

    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The "val" attribute of the element is scanned for degree signs. The run
    of characters following each one, up to the next ASCII whitespace or one
    of '.,)+/:', is wrapped in a span. The text around the spans becomes the
    element text (before the first span) or the tail of the preceding span.
    The element is left untouched when "val" holds no degree sign.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.

    Raises:
        KeyError: If the element has no "val" attribute.
    """
    import re
    # Match on text, not on encoded bytes: in a byte pattern the class
    # "[^<degree sign>]" excludes each byte of a multi-byte degree sign
    # separately, so any character sharing one of these bytes (U+00AB or
    # U+00A3 in UTF-8, for instance) placed before a marker blocked the match.
    # Whitespace is listed explicitly so that the end of a span does not
    # depend on the Unicode mode of the regex engine.
    pattern = u"([^\u00b0]*)\u00b0([^ \t\n\r\f\v.,)+/:]*)(.*)"
    # Find text to display in small caps
    result = re.match(pattern, element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, sc, after = result.groups()
        # Text preceding the marker belongs to the element or to the last span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = sc
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = re.match(pattern, after)
        if not result:
            # No marker left: the remaining text closes the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Пример #3
0
def handle_caps(element):
    """Handle small caps.

    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The "val" attribute of the element is scanned for degree signs. The run
    of characters following each one, up to the next ASCII whitespace or one
    of '.,)+/:', is wrapped in a span. The text around the spans becomes the
    element text (before the first span) or the tail of the preceding span.
    The element is left untouched when "val" holds no degree sign.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.

    Raises:
        KeyError: If the element has no "val" attribute.
    """
    import re
    # Match on text, not on encoded bytes: in a byte pattern the class
    # "[^<degree sign>]" excludes each byte of a multi-byte degree sign
    # separately, so any character sharing one of these bytes (U+00AB or
    # U+00A3 in UTF-8, for instance) placed before a marker blocked the match.
    # Whitespace is listed explicitly so that the end of a span does not
    # depend on the Unicode mode of the regex engine.
    pattern = u"([^\u00b0]*)\u00b0([^ \t\n\r\f\v.,)+/:]*)(.*)"
    # Find text to display in small caps
    result = re.match(pattern, element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, sc, after = result.groups()
        # Text preceding the marker belongs to the element or to the last span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = sc
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = re.match(pattern, after)
        if not result:
            # No marker left: the remaining text closes the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Пример #4
0
def handle_font(element):
    """Wrap every '{xxx}' group in '<span class="ipa">xxx</span>'.

    The "val" attribute of the element is scanned from left to right. The
    text outside the braces becomes the element text (before the first span)
    or the tail of the preceding span. The element is left untouched when
    "val" holds no '{xxx}' group.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.
    """
    import re
    ipa_pattern = r"([^{}]*){([^}]*)}(.*)"
    remainder = element.attrib["val"]
    last_span = None
    position = 0
    while True:
        found = re.match(ipa_pattern, remainder)
        if found is None:
            break
        leading, ipa_text, remainder = found.groups()
        # Text before the group goes to the element or to the previous span
        if last_span is None:
            element.text = leading
        else:
            last_span.tail = leading
        # Build the span and add it after the ones already inserted
        last_span = Element("span")
        last_span.set("class", "ipa")
        last_span.text = ipa_text
        element.insert(position, last_span)
        position += 1
    if last_span is not None:
        # Whatever follows the last group closes the last span
        last_span.tail = remainder
    return element
Пример #5
0
def handle_pinyin(element):
    """Wrap every '@xxx' marker in '<span class="pinyin">xxx</span>'.

    'xxx' is the run of word characters (regex '\\w') directly following the
    '@'. The text outside the markers becomes the element text (before the
    first span) or the tail of the preceding span. The element is left
    untouched when its "val" attribute holds no '@'.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.
    """
    import re
    pinyin_pattern = r"([^@]*)@(\w*)(.*)"
    remainder = element.attrib["val"]
    last_span = None
    position = 0
    while True:
        found = re.match(pinyin_pattern, remainder)
        if found is None:
            break
        leading, syllables, remainder = found.groups()
        # Text before the marker goes to the element or to the previous span
        if last_span is None:
            element.text = leading
        else:
            last_span.tail = leading
        # Build the span and add it after the ones already inserted
        last_span = Element("span")
        last_span.set("class", "pinyin")
        last_span.text = syllables
        element.insert(position, last_span)
        position += 1
    if last_span is not None:
        # Whatever follows the last marker closes the last span
        last_span.tail = remainder
    return element
Пример #6
0
def handle_pinyin(element):
    """Wrap every '@xxx' marker in '<span class="pinyin">xxx</span>'.

    'xxx' is the run of word characters (regex '\\w') directly following the
    '@'. The text outside the markers becomes the element text (before the
    first span) or the tail of the preceding span. The element is left
    untouched when its "val" attribute holds no '@'.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.
    """
    import re
    pinyin_pattern = r"([^@]*)@(\w*)(.*)"
    remainder = element.attrib["val"]
    last_span = None
    position = 0
    while True:
        found = re.match(pinyin_pattern, remainder)
        if found is None:
            break
        leading, syllables, remainder = found.groups()
        # Text before the marker goes to the element or to the previous span
        if last_span is None:
            element.text = leading
        else:
            last_span.tail = leading
        # Build the span and add it after the ones already inserted
        last_span = Element("span")
        last_span.set("class", "pinyin")
        last_span.text = syllables
        element.insert(position, last_span)
        position += 1
    if last_span is not None:
        # Whatever follows the last marker closes the last span
        last_span.tail = remainder
    return element
Пример #7
0
def handle_font(element):
    """Wrap every '{xxx}' group in '<span class="ipa">xxx</span>'.

    The "val" attribute of the element is scanned from left to right. The
    text outside the braces becomes the element text (before the first span)
    or the tail of the preceding span. The element is left untouched when
    "val" holds no '{xxx}' group.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.
    """
    import re
    ipa_pattern = r"([^{}]*){([^}]*)}(.*)"
    remainder = element.attrib["val"]
    last_span = None
    position = 0
    while True:
        found = re.match(ipa_pattern, remainder)
        if found is None:
            break
        leading, ipa_text, remainder = found.groups()
        # Text before the group goes to the element or to the previous span
        if last_span is None:
            element.text = leading
        else:
            last_span.tail = leading
        # Build the span and add it after the ones already inserted
        last_span = Element("span")
        last_span.set("class", "ipa")
        last_span.text = ipa_text
        element.insert(position, last_span)
        position += 1
    if last_span is not None:
        # Whatever follows the last group closes the last span
        last_span.tail = remainder
    return element
Пример #8
0
 def test_handle_caps(self):
     """Check that handle_caps() wraps each '°xxx' word in an "sc" span."""
     value = u"°trucs et°astuces"
     source = Element("name", val=value)
     # Build the expected tree: empty leading text, then one span per marker
     expected = Element("name", val=value)
     expected.text = ""
     for text, tail in (("trucs", " et"), ("astuces", "")):
         span = SubElement(expected, "span")
         span.attrib["class"] = "sc"
         span.text = text
         span.tail = tail
     self.assertEqual(tostring(handle_caps(source)), tostring(expected))
Пример #9
0
 def test_handle_pinyin(self):
     """Check that handle_pinyin() wraps each '@xxx' in a "pinyin" span."""
     value = "@at1 atA@at2 atB"
     source = Element("name", val=unicode(value))
     # Build the expected tree: empty leading text, then one span per marker
     expected = Element("name", val=unicode(value))
     expected.text = ""
     for text, tail in (("at1", " atA"), ("at2", " atB")):
         span = SubElement(expected, "span")
         span.attrib["class"] = "pinyin"
         span.text = text
         span.tail = tail
     self.assertEqual(tostring(handle_pinyin(source)), tostring(expected))
Пример #10
0
 def test_handle_font(self):
     """Check that handle_font() wraps each '{xxx}' group in an "ipa" span."""
     value = "blaA{bla1} blaB {bla2}blaC {bla3}"
     source = Element("name", val=unicode(value))
     # Build the expected tree: leading text, then one span per group
     expected = Element("name", val=unicode(value))
     expected.text = "blaA"
     pieces = (("bla1", " blaB "), ("bla2", "blaC "), ("bla3", ""))
     for text, tail in pieces:
         span = SubElement(expected, "span")
         span.attrib["class"] = "ipa"
         span.text = text
         span.tail = tail
     self.assertEqual(tostring(handle_font(source)), tostring(expected))
Пример #11
0
def add_link(object, element):
    """Insert a hyperlink <a href=xxx>xxx</a> in XML.

    Nothing is inserted unless 'options.cross_references' is true and
    'object.get_lexical_entry().get_id()' yields an identifier. In that case
    an <a> element is inserted as first child of the element: its "href" is
    the identifier and its text is the "targets" attribute of the element.

    Args:
        object: Object giving access to the lexical entry to link to.
        element: XML element receiving the link.

    Returns:
        The tuple (object, element); the element is modified in place.

    Raises:
        KeyError: If a link has to be created and the element has no
            "targets" attribute.
    """
    # To access options. The 'global' declaration has to precede the import
    # binding the name: Python 3 rejects the reverse order with a SyntaxError
    # and Python 2 emits a SyntaxWarning.
    global options
    from pylmflib import options
    if options.cross_references:
        # Retrieve identifier; an object without the expected accessors
        # simply gets no link
        try:
            entry_id = object.get_lexical_entry().get_id()
        except AttributeError:
            entry_id = None
        if entry_id is not None:
            # Create link
            a = Element("a")
            a.attrib["href"] = entry_id
            a.text = element.attrib["targets"]
            # Insert link in element
            element.insert(0, a)
    return (object, element)
Пример #12
0
def add_link(object, element):
    """Insert a hyperlink <a href=xxx>xxx</a> in XML.

    Nothing is inserted unless 'options.cross_references' is true and
    'object.get_lexical_entry().get_id()' yields an identifier. In that case
    an <a> element is inserted as first child of the element: its "href" is
    the identifier and its text is the "targets" attribute of the element.

    Args:
        object: Object giving access to the lexical entry to link to.
        element: XML element receiving the link.

    Returns:
        The tuple (object, element); the element is modified in place.

    Raises:
        KeyError: If a link has to be created and the element has no
            "targets" attribute.
    """
    # To access options. The 'global' declaration has to precede the import
    # binding the name: Python 3 rejects the reverse order with a SyntaxError
    # and Python 2 emits a SyntaxWarning.
    global options
    from pylmflib import options
    if options.cross_references:
        # Retrieve identifier; an object without the expected accessors
        # simply gets no link
        try:
            entry_id = object.get_lexical_entry().get_id()
        except AttributeError:
            entry_id = None
        if entry_id is not None:
            # Create link
            a = Element("a")
            a.attrib["href"] = entry_id
            a.text = element.attrib["targets"]
            # Insert link in element
            element.insert(0, a)
    return (object, element)
Пример #13
0
 def test_handle_fv(self):
     """Check handle_fv() on the 'fv:xxx' and '|fv{xxx}' notations."""
     # Each case: input value, then text and tail expected for the first span
     cases = (
         ("fv:something here and fv:there", ("something", " here and ")),
         ("|fv{something here} and fv:there", ("something here", " and ")),
     )
     for value, first_span in cases:
         source = Element("name", val=unicode(value))
         # Build the expected tree; the second span is common to both cases
         expected = Element("name", val=unicode(value))
         expected.text = ""
         for text, tail in (first_span, ("there", "")):
             span = SubElement(expected, "span")
             span.attrib["class"] = "vernacular"
             span.text = text
             span.tail = tail
         self.assertEqual(tostring(handle_fv(source)), tostring(expected))
Пример #14
0
 def test_handle_fn(self):
     """Check handle_fn() on the 'fn:xxx' and '|fn{xxx}' notations."""
     # Each case: input value, then text and tail expected for the second span
     cases = (
         ("textfn:this fn:but not this", ("but", " not this")),
         ("textfn:this |fn{and this}", ("and this", "")),
     )
     for value, second_span in cases:
         source = Element("name", val=unicode(value))
         # Build the expected tree; the first span is common to both cases
         expected = Element("name", val=unicode(value))
         expected.text = "text"
         for text, tail in (("this", " "), second_span):
             span = SubElement(expected, "span")
             span.attrib["class"] = "national"
             span.text = text
             span.tail = tail
         self.assertEqual(tostring(handle_fn(source)), tostring(expected))
Пример #15
0
def handle_fv(element):
    """Wrap 'fv:xxx' and '|fv{xxx}' in '<span class="vernacular">xxx</span>'.

    With 'fv:xxx', 'xxx' runs up to the next whitespace or one of '.,)'; with
    '|fv{xxx}', it runs up to the closing brace. The text outside the markers
    becomes the element text (before the first span) or the tail of the
    preceding span. The element is left untouched when its "val" attribute
    does not match; note that the text preceding a marker may contain
    neither ':' nor '|'.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.
    """
    import re
    fv_pattern = r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv{([^}]*)}(.*))"
    remainder = element.attrib["val"]
    last_span = None
    position = 0
    while True:
        found = re.match(fv_pattern, remainder)
        if found is None:
            break
        # Groups 1-4 describe the 'fv:xxx' form, groups 5-8 the '|fv{xxx}' one
        base = 1 if found.group(1) is not None else 5
        leading = found.group(base + 1)
        vernacular = found.group(base + 2)
        remainder = found.group(base + 3)
        # Text before the marker goes to the element or to the previous span
        if last_span is None:
            element.text = leading
        else:
            last_span.tail = leading
        # Build the span and add it after the ones already inserted
        last_span = Element("span")
        last_span.set("class", "vernacular")
        last_span.text = vernacular
        element.insert(position, last_span)
        position += 1
    if last_span is not None:
        # Whatever follows the last marker closes the last span
        last_span.tail = remainder
    return element
Пример #16
0
def handle_fv(element):
    """Wrap 'fv:xxx' and '|fv{xxx}' in '<span class="vernacular">xxx</span>'.

    With 'fv:xxx', 'xxx' runs up to the next whitespace or one of '.,)'; with
    '|fv{xxx}', it runs up to the closing brace. The text outside the markers
    becomes the element text (before the first span) or the tail of the
    preceding span. The element is left untouched when its "val" attribute
    does not match; note that the text preceding a marker may contain
    neither ':' nor '|'.

    Args:
        element: XML element whose "val" attribute is scanned.

    Returns:
        The same element, modified in place.
    """
    import re
    fv_pattern = r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv{([^}]*)}(.*))"
    remainder = element.attrib["val"]
    last_span = None
    position = 0
    while True:
        found = re.match(fv_pattern, remainder)
        if found is None:
            break
        # Groups 1-4 describe the 'fv:xxx' form, groups 5-8 the '|fv{xxx}' one
        base = 1 if found.group(1) is not None else 5
        leading = found.group(base + 1)
        vernacular = found.group(base + 2)
        remainder = found.group(base + 3)
        # Text before the marker goes to the element or to the previous span
        if last_span is None:
            element.text = leading
        else:
            last_span.tail = leading
        # Build the span and add it after the ones already inserted
        last_span = Element("span")
        last_span.set("class", "vernacular")
        last_span.text = vernacular
        element.insert(position, last_span)
        position += 1
    if last_span is not None:
        # Whatever follows the last marker closes the last span
        last_span.tail = remainder
    return element
Пример #17
0
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The processing depends on the "att" attribute of the element:
      - "tone": each character of "val" among 'abcd123' is wrapped in its own
        <sub>; the other characters are appended to the element text or to
        the tail of the preceding <sub>.
      - "lexeme": "val" is parsed as 1 to 5 syllables, each made of a run of
        characters, a group of tone marks and an optional subscript among
        'abcd123'. Only the subscript of the last syllable is wrapped in a
        <sub>; a "val" which does not parse leaves the element untouched.
      - any other value: the element is returned untouched.

    If the element text ends up equal to "val" (no subscript extracted), it
    is reset to None.

    Args:
        element: XML element carrying the "att" and "val" attributes.

    Returns:
        The same element, modified in place.

    Raises:
        KeyError: If the element has no "att" attribute, or no "val"
            attribute when "att" is "tone" or "lexeme".
    """
    import re
    kind = element.attrib["att"]
    if kind == "tone":
        # Build the lookup set once rather than once per character
        subscript_chars = frozenset("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in element.attrib["val"]:
            if c in subscript_chars:
                # Create sub
                sub = Element("sub")
                sub.text = c
                # Insert sub in element
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                previous_sub.tail = ""
                index += 1
            elif previous_sub is None:
                # No sub yet: plain characters belong to the element
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == element.attrib["val"]:
            # Reset if identical
            element.text = None
        return element
    if kind != "lexeme":
        return element
    value = element.attrib["val"]
    # Tone marks U+02E9, U+02E7 and U+02E5, written as escapes so that the
    # pattern depends neither on the coding of this file nor on a
    # Python 2 only 'str.decode' call
    tones = u"\u02e9\u02e7\u02e5"
    # Monosyllabic
    current_pattern = "([^" + tones + "#$]+)(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    result = re.match("^" + current_pattern + "$", value)
    if result:
        subscript = result.group(3)
        element.text = result.group(1) + result.group(2)
        if len(subscript) != 0:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            # Insert sub in element
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have at least 2 characters (maximum 5)
    syllable = "([^" + tones + "#$]{2,5})(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match("^" + current_pattern + "$", value)
        if result is None:
            continue
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        for i in range(syllable_nb):
            before = result.group(i * 3 + 1) + result.group(i * 3 + 2)
            subscript = result.group(i * 3 + 3)
            if i != syllable_nb - 1:
                # Only the last syllable keeps its subscript as a <sub>
                before += subscript
                subscript = ""
            # Handle previous sub or element
            if previous_sub is None:
                element.text += before
            else:
                previous_sub.tail += before
            if len(subscript) != 0:
                # Create sub
                sub = Element("sub")
                sub.text = subscript
                # Insert sub in element
                element.insert(i, sub)
                # Update loop variable
                previous_sub = sub
                previous_sub.tail = ""
        # Tone groups are separated by at least 2 other characters, so the
        # number of syllables is unambiguous: longer patterns cannot match
        break
    if element.text == value:
        # Reset if identical
        element.text = None
    return element
Пример #18
0
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The processing depends on the "att" attribute of the element:
      - "tone": each character of "val" among 'abcd123' is wrapped in its own
        <sub>; the other characters are appended to the element text or to
        the tail of the preceding <sub>.
      - "lexeme": "val" is parsed as 1 to 5 syllables, each made of a run of
        characters, a group of tone marks and an optional subscript among
        'abcd123'. Only the subscript of the last syllable is wrapped in a
        <sub>; a "val" which does not parse leaves the element untouched.
      - any other value: the element is returned untouched.

    If the element text ends up equal to "val" (no subscript extracted), it
    is reset to None.

    Args:
        element: XML element carrying the "att" and "val" attributes.

    Returns:
        The same element, modified in place.

    Raises:
        KeyError: If the element has no "att" attribute, or no "val"
            attribute when "att" is "tone" or "lexeme".
    """
    import re
    kind = element.attrib["att"]
    if kind == "tone":
        # Build the lookup set once rather than once per character
        subscript_chars = frozenset("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in element.attrib["val"]:
            if c in subscript_chars:
                # Create sub
                sub = Element("sub")
                sub.text = c
                # Insert sub in element
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                previous_sub.tail = ""
                index += 1
            elif previous_sub is None:
                # No sub yet: plain characters belong to the element
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == element.attrib["val"]:
            # Reset if identical
            element.text = None
        return element
    if kind != "lexeme":
        return element
    value = element.attrib["val"]
    # Tone marks U+02E9, U+02E7 and U+02E5, written as escapes so that the
    # pattern depends neither on the coding of this file nor on a
    # Python 2 only 'str.decode' call
    tones = u"\u02e9\u02e7\u02e5"
    # Monosyllabic
    current_pattern = "([^" + tones + "#$]+)(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    result = re.match("^" + current_pattern + "$", value)
    if result:
        subscript = result.group(3)
        element.text = result.group(1) + result.group(2)
        if len(subscript) != 0:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            # Insert sub in element
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have at least 2 characters (maximum 5)
    syllable = "([^" + tones + "#$]{2,5})(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match("^" + current_pattern + "$", value)
        if result is None:
            continue
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        for i in range(syllable_nb):
            before = result.group(i * 3 + 1) + result.group(i * 3 + 2)
            subscript = result.group(i * 3 + 3)
            if i != syllable_nb - 1:
                # Only the last syllable keeps its subscript as a <sub>
                before += subscript
                subscript = ""
            # Handle previous sub or element
            if previous_sub is None:
                element.text += before
            else:
                previous_sub.tail += before
            if len(subscript) != 0:
                # Create sub
                sub = Element("sub")
                sub.text = subscript
                # Insert sub in element
                element.insert(i, sub)
                # Update loop variable
                previous_sub = sub
                previous_sub.tail = ""
        # Tone groups are separated by at least 2 other characters, so the
        # number of syllables is unambiguous: longer patterns cannot match
        break
    if element.text == value:
        # Reset if identical
        element.text = None
    return element