Exemplo n.º 1
0
 def test_line_word():
     """
     A line holding a single non-ASCII word must serialize as the label,
     a dot, a tab and then the word itself.
     """
     # U+2086 SUBSCRIPT SIX stands in for any non-ASCII word.
     word = u"\u2086"
     sample = Line("1")
     sample.words.append(word)
     assert sample.serialize() == "1.\t" + word
Exemplo n.º 2
0
 def test_line_word():
     """
     Check that serializing a line with one unicode word gives the
     expected "label.<TAB>word" text.
     """
     unicode_word = u"\u2086"
     expected = "1.\t" + unicode_word

     labelled = Line("1")
     labelled.words.append(unicode_word)

     assert labelled.serialize() == expected
Exemplo n.º 3
0
 def p_translationlabelledline(self, p):
     """translationlabeledline : translationlabel NEWLINE
                               | translationrangelabel NEWLINE
                               | translationlabel CLOSER
                               | translationrangelabel CLOSER
     """
     # Every alternative puts the label first; the trailing NEWLINE/CLOSER
     # token is not used when building the result.
     label = p[1]
     p[0] = Line(label)
Exemplo n.º 4
0
def convert(atf_text):
    """
    Create a TEI representation of ATF input.

    The input is parsed with AtfFile and the resulting tree is walked to
    build a tei.Document:

    1. Every top-level OraccObject becomes a text part of an edition.
       Nested OraccObject sections become child text parts holding the
       lines, states and rulings found inside them.
    2. Line notes whose content starts with 'tr.' are treated as
       interlinear translations. They are grouped by language code and
       added as one translation part per language.
    3. Translation sections are gathered into a single translation
       marked 'eng', which is only added when it holds at least one line.

    Unknown node types are reported with print() and skipped.

    Args:
        atf_text: ATF source, handed to AtfFile unchanged.

    Returns:
        The populated tei.Document.
    """

    # Parse the ATF input.
    atf = AtfFile(atf_text, 'cdli', False)
    # NOTE: `verbose` is not defined in this function; it is expected to
    # exist at module level.
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))

    # Construct a TEI Document to hold the converted text.
    doc = tei.Document()
    doc.language = atf.text.language
    doc.header = tei.Header()
    doc.header.title = atf.text.description
    doc.header.cdli_code = atf.text.code

    # Traverse the parse tree, recording lines under labels.
    translations = {}
    objects = [
        item for item in atf.text.children if isinstance(item, OraccObject)
    ]
    edition = tei.Edition()
    doc.parts.append(edition)
    for item in objects:
        part = tei.TextPart(item.objecttype)
        edition.append(part)
        for section in item.children:
            if isinstance(section, OraccObject):
                # Prefer an explicit name, falling back to the object type.
                try:
                    name = section.name
                except AttributeError:
                    name = section.objecttype
                div = tei.TextPart(name)
                part.append(div)
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                print('Skipping unknown section type', type(section).__name__)
                continue
            for obj in section.children:
                if isinstance(obj, Line):
                    text = normalize_transliteration(obj.words)
                    line = tei.Line(obj.label, text)
                    div.append(line)
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in obj.notes:
                        if note.content.startswith('tr.'):
                            lang, text = note.content.split(':', maxsplit=1)
                            _, lang = lang.split('.')
                            # tr.ts is used for normalization, so mark
                            # this with the primary object's language.
                            if lang == 'ts':
                                lang = atf.text.language
                            tr_line = Line(obj.label)
                            tr_line.words = text.strip().split()
                            translations.setdefault(lang, []).append(tr_line)
                elif isinstance(obj, (State, Ruling)):
                    text = str(obj).strip()
                    # Strip the initial '$' off the ATF representation.
                    text = text[1:].strip()
                    div.append(tei.Note(text))
                else:
                    print('Skipping unknown section child type',
                          type(obj).__name__)
                    continue

    # Add accumulated interlinear translations to the document.
    for lang, tr_lines in translations.items():
        translation = tei.Translation()
        translation.language = lang
        doc.parts.append(translation)
        for tr_line in tr_lines:
            text = ' '.join(tr_line.words)
            line = tei.Line(tr_line.label, text)
            translation.append(line)

    # Traverse the tree again, recording any parallel translation sections.
    # pyoracc only supports these for English.
    translation = tei.Translation()
    translation.language = 'eng'
    translation_empty = True
    for item in objects:
        part = tei.TextPart(item.objecttype)
        translation.append(part)
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                if isinstance(surface, OraccObject):
                    div = tei.TextPart(surface.objecttype)
                    part.append(div)
                    for obj in surface.children:
                        if isinstance(obj, Line):
                            text = ' '.join(obj.words)
                            line = tei.Line(obj.label, text)
                            div.append(line)
                            translation_empty = False
                        else:
                            print('Skipping unknown section child type',
                                  type(obj).__name__)
                            continue
    # Only attach the parallel translation if it actually has content.
    if not translation_empty:
        doc.parts.append(translation)

    return doc
Exemplo n.º 5
0
def convert(atf_text):
    """
    Create a TEI representation of ATF input, returned as an XML string.

    The input is parsed with AtfFile and serialized in this order:

    1. A TEI header carrying the text's description and CDLI code.
    2. An edition div with one text part per top-level OraccObject and,
       inside it, one div per section holding the transliterated lines.
       Non-line children are emitted as XML comments.
    3. A translation div built from Translation sections, with one text
       part per surface.
    4. One translation div per language for interlinear translations,
       which older pyoracc exposes as line notes starting with 'tr.'.

    Line text is passed through escape() before being placed in element
    content so that markup characters cannot break the document.

    Args:
        atf_text: ATF source, handed to AtfFile unchanged.

    Returns:
        The TEI document as a single string.
    """
    atf = AtfFile(atf_text, 'cdli', False)
    # NOTE: `verbose` is not defined in this function; it is expected to
    # exist at module level.
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))
    header = '''<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">

<teiHeader>
<fileDesc>
  <titleStmt>
    <title>{description}</title>
  </titleStmt>
  <publicationStmt>
    <p>Converted from ATF by atf2tei.</p>
  </publicationStmt>
  <sourceDesc>
    <idno type="CDLI">{code}</idno>
  </sourceDesc>
</fileDesc>
<encodingDesc>
  <refsDecl n="CTS">
    <cRefPattern n="line"
                 matchPattern="(\\w+)\\.(\\w+)\\.(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\']/tei:div[@n=\'$2\']/tei:l[@n=\'$3\'])">
      <p>This pointer pattern extracts a specific line.</p>
    </cRefPattern>
    <cRefPattern n="surface"
                 matchPattern="(\\w+)\\.(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\']/tei:div[@n=\'$2\'])">
      <p>This pointer pattern extracts an inscribed surface.</p>
    </cRefPattern>
    <cRefPattern n="object"
                 matchPattern="(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\'])">
      <p>This pointer pattern extracts a specific artefact,
         usually a tablet.</p>
    </cRefPattern>
  </refsDecl>
</encodingDesc>
</teiHeader>
'''.format(description=escape(atf.text.description),
           code=escape(atf.text.code))

    # Collect output fragments and join once at the end instead of
    # repeatedly concatenating onto a growing string.
    out = [header]
    urn = f'urn:cts:cdli:test.{atf.text.code}'
    out.append(f'<text n="{urn}"')
    if atf.text.language:
        out.append(f' xml:lang="{atf.text.language}"')
    out.append('>\n')
    out.append('<body>\n')

    # Interlinear translations found while walking the edition, keyed by
    # language code.
    translations = {}
    objects = [item for item in atf.text.children
               if isinstance(item, OraccObject)]

    # First pass: the edition (transliterated lines).
    out.append('''  <div type="edition">\n''')
    for item in objects:
        out.append(f'  <div type="textpart" n="{item.objecttype}">\n')
        for section in item.children:
            if isinstance(section, OraccObject):
                out.append('    <div type="textpart"'
                           f' n="{section.objecttype}">\n')
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                out.append('    <div>\n'
                           f'<!-- {type(section).__name__}: {section} -->\n')
            for line in section.children:
                if isinstance(line, Line):
                    text = normalize_transliteration(line.words)
                    out.append(
                        f'      <l n="{line.label}">{escape(text)}</l>\n')
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in line.notes:
                        if note.content.startswith('tr.'):
                            lang, text = note.content.split(':', maxsplit=1)
                            _, lang = lang.split('.')
                            # tr.ts is used for normalization, so mark
                            # this with the primary object's language.
                            if lang == 'ts':
                                lang = atf.text.language
                            tr_line = Line(line.label)
                            tr_line.words = text.strip().split()
                            translations.setdefault(lang, []).append(tr_line)
                else:
                    out.append(
                        f'      <!-- {type(line).__name__}: {line} -->\n')
            out.append('    </div>\n')
        out.append('  </div>\n')
    out.append('  </div>\n')

    # Second pass: parallel translation sections.
    out.append('  <div type="translation">\n')
    for item in objects:
        out.append(f'    <div type="textpart" n="{item.objecttype}">\n')
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                # Open the surface div only for children that will also
                # close it, so the emitted tags always balance.
                if not isinstance(surface, OraccObject):
                    continue
                out.append('      <div type="textpart" '
                           f'n="{surface.objecttype}">\n')
                for line in surface.children:
                    if isinstance(line, Line):
                        text = ' '.join(line.words)
                        out.append('        '
                                   f'<l n="{line.label}">{escape(text)}</l>\n')
                    else:
                        out.append('        <!-- '
                                   f'{type(line).__name__}: {line} -->\n')
                out.append('      </div>\n')
        out.append('    </div>\n')
    out.append('  </div>\n')

    # Finally, the interlinear translations gathered during the first pass.
    for lang, translation in translations.items():
        out.append(f'  <div type="translation" xml:lang="{lang}">\n')
        for line in translation:
            text = ' '.join(line.words)
            out.append(f'    <l n="{line.label}">{escape(text)}</l>\n')
        out.append('  </div>\n')
    out.append('''
</body>
</text>
</TEI>''')
    return ''.join(out)
Exemplo n.º 6
0
 def p_multilingual_sequence(self, p):
     "multilingual_sequence : MULTILINGUAL ID "
     # Drop the first character (the percent) of the ID before using it
     # as the label of the new line.
     identifier = p[2]
     p[0] = Line(identifier[1:])
Exemplo n.º 7
0
 def p_scorelabel(self, p):
     "line_sequence : SCORELABEL ID"
     # Start a line labelled by the SCORELABEL token, with the ID as its
     # first word.
     label, first_word = p[1], p[2]
     line = Line(label)
     line.words.append(first_word)
     p[0] = line
Exemplo n.º 8
0
 def p_linelabel(self, p):
     "line_sequence : LINELABEL ID"
     # Start a line labelled by the LINELABEL token, with the ID as its
     # first word.
     label, first_word = p[1], p[2]
     line = Line(label)
     line.words.append(first_word)
     p[0] = line