Exemplo n.º 1
0
class ZefaniaXMLBible(Bible):
    """
    Class for reading, validating, and converting ZefaniaXMLBible XML.

    The class-level constants below hold the XML namespace prefix and the
    element tag names that the loading/validation methods compare against.
    """
    # Prefix of the root element's 'noNamespaceSchemaLocation' attribute name
    XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}"
    treeTag = 'XMLBIBLE'  # Root element of the file
    infoTag = 'INFORMATION'  # Header record element
    bookTag = 'BIBLEBOOK'  # Child of the root element: one per book
    chapterTag = 'CHAPTER'  # Child of a book element
    captionTag = 'CAPTION'  # Child of a chapter element (used in Psalms)
    verseTag = 'VERS'  # Child of a chapter element
    noteTag = 'NOTE'  # Child of a verse element
    styleTag = 'STYLE'  # Child of a verse element or of a note element
    breakTag = 'BR'  # Child of a verse element: line break

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Prepare a Zefania Bible object; nothing is parsed until load() runs.

        Parameters:
            sourceFolder: folder that contains the XML file.
            givenName: filename inside sourceFolder; also used as the
                initial value of self.name.
            encoding: stored on the object as self.encoding.
        """
        # The base class must be initialised before anything else
        Bible.__init__(self)
        self.objectNameString = "Zefania XML Bible object"
        self.objectTypeString = "Zefania"

        # Remember where the data comes from
        self.sourceFolder = sourceFolder
        self.givenName = givenName
        self.encoding = encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        # Placeholders for the parsed XML tree and its header record
        self.tree = None
        self.header = None

        # Organisational system used to look up books by name
        self.genericBOS = BibleOrganizationalSystem("GENERIC-KJV-66-ENG")

        # Early readability check: only reports the problem, never raises
        fileIsReadable = os.access(self.sourceFilepath, os.R_OK)
        if not fileIsReadable:
            print("ZefaniaXMLBible: File '{}' is unreadable".format(
                self.sourceFilepath))

        self.name = self.givenName

    # end of ZefaniaXMLBible.__init__

    def load(self):
        """
        Load the source XML file, then extract its header and its books.

        The root element is expected to be XMLBIBLE. Of its attributes,
        biblename, status, revision and version are stored on this object
        (as name, status, revision and version) when non-empty; the schema
        location, lgid and type attributes are accepted but not stored; any
        other attribute is reported with a warning.

        An INFORMATION record found as the first or the last child is
        detached from the tree and passed to the header extractor. Every
        remaining BIBLEBOOK child is passed to the book extractor; any other
        child is reported as an error.

        Structural problems are reported through logging rather than raised.
        doPostLoadProcessing() is called afterwards, also when the root
        element is not the expected one.
        """
        if Globals.verbosityLevel > 2:
            print(_("Loading {}...").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if Globals.debugFlag:
            assert (len(self.tree)
                    )  # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == ZefaniaXMLBible.treeTag:
            location = "Zefania XML file"
            Globals.checkXMLNoText(self.tree, location, '4f6h')
            Globals.checkXMLNoTail(self.tree, location, '1wk8')

            # Attributes that are recognised but whose values are not kept
            # (in italian.xml, lgid is set to "german")
            ignoredAttributes = (
                ZefaniaXMLBible.XMLNameSpace + 'noNamespaceSchemaLocation',
                "lgid",
                "type",
            )
            storedAttributes = {}
            for attrib, value in self.tree.items():
                if attrib in ("biblename", "status", "revision", "version"):
                    storedAttributes[attrib] = value
                elif attrib not in ignoredAttributes:
                    logging.warning(
                        "Unprocessed '{}' attribute ({}) in main element".
                        format(attrib, value))
            if storedAttributes.get("biblename"):
                self.name = storedAttributes["biblename"]
            if storedAttributes.get("status"):
                self.status = storedAttributes["status"]
            if storedAttributes.get("revision"):
                self.revision = storedAttributes["revision"]
            if storedAttributes.get("version"):
                self.version = storedAttributes["version"]

            if len(self.tree) == 0:
                # Indexing an empty element would raise IndexError
                logging.error("No subelements found in {}".format(location))
            else:
                # The information record may be at the START or at the END
                for candidate in (self.tree[0], self.tree[-1]):
                    if candidate.tag == ZefaniaXMLBible.infoTag:
                        self.header = candidate
                        self.tree.remove(candidate)
                        self.__validateAndExtractHeader()
                        break

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == ZefaniaXMLBible.bookTag:
                    sublocation = "book in " + location
                    Globals.checkXMLNoText(element, sublocation, 'g3g5')
                    Globals.checkXMLNoTail(element, sublocation, 'd3f6')
                    self.__validateAndExtractBook(element)
                else:
                    logging.error("Expected to find '{}' but got '{}'".format(
                        ZefaniaXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load '{}' but got '{}'".format(
                ZefaniaXMLBible.treeTag, self.tree.tag))
        self.doPostLoadProcessing()

    # end of ZefaniaXMLBible.load

    def __validateAndExtractHeader(self):
        """
        Validate the header record in self.header and copy its fields onto
        this object.

        The record looks like this (abridged):
            <INFORMATION>
            <title>King James Version</title>
            <creator></creator>
            <subject>The Holy Bible</subject>
            <description>In 1604, King James I of England authorized ...</description>
            <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
            <contributors />
            <date>2009-01-23</date>
            <type>Bible</type>
            <format>Zefania XML Bible Markup Language</format>
            <identifier>kjv</identifier>
            <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
            <language>ENG</language>
            <coverage>provide the Bible to the nations of the world</coverage>
            <rights>We believe that this Bible is found in the Public Domain.</rights>
        </INFORMATION>

        Each known subelement is checked for tail text, attributes and
        subelements. Fields in the "always stored" table are assigned even
        if empty (their text is asserted only when Globals.debugFlag is
        set); fields in the "stored if present" table are assigned only when
        they have text; 'format' is only checked, never stored. Unknown tags
        are reported with logging.error.
        """
        if Globals.debugFlag: assert (self.header)
        location = 'Header'
        Globals.checkXMLNoAttributes(self.header, location, 'j4j6')
        Globals.checkXMLNoText(self.header, location, 'sk4l')
        Globals.checkXMLNoTail(self.header, location, 'a2d4')

        # TODO: We probably need to rationalise some of the self.xxx stores
        # XML tag -> attribute name, for fields that are always assigned
        alwaysStored = {
            'title': 'title',
            'description': 'description',
            'date': 'date',
            'identifier': 'identifier',
            'source': 'source',
            'language': 'language',
        }
        # XML tag -> attribute name, for fields assigned only if they have text
        storedIfPresent = {
            'creator': 'creator',
            'subject': 'subject',
            'publisher': 'publisher',
            'contributors': 'contributors',
            'type': 'documentType',
            'coverage': 'coverage',
            'rights': 'rights',
        }

        for element in self.header:
            tag = element.tag
            isKnown = (tag == 'format' or tag in alwaysStored
                       or tag in storedIfPresent)
            if not isKnown:
                logging.error("Found unexpected '{}' tag in {}".format(
                    tag, location))
                continue

            # Structural checks shared by every known field
            sublocation = "{} in {}".format(tag, location)
            Globals.checkXMLNoTail(element, sublocation, 'al1d')
            Globals.checkXMLNoAttributes(element, sublocation, 'j3jd')
            Globals.checkXMLNoSubelements(element, sublocation, '5g78')

            fieldText = element.text
            if tag in storedIfPresent:
                if fieldText:
                    setattr(self, storedIfPresent[tag], fieldText)
                continue

            # Remaining fields are expected to carry text
            if Globals.debugFlag: assert (fieldText)
            if tag == 'format':
                if Globals.debugFlag:
                    assert (fieldText == 'Zefania XML Bible Markup Language')
            else:
                setattr(self, alwaysStored[tag], fieldText)

    # end of ZefaniaXMLBible.__validateAndExtractHeader

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book is identified from its bnumber attribute or, when there is
        no bnumber, from its bname attribute. If no book code can be
        deduced, a warning is logged and the book is skipped. Otherwise a
        BibleBook is created, every CHAPTER child is passed to the chapter
        extractor, and the book is stored with self.saveBook().

        Parameters:
            book: the BIBLEBOOK XML element.
        """

        if Globals.verbosityLevel > 3: print(_("Validating XML book..."))

        # Process the book attributes first
        bookName = bookShortName = bookNumber = None
        for attrib, value in book.items():
            if attrib == "bnumber":
                bookNumber = value
            elif attrib == "bname":
                bookName = value
            elif attrib == "bsname":
                bookShortName = value
            else:
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in book element".format(
                        attrib, value))

        BBB = None
        if bookNumber:
            try:
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(
                    bookNumber)
            except KeyError:
                logging.warning( "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it" \
                                                                        .format( bookNumber, bookName, bookShortName ) )
                return
        elif bookName:
            BBB = self.genericBOS.getBBB(bookName)

        if not BBB:
            # Don't drop a whole book without telling anyone
            logging.warning(
                "Unable to deduce which book has name={}, shortName={} -- ignoring it"
                .format(bookName, bookShortName))
            return

        if Globals.verbosityLevel > 2:
            print(_("Validating {} {}...").format(BBB, bookName))
        thisBook = BibleBook(self.name, BBB)
        thisBook.objectNameString = "Zefania XML Bible Book object"
        thisBook.objectTypeString = "Zefania"
        for element in book:
            if element.tag == ZefaniaXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                Globals.checkXMLNoText(element, sublocation, 'j3jd')
                Globals.checkXMLNoTail(element, sublocation, 'al1d')
                self.__validateAndExtractChapter(BBB, thisBook, element)
            else:
                logging.error("Expected to find '{}' but got '{}'".format(
                    ZefaniaXMLBible.chapterTag, element.tag))
        if Globals.verbosityLevel > 2:
            print("  Saving {} into results...".format(BBB))
        self.saveBook(thisBook)

    # end of ZefaniaXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML chapter
            record, saving the chapter number and processing its verses.

        A 'c' line holding the cnumber attribute is appended to thisBook (an
        error is logged if the attribute is missing). Each VERS child is
        passed to the verse extractor; each CAPTION child (used in Psalms)
        with text is saved as verse zero. Any other child is reported as an
        error.

        Parameters:
            BBB: book code; used in messages and passed on to the verse
                extractor.
            thisBook: book object that receives lines via appendLine().
            chapter: the CHAPTER XML element.
        """

        if Globals.verbosityLevel > 3: print(_("Validating XML chapter..."))

        # Process the chapter attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "cnumber":
                chapterNumber = value
            else:
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            thisBook.appendLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'cnumber' attribute in chapter element for {}".format(
                    BBB))

        for element in chapter:
            if element.tag == ZefaniaXMLBible.verseTag:
                self.__validateAndExtractVerse(BBB, chapterNumber, thisBook,
                                               element)
            elif element.tag == ZefaniaXMLBible.captionTag:  # Used in Psalms
                location = "caption in {} {}".format(BBB, chapterNumber)
                Globals.checkXMLNoTail(element, location, 'k5k8')
                Globals.checkXMLNoSubelements(element, location, 'd3f5')
                # Handle caption attributes
                vRef = None
                for attrib, value in element.items():
                    if attrib == "vref":
                        vRef = value
                        if Globals.debugFlag: assert (vRef == '1')
                    else:
                        logging.warning(
                            "Unprocessed '{}' attribute ({}) in caption element"
                            .format(attrib, value))
                if Globals.debugFlag: assert (vRef)
                vText = element.text
                if vText:  # This is the main text of the caption
                    thisBook.appendLine('v', '0' + ' ' +
                                        vText)  # We save it as verse zero
                else:
                    logging.warning("{} {}:{} has no text".format(
                        BBB, chapterNumber, vRef))
            else:
                logging.error("Expected to find '{}' but got '{}'".format(
                    ZefaniaXMLBible.verseTag, element.tag))

    # end of ZefaniaXMLBible.__validateAndExtractChapter

    def __validateAndExtractVerse(self, BBB, chapterNumber, thisBook, verse):
        """
        Check/validate one verse element and append its text to thisBook.

        The verse's own text and the text of its STYLE subelements (wrapped
        in character markers) are joined and appended as a 'v' line that
        starts with the verse number. A BR subelement flushes the text
        gathered so far as a 'v' line and appends an 'm' line holding the
        text that follows the break. For a NOTE subelement only its tail
        text (appended as a 'v~' line) and its STYLE children are used; the
        note's own text is not stored.

        Parameters:
            BBB: book code, used in messages.
            chapterNumber: chapter number string, used in messages.
            thisBook: book object that receives lines via appendLine().
            verse: the VERS XML element.
        """

        # CSS declarations that map directly onto a character marker
        cssMarkers = {
            "font-style:italic": '\\it',
            "font-style:italic;font-weight:bold": '\\bdit',
            "color:#FF0000": '\\em',
            "font-size: x-small; color:#8B8378": '\\add',
        }

        def renderStyle(styleElement, styleLocation, errorCode, description):
            """
            Return the marked-up text of one STYLE element followed by its
            (stripped) tail text. description names the element in warnings.
            """
            Globals.checkXMLNoSubelements(styleElement, styleLocation,
                                          errorCode)
            css = idStyle = None
            for attrib, value in styleElement.items():
                if attrib == "css":
                    css = value
                elif attrib == "id":
                    idStyle = value
                else:
                    logging.warning(
                        "Unprocessed '{}' attribute ({}) in {}".format(
                            attrib, value, description))
            if Globals.debugFlag: assert (css or idStyle)

            SFM = cssMarkers.get(css)
            if SFM is None and css is None and idStyle == 'cl:divineName':
                SFM = '\\nd'

            # Element.text is None for an empty element
            sText = (styleElement.text or '').strip()
            if Globals.debugFlag: assert (sText)
            if SFM:
                rendered = SFM + ' ' + sText + SFM + '*'
            else:  # Use sc for unknown styles
                logging.warning(
                    "Unrecognised style (css={!r}, id={!r}) in {}".format(
                        css, idStyle, styleLocation))
                rendered = '\\sc ' + '[' + (
                    css or idStyle or '') + ']' + sText + '\\sc* '
            sTail = styleElement.tail
            if sTail: rendered += sTail.strip()
            return rendered

        if Globals.verbosityLevel > 3: print(_("Validating XML verse..."))

        location = "verse in {} {}".format(BBB, chapterNumber)
        Globals.checkXMLNoTail(verse, location, 'l5ks')

        # Handle verse attributes
        verseNumber = None
        for attrib, value in verse.items():
            if attrib == "vnumber":
                verseNumber = value
            else:
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in verse element".format(
                        attrib, value))
        if Globals.debugFlag: assert (verseNumber)
        location = "{}:{}".format(
            location, verseNumber)  # Get a better location description

        # verse.text is None if the verse starts immediately with a style or
        #   note, so fall back to '' to keep the += operations below valid
        vText = (verse.text or '').strip()

        # Handle verse subelements (notes, styled portions and line breaks)
        for subelement in verse:
            if subelement.tag == ZefaniaXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib, value in subelement.items():
                    if attrib == "type":
                        noteType = value
                    else:
                        logging.warning(
                            "Unprocessed '{}' attribute ({}) in note subelement"
                            .format(attrib, value))
                if noteType not in (
                        'n-studynote',
                        'x-studynote',
                ):
                    logging.warning("Unexpected {} note type in {}".format(
                        noteType, BBB))
                if Globals.debugFlag: assert (noteType)
                # NOTE(review): the note's own text (subelement.text) is not
                #   stored anywhere -- data is lost here (for now)
                nTail = subelement.tail
                if nTail:
                    if '\n' in nTail:
                        print(
                            "ZefaniaXMLBible.__validateAndExtractVerse: nTail {} {}:{} '{}'"
                            .format(BBB, chapterNumber, verseNumber, nTail))
                        nTail = nTail.replace('\n', ' ')
                    # NOTE(review): this 'v~' line is appended before the
                    #   'v' line of the verse itself -- confirm the intended order
                    thisBook.appendLine('v~', nTail)
                for subsubelement in subelement:
                    if subsubelement.tag == ZefaniaXMLBible.styleTag:
                        vText += renderStyle(subsubelement,
                                             "style in " + sublocation, 'fyt4',
                                             "style subsubelement")
                    else:
                        logging.error(
                            "Expected to find {} but got '{}' in {}".format(
                                ZefaniaXMLBible.styleTag, subsubelement.tag,
                                sublocation))

            elif subelement.tag == ZefaniaXMLBible.styleTag:
                vText += renderStyle(subelement, "style in " + location,
                                     'f5gh', "style subelement")

            elif subelement.tag == ZefaniaXMLBible.breakTag:
                sublocation = "line break in " + location
                Globals.checkXMLNoText(subelement, sublocation, 'c1d4')
                Globals.checkXMLNoSubelements(subelement, sublocation, 'g4g8')
                art = None
                for attrib, value in subelement.items():
                    if attrib == "art":
                        art = value
                    else:
                        logging.warning(
                            "Unprocessed '{}' attribute ({}) in line break subelement"
                            .format(attrib, value))
                if Globals.debugFlag: assert (art == 'x-nl')
                # Flush what we have so far, then start a new paragraph
                # NOTE(review): text gathered after the break is written
                #   below with the verse number again -- confirm this is wanted
                if vText:
                    thisBook.appendLine('v', verseNumber + ' ' + vText)
                    vText = ''
                thisBook.appendLine(
                    'm',
                    subelement.tail.strip() if subelement.tail else '')
            else:
                logging.error(
                    "Expected to find NOTE, STYLE or BR but got '{}' in {}".
                    format(subelement.tag, location))

        if vText:  # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                print(
                    "ZefaniaXMLBible.__validateAndExtractVerse: vText {} {}:{} '{}'"
                    .format(BBB, chapterNumber, verseNumber, vText))
                vText = vText.replace('\n', ' ')
            thisBook.appendLine('v', verseNumber + ' ' + vText)
Exemplo n.º 2
0
class USXXMLBibleBook( BibleBook ):
    """
    Class to load, validate, and manipulate a single Bible book in USX XML.

    The XML is converted into USFM-style (marker, text) lines which are saved
        with the appendLine() and appendToLastLine() methods called on self.
    """
    def __init__( self, name, BBB ):
        """
        Create the USX Bible book object.

        Both parameters are simply passed on to the base class constructor.
        """
        BibleBook.__init__( self, name, BBB ) # Initialise the base class
        self.objectNameString = "USX XML Bible Book object"
        self.objectTypeString = "USX"
    # end of USXXMLBibleBook.__init__


    def load( self, filename, folder=None, encoding='utf-8' ):
        """
        Load a single source USX XML file and extract the information.

        filename is the name of the USX file (joined onto folder if a folder is given).
        encoding is accepted as a parameter but is not used by this function
            (the XML parser is just given the file path).

        Problems in the data are reported with logging and with addPriorityError().
        Any load error messages are also saved in self.errorDictionary['Load Errors'].
        """

        def loadParagraph( paragraphXML, paragraphlocation ):
            """
            Load a paragraph from the USX XML.

            Appends a new line for the paragraph itself, and then processes the
                verse, char, note, and unmatched subelements of the paragraph.
            Uses (and updates) c,v information from the containing function.
            """
            nonlocal c, v

            # Process the attributes first
            paragraphStyle = None
            for attrib,value in paragraphXML.items():
                if attrib=='style':
                    paragraphStyle = value # This is basically the USFM marker name
                else: # Must use paragraphlocation here -- location isn't assigned until the loop below
                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, paragraphlocation ) )

            # Now process the paragraph text (or write a paragraph marker anyway)
            self.appendLine( paragraphStyle, paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' )

            # Now process the paragraph subelements
            for element in paragraphXML:
                location = element.tag + ' ' + paragraphlocation
                if element.tag == 'verse': # milestone (not a container)
                    Globals.checkXMLNoText( element, location )
                    Globals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    verseStyle = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            v = value
                        elif attrib=='style':
                            verseStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if verseStyle != 'v':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                    self.appendLine( verseStyle, v + ' ' )
                    # Now process the tail (if there's one) which is the verse text
                    if element.tail:
                        vText = element.tail.strip()
                        if vText:
                            self.appendToLastLine( vText )
                elif element.tag == 'char':
                    Globals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    charStyle = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            charStyle = value # This is basically the USFM character marker name
                            assert( not Globals.USFMMarkers.isNewlineMarker( charStyle ) )
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    # A character field must be added to the previous field
                    tail = '' if element.tail is None else element.tail.strip()
                    # An empty char element has text of None -- don't let that become the word 'None'
                    additionalText = "\\{} {}\\{}*{}".format( charStyle, element.text or '', charStyle, tail )
                    if Globals.debugFlag: # Only display this when debugging
                        print( "USX.loadParagraph:", c, v, paragraphStyle, charStyle, repr(additionalText) )
                    self.appendToLastLine( additionalText )
                elif element.tag == 'note':
                    Globals.checkXMLNoText( element, location )
                    # Process the attributes first
                    noteStyle = noteCaller = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            noteStyle = value # This is basically the USFM marker name
                            assert( noteStyle in ('x','f',) )
                        elif attrib=='caller':
                            noteCaller = value # Usually hyphen or a symbol to be used for the note
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    assert( noteStyle and noteCaller ) # both compulsory
                    noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                    # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                    subelement = None # So that the check after the loop still works if the note has no subelements
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        if subelement.tag == 'char': # milestone (not a container)
                            Globals.checkXMLNoTail( subelement, sublocation )
                            Globals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            charStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style':
                                    charStyle = value
                                elif attrib=='closed':
                                    assert( value=='false' )
                                    charClosed = False
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            noteLine += "\\{} {}".format( charStyle, subelement.text or '' )
                            if charClosed: noteLine += "\\{}*".format( charStyle )
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.bookReferenceCode, c, v, sublocation ) )
                            self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                    # Include any text which follows the final subelement of the note
                    if subelement is not None and subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                    noteLine += "\\{}*".format( noteStyle )
                    if element.tail:
                        noteText = element.tail.strip()
                        noteLine += noteText
                    self.appendToLastLine( noteLine )
                elif element.tag == 'unmatched': # Used to denote errors in the source text
                    Globals.checkXMLNoText( element, location )
                    Globals.checkXMLNoTail( element, location )
                    Globals.checkXMLNoAttributes( element, location )
                    Globals.checkXMLNoSubelements( element, location )
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.bookReferenceCode, c, v, location ) )
                    self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
                    # Display the last few lines which were loaded to help locate the problem
                    for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                    # NOTE(review): halt isn't defined in the visible code -- presumably a deliberate way to stop when debugging
                    if Globals.debugFlag: halt
        # end of loadParagraph

        if Globals.verbosityLevel > 2: print( "  " + _("Loading {}...").format( filename ) )
        self.isOneChapterBook = self.bookReferenceCode in Globals.BibleBooksCodes.getSingleChapterBooksList()
        self.sourceFilename = filename
        self.sourceFolder = folder
        self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
        self.tree = ElementTree().parse( self.sourceFilepath )
        assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        c = v = '0'
        loadErrors = []

        # Find the main container
        if self.tree.tag=='usx' or self.tree.tag=='usfm': # Not sure why both are allowable
            location = "USX ({}) file".format( self.tree.tag )
            Globals.checkXMLNoText( self.tree, location )
            Globals.checkXMLNoTail( self.tree, location )

            # Process the attributes first
            self.schemaLocation = ''
            version = None
            for attrib,value in self.tree.items():
                if attrib=='version': version = value
                else: # Only complain about the attributes which we didn't handle
                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if version not in ( None, '2.0' ):
                logging.warning( _("Not sure if we can handle v{} USX files").format( version ) )

            # Now process the data
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'book': # milestone (not a container)
                    Globals.checkXMLNoSubelements( element, sublocation )
                    Globals.checkXMLNoTail( element, sublocation )
                    # Process the attributes
                    idField = bookStyle = None
                    for attrib,value in element.items():
                        if attrib=='id' or attrib=='code':
                            idField = value # Should be USFM bookcode (not like bookReferenceCode which is BibleOrgSys BBB bookcode)
                        elif attrib=='style':
                            bookStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if bookStyle != 'id':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) )
                    if idField is None: # Can't concatenate onto None below
                        logging.warning( _("Missing id or code attribute in {}").format( sublocation ) )
                        idField = ''
                    idLine = idField
                    if element.text and element.text.strip(): idLine += ' ' + element.text
                    self.appendLine( 'id', idLine )
                elif element.tag == 'chapter': # milestone (not a container)
                    v = '0'
                    Globals.checkXMLNoText( element, sublocation )
                    Globals.checkXMLNoTail( element, sublocation )
                    Globals.checkXMLNoSubelements( element, sublocation )
                    # Process the attributes
                    chapterStyle = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            c = value
                        elif attrib=='style':
                            chapterStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if chapterStyle != 'c':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) )
                    self.appendLine( 'c', c )
                elif element.tag == 'para':
                    Globals.checkXMLNoTail( element, sublocation )
                    USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style
                    if Globals.USFMMarkers.isNewlineMarker( USFMMarker ):
                        loadParagraph( element, sublocation )
                    elif Globals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker -- append it to the previous line
                        text = element.text
                        if text is None: text = ''
                        if Globals.debugFlag:
                            print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, USFMMarker, text ) )
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, USFMMarker, text ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.bookReferenceCode, c, v, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.bookReferenceCode, c, v, USFMMarker ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.bookReferenceCode, c, v ) )
                        self.addPriorityError( 97, c, v, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) )
                        # Actually append it to the previous line (if there's one) so that the text isn't lost
                        if self._rawLines: self.appendToLastLine( '\\' + USFMMarker + ' ' + text )
                    else: # the line begins with an unknown USFM Marker
                        text = element.text or '' # So that we can safely concatenate it below
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, USFMMarker, text ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.bookReferenceCode, c, v, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.bookReferenceCode, c, v, USFMMarker ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.bookReferenceCode, c, v ) )
                        self.addPriorityError( 100, c, v, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) )
                        # NOTE(review): sortedNLMarkers isn't defined in the visible code -- presumably a module-level list of newline markers
                        for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                            if USFMMarker.startswith( tryMarker ): # Let's try changing it
                                # Lines aren't buffered by this loader, so save the corrected line straight away
                                self.appendLine( tryMarker, USFMMarker[len(tryMarker):] + ' ' + text )
                                loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to '{}' at beginning of line: {}").format( self.bookReferenceCode, c, v, USFMMarker, tryMarker, text ) )
                                logging.warning( _("Changed '\\{}' unknown USFM Marker to '{}' after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.bookReferenceCode, c, v, text ) )
                                break
                        # Otherwise, don't bother processing this line -- it'll just cause more problems later on
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.bookReferenceCode, c, v, sublocation ) )
                    self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
        else: # Don't just silently load nothing
            logging.error( _("Expected to load 'usx' or 'usfm' but got '{}'").format( self.tree.tag ) )

        if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
Exemplo n.º 3
0
class HaggaiXMLBible( Bible ):
    """
    Class for reading, validating, and converting HaggaiXMLBible XML.

    The class attributes below hold the XML namespace prefix and the element tag names.
    """
    # Prefix of the 'noNamespaceSchemaLocation' attribute name on the main element
    XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}"
    treeTag = 'XMLBIBLE' # The main (Bible) container
    infoTag = 'INFORMATION' # NOTE(review): the loader compares against the literal 'INFORMATION' rather than using this
    bookTag = 'BIBLEBOOK' # The submain (book) containers
    chapterTag = 'CHAPTER' # Chapter elements inside a book
    captionTag = 'CAPTION' # Caption elements inside a book (the text is saved as an 'mt' line)
    # The remaining tags aren't referenced in the part of the file seen here
    #   -- presumably they're used by the chapter/verse processing
    paragraphTag = 'PARAGRAPH'
    verseTag = 'VERSE'
    noteTag = 'NOTE'
    styleTag = 'STYLE'
    breakTag = 'BR'


    def __init__( self, sourceFolder, givenName, encoding='utf-8' ):
        """
        Set up an (as yet unloaded) Haggai Bible object.

        This only records where the XML file is expected to be
            -- the file itself isn't parsed until load() is called.
        """
        Bible.__init__( self ) # The base class must be initialised before anything else
        self.objectNameString = 'Haggai XML Bible object'
        self.objectTypeString = 'Haggai'

        # Remember the parameters which we were given
        self.sourceFolder = sourceFolder
        self.givenName = givenName
        self.encoding = encoding
        self.sourceFilepath = os.path.join( self.sourceFolder, self.givenName )

        # Placeholders for the XML data (filled in by the loader)
        self.tree = None
        self.header = None

        # The data table which is used for finding book codes from book names
        self.genericBOS = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' )

        # Give an early warning (on the console) if we won't be able to read the file
        fileIsReadable = os.access( self.sourceFilepath, os.R_OK )
        if not fileIsReadable:
            print( "HaggaiXMLBible: File {!r} is unreadable".format( self.sourceFilepath ) )

        self.name = self.givenName
    # end of HaggaiXMLBible.__init__


    def load( self ):
        """
        Load a single source XML file and load book elements.

        Parses self.sourceFilepath, copies the recognised attributes of the main
            element onto this object, extracts the INFORMATION header (which may be
            either the first or the last subelement), and then processes each book.

        If the file can't be parsed, a critical error is logged and the function
            returns without loading anything (and without any post-load processing).
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}…").format( self.sourceFilepath ) )
        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( exp("Loader parse error in xml file {}: {} {}").format( self.givenName, sys.exc_info()[0], err ) )
            return # self.tree wasn't updated so there's nothing that we can process
        if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == HaggaiXMLBible.treeTag:
            location = "Haggai XML file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

            name = status = revision = version = None
            for attrib,value in self.tree.items():
                if attrib == "biblename":
                    name = value
                elif attrib == "status":
                    status = value
                elif attrib == "revision":
                    revision = value
                elif attrib == 'version':
                    version = value
                elif attrib in ( HaggaiXMLBible.XMLNameSpace + 'noNamespaceSchemaLocation', "lgid", "type" ):
                    pass # Recognised but not currently used (In italian.xml lgid is set to "german")
                else: logging.warning( "Unprocessed {!r} attribute ({}) in main element".format( attrib, value ) )
            if name: self.name = name
            if status: self.status = status
            if revision: self.revision = revision
            if version: self.version = version

            # Find the information record (if any) and remove it from the tree
            #   so that only book elements should remain
            if len( self.tree ): # Indexing into an empty tree would fail
                header = None
                if self.tree[0].tag == 'INFORMATION':
                    header = self.tree[0]
                else: # Handle information records at the END of the file
                    ix = len(self.tree) - 1
                    if self.tree[ix].tag == 'INFORMATION':
                        header = self.tree[ix]
                if header is not None:
                    self.header = header
                    self.tree.remove( self.header )
                    self.__validateAndExtractHeader()

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == HaggaiXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' )
                    self.__validateAndExtractBook( element )
                else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.bookTag, element.tag ) )
        else: logging.error( "Expected to load {!r} but got {!r}".format( HaggaiXMLBible.treeTag, self.tree.tag ) )
        self.doPostLoadProcessing()
    # end of HaggaiXMLBible.load


    def __validateAndExtractHeader( self ):
        """
        Extracts information out of the header record, such as:
            <INFORMATION>
            <title>King James Version</title>
            <creator></creator>
            <subject>The Holy Bible</subject>
            <description>In 1604, King James I of England authorized that a new translation of the Bible into English be started. …</description>
            <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
            <contributors />
            <date>2009-01-23</date>
            <type>Bible</type>
            <format>Haggai XML Bible Markup Language</format>
            <identifier>kjv</identifier>
            <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
            <language>ENG</language>
            <coverage>provide the Bible to the nations of the world</coverage>
            <rights>We believe that this Bible is found in the Public Domain.</rights>
        </INFORMATION>

        The text of each recognised field is saved as an attribute of this object
            (the type field is saved as self.documentType).
        The format field is only checked (when debugging) -- it's not saved.
        Multiple contributor fields are collected into one flat list in self.contributor.
        Unrecognised fields are logged as errors.
        """
        if BibleOrgSysGlobals.debugFlag: assert self.header
        location = 'Header'
        BibleOrgSysGlobals.checkXMLNoAttributes( self.header, location, 'j4j6' )
        BibleOrgSysGlobals.checkXMLNoText( self.header, location, 'sk4l' )
        BibleOrgSysGlobals.checkXMLNoTail( self.header, location, 'a2d4' )

        # Maps each simple field tag to ( the attribute name to save the text under, whether text is compulsory )
        #   Compulsory text is asserted when debugging and is always saved (even if it's missing)
        #   Optional text is only saved if there's some
        # TODO: We probably need to rationalise some of the self.xxx stores
        simpleFields = {
            'title': ('title',True),
            'creator': ('creator',False),
            'subject': ('subject',False),
            'description': ('description',True),
            'publisher': ('publisher',False),
            'contributors': ('contributors',False),
            'date': ('date',True),
            'type': ('documentType',False),
            'identifier': ('identifier',True),
            'source': ('source',True),
            'language': ('language',True),
            'coverage': ('coverage',False),
            'rights': ('rights',False),
            }

        for element in self.header:
            tag = element.tag
            isContributor = tag == 'contributor'
            if not ( isContributor or tag == 'format' or tag in simpleFields ):
                logging.error( "Found unexpected {!r} tag in {}".format( tag, location ) )
                continue

            # All of the known fields get the same three checks (only the error codes differ)
            sublocation = "{} in {}".format( tag, location )
            if isContributor: tailCode, attributesCode, subelementsCode = 'alj1d', 'j3jjd', '5gk78'
            else: tailCode, attributesCode, subelementsCode = 'al1d', 'j3jd', '5g78'
            BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, tailCode )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, attributesCode )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, subelementsCode )
            text = element.text

            if isContributor:
                if text:
                    try: previous = self.contributor
                    except AttributeError: self.contributor = text # Must be the first (and possibly only) one
                    else: # Put multiples into a list
                        if isinstance( previous, list ): previous.append( text ) # Keep the list flat for three or more
                        else: self.contributor = [ previous, text ]
            elif tag == 'format':
                if BibleOrgSysGlobals.debugFlag: assert text
                if BibleOrgSysGlobals.debugFlag: assert text == 'Haggai XML Bible Markup Language'
            else:
                attributeName, textIsCompulsory = simpleFields[tag]
                if textIsCompulsory:
                    if BibleOrgSysGlobals.debugFlag: assert text
                    setattr( self, attributeName, text )
                elif text: setattr( self, attributeName, text )
    # end of HaggaiXMLBible.__validateAndExtractHeader


    def __validateAndExtractBook( self, book ):
        """
        Validate the given XML book record and extract its data,
            passing each chapter subelement on to the chapter processing.

        The book code is deduced from the book number (if there's one)
            or otherwise from the book name.
        Nothing is saved if a book code can't be determined.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML book…") )

        # Collect the book attributes first
        bookNumber = bookName = bookShortName = None
        for attributeName,attributeValue in book.items():
            if attributeName == "bnumber": bookNumber = attributeValue
            elif attributeName == "bname": bookName = attributeValue
            elif attributeName == "bsname": bookShortName = attributeValue
            else:
                logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attributeName, attributeValue ) )

        # Now work out which book this is
        BBB = None
        if bookNumber:
            try:
                BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
            except KeyError:
                warningTemplate = "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it"
                logging.warning( warningTemplate.format( bookNumber, bookName, bookShortName ) )
        elif bookName:
            BBB = self.genericBOS.getBBBFromText( bookName )
        if not BBB: return # We can't save a book if we don't know which one it is

        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}…").format( BBB, bookName ) )
        thisBook = BibleBook( self, BBB )
        thisBook.objectNameString = 'Haggai XML Bible Book object'
        thisBook.objectTypeString = 'Haggai'

        # Process the captions and chapters of the book
        for subelement in book:
            tag = subelement.tag
            if tag == HaggaiXMLBible.chapterTag:
                sublocation = "chapter in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'al1d' )
                self.__validateAndExtractChapter( BBB, thisBook, subelement )
            elif tag == HaggaiXMLBible.captionTag:
                sublocation = "caption in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'jhl6' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'jk21' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'kjh6' )
                thisBook.addLine( 'mt', subelement.text ) # The caption is saved as a main title line
            else:
                logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.chapterTag, tag ) )

        if BibleOrgSysGlobals.verbosityLevel > 2: print( "  Saving {} into results…".format( BBB ) )
        self.stashBook( thisBook )
    # end of HaggaiXMLBible.__validateAndExtractBook


    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML chapter record
            finding and saving the chapter number (from the 'cnumber' attribute) and
            finding and saving paragraph subelements.

        BBB is the book code (only used here for messages and passed on),
            thisBook is the book object that lines are added to, and
            chapter is the XML chapter element.

        Problems are logged (not raised), except for the debug-mode asserts.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter…") )

        # Process the chapter attributes first
        chapterNumber = None
        for attrib,value in chapter.items():
            if attrib=="cnumber":
                chapterNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber:
            thisBook.addLine( 'c', chapterNumber )
        else: logging.error( "Missing 'cnumber' attribute in chapter element for {}".format( BBB ) )

        for element in chapter:
            if element.tag == HaggaiXMLBible.paragraphTag:
                self.__validateAndExtractParagraph( BBB, chapterNumber, thisBook, element )
            # The two branches below have 'disabled' appended to the tag so they can't match the normal tags
            #   NOTE(review): presumably parked on purpose -- verses and captions directly inside a chapter
            #       currently fall through to the error at the bottom
            elif element.tag == HaggaiXMLBible.verseTag+'disabled':
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms
                location = "caption in {} {}".format( BBB, chapterNumber )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'k5k8' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'd3f5' )
                # Handle caption attributes
                vRef = None
                for attrib,value in element.items():
                    if attrib=="vref":
                        vRef = value
                        if BibleOrgSysGlobals.debugFlag: assert vRef == '1'
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert vRef
                vText = element.text
                if vText: # This is the main text of the caption
                    thisBook.addLine( 'v', '0' + ' ' + vText ) # We save it as verse zero
                else:
                    logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) )
            # Paragraphs are the only subelements that are actually accepted above, so say so in the message
            else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.paragraphTag, element.tag ) )
    # end of HaggaiXMLBible.__validateAndExtractChapter


    def __validateAndExtractParagraph( self, BBB, chapterNumber, thisBook, paragraph ):
        """
        Validate one XML paragraph record and save what it contains into thisBook.

        A 'p' line (with no text) is added for the paragraph itself,
            then each verse subelement is handed to __validateAndExtractVerse.
        Any other subelement is reported with logging.error.

        BBB and chapterNumber are used for the location messages (and passed on).
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML paragraph…") )

        # The paragraph element itself is expected to be a bare container
        paragraphLocation = "paragraph in {} {}".format( BBB, chapterNumber )
        for checkFunction in ( BibleOrgSysGlobals.checkXMLNoAttributes,
                               BibleOrgSysGlobals.checkXMLNoText,
                               BibleOrgSysGlobals.checkXMLNoTail ):
            checkFunction( paragraph, paragraphLocation, 'brgw3' )
        thisBook.addLine( 'p', '' )

        # The caption tag has 'disabled' appended, so normal caption elements won't match it
        disabledCaptionTag = HaggaiXMLBible.captionTag + 'disabled'

        # Now work through the subelements (normally verses)
        for child in paragraph:
            if child.tag == HaggaiXMLBible.verseTag:
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, child )
                continue
            if child.tag != disabledCaptionTag:
                logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, child.tag ) )
                continue

            # Must be a caption (used in Psalms)
            captionLocation = "caption in {} {}".format( BBB, chapterNumber )
            BibleOrgSysGlobals.checkXMLNoTail( child, captionLocation, 'k5k8' )
            BibleOrgSysGlobals.checkXMLNoSubelements( child, captionLocation, 'd3f5' )
            vRef = None
            for attrib,value in child.items():
                if attrib != "vref":
                    logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) )
                    continue
                vRef = value
                if BibleOrgSysGlobals.debugFlag: assert vRef == '1'
            if BibleOrgSysGlobals.debugFlag: assert vRef

            captionText = child.text
            if captionText:
                thisBook.addLine( 'v', '0 ' + captionText ) # The caption is saved as verse zero
            else:
                logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) )
    # end of HaggaiXMLBible.__validateAndExtractParagraph


    def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
        """
        Check/validate and extract verse data from the given XML verse record
            finding and saving the verse text along with any
            notes, styled portions, and line breaks inside it.

        BBB and chapterNumber are used for the location messages,
            thisBook is the book object that lines are added to, and
            verse is the XML verse element (with a 'vnumber' attribute).

        Notes are converted to \\f ... \\f* fields (with any styled text inside the field),
            styled portions are converted to character markers, and
            each line break saves the text so far and then adds an 'm' line containing its tail.
        The verse number is only written once:
            any text that comes after it has already been saved is appended to the last line.

        Problems are logged (not raised), except for the debug-mode asserts.
        A verse without a 'vnumber' attribute is reported and skipped.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") )

        location = "verse in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

        # Handle verse attributes
        verseNumber = None
        for attrib,value in verse.items():
            if attrib=="vnumber":
                verseNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert verseNumber
        if verseNumber is None: # We can't build a v line without a number
            logging.error( "Missing 'vnumber' attribute in {} -- ignoring it".format( location ) )
            return
        location = "{}:{}".format( location, verseNumber ) # Get a better location description

        # The same table is used for styles in the verse text and for styles inside notes
        #   NOTE(review): 'super' giving \bdit is kept from the original code -- confirm that it's intended
        fsToSFM = { 'italic':'\\it', 'super':'\\bdit', 'emphasis':'\\em' }

        verseLineSaved = False # Gets set once the v line (with the verse number) has been added

        def saveVerseText( text ):
            """
            Save the given text as the v line,
                or append it to the last line if the v line has already been written
                (which happens when more text follows a line break).
            """
            nonlocal verseLineSaved
            if verseLineSaved: thisBook.appendToLastLine( text )
            else:
                thisBook.addLine( 'v', verseNumber + ' ' + text )
                verseLineSaved = True
        # end of saveVerseText

        def getStyledText( styleElement, styleLocation, errorCode, levelName ):
            """
            Return the USFM text for the given style element,
                followed by its (stripped) tail if it has one.
            """
            BibleOrgSysGlobals.checkXMLNoSubelements( styleElement, styleLocation, errorCode )
            fs = None
            for attrib,value in styleElement.items():
                if attrib=='fs': fs = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in style {}".format( attrib, value, levelName ) )
            if BibleOrgSysGlobals.debugFlag: assert fs
            SFM = fsToSFM.get( fs )
            if SFM is None:
                logging.error( "Unknown {!r} font style in {}".format( fs, styleLocation ) )
                # 'halt' is the bare name that the original code used here (unconditionally) to stop processing
                #   NOTE(review): presumably a deliberate stop-for-debugging -- now only done in debug mode
                #       (like the debugFlag-guarded halt in USXXMLBibleBook.load)
                if BibleOrgSysGlobals.debugFlag: halt
            sText = styleElement.text.strip() if styleElement.text else '' # text is None for an empty element
            if BibleOrgSysGlobals.debugFlag: assert sText
            if SFM: styledText = SFM+' ' + sText + SFM+'*'
            else: styledText = '\\sc ' + '[{}]'.format( fs ) + sText + '\\sc* ' # Use sc for unknown styles
            if styleElement.tail: styledText += styleElement.tail.strip()
            return styledText
        # end of getStyledText

        vText = verse.text.strip() if verse.text else ''
        # NB: vText is empty here if the verse starts immediately with a style or note

        # Handle verse subelements (notes, styled portions, and line breaks)
        for subelement in verse:
            if subelement.tag == HaggaiXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib,value in subelement.items():
                    if attrib=="type": noteType = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in note subelement".format( attrib, value ) )
                if noteType and noteType not in ('variant',):
                    logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) )
                # Collect everything inside the note first, so that styled text ends up inside the \f ... \f* field
                noteText = subelement.text or '' # text is None if the note starts with a style
                for subsubelement in subelement:
                    if subsubelement.tag == HaggaiXMLBible.styleTag:
                        noteText += getStyledText( subsubelement, "style in " + sublocation, 'fyt4', 'subsubelement' )
                    else: logging.error( "Expected to find {} but got {!r} in {}".format( HaggaiXMLBible.styleTag, subsubelement.tag, sublocation ) )
                vText += "\\f + \\fk {} \\ft {}\\f*".format( noteType, noteText ) if noteType else "\\f + \\ft {}\\f*".format( noteText )
                # The tail is the verse text that follows the note
                nTail = subelement.tail
                if nTail:
                    if '\n' in nTail:
                        print( "HaggaiXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) )
                        nTail = nTail.replace( '\n', ' ' )
                    vText += nTail

            elif subelement.tag == HaggaiXMLBible.styleTag:
                vText += getStyledText( subelement, "style in " + location, 'f5gh', 'subelement' )

            elif subelement.tag == HaggaiXMLBible.breakTag:
                sublocation = "line break in " + location
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' )
                art = None
                for attrib,value in subelement.items():
                    if attrib=="art":
                        art = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in line break subelement".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl'
                # Save what we have so far, then the text after the break goes into a new m line
                #   NOTE(review): if there's no text before the first break, no v line gets written before the m line
                #       (same as the original code) -- confirm whether that's wanted
                if vText:
                    saveVerseText( vText )
                    vText = ''
                thisBook.addLine( 'm', subelement.tail.strip() if subelement.tail else '' )
            else: logging.error( "Expected to find NOTE or STYLE but got {!r} in {}".format( subelement.tag, location ) )

        if vText: # This is the main text of the verse (or whatever is left after the last line break)
            if '\n' in vText:
                print( "HaggaiXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                vText = vText.replace( '\n', ' ' )
            saveVerseText( vText )
Exemplo n.º 4
0
class USXXMLBibleBook( BibleBook ):
    """
    Class to load, validate, and manipulate a single Bible book in USX XML.
    """
    def __init__( self, name, BBB ):
        """
        Set up an (empty) USX Bible book object.

        The name and BBB parameters are simply handed on to the BibleBook initialiser.

        Also fills the module-level sortedNLMarkers list the first time that it's needed
            (the 'Combined' newline markers, longest ones first).
        """
        global sortedNLMarkers

        # The base class has to be set up before we add anything of our own
        BibleBook.__init__( self, name, BBB )
        self.objectTypeString = 'USX'
        self.objectNameString = 'USX XML Bible Book object'

        # Only the first book created has to do this work -- the list is then shared
        if sortedNLMarkers is None:
            newlineMarkers = BibleOrgSysGlobals.USFMMarkers.getNewlineMarkersList( 'Combined' )
            sortedNLMarkers = sorted( newlineMarkers, key=len, reverse=True )
    # end of USXXMLBibleBook.__init__


    def load( self, filename, folder=None, encoding='utf-8' ):
        """
        Load a single source USX XML file and extract the information.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print( exp("load( {}, {}, {} )").format( filename, folder, encoding ) )

        def loadParagraph( paragraphXML, paragraphlocation ):
            """
            Load a paragraph from the USX XML.
            In this context, paragraph means heading and intro lines,
                as well as paragraphs of verses.

            Uses (and updates) C,V information from the containing function.
            """
            nonlocal C, V

            # Process the attributes first
            paragraphStyle = None
            for attrib,value in paragraphXML.items():
                if attrib=='style':
                    paragraphStyle = value # This is basically the USFM marker name
                else:
                    logging.warning( _("CH46 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )

            # Now process the paragraph text (or write a paragraph marker anyway)
            paragraphText = paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else ''
            if version is None: paragraphText = paragraphText.rstrip() # Don't need to strip extra spaces in v2
            self.addLine( paragraphStyle, paragraphText )

            # Now process the paragraph subelements
            for element in paragraphXML:
                location = element.tag + ' ' + paragraphlocation
                #print( "USXXMLBibleBook.load", C, V, element.tag, location )
                if element.tag == 'verse': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    verseStyle = altNumber = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            V = value
                        elif attrib=='style':
                            verseStyle = value
                        elif attrib=='altnumber':
                            altNumber = value
                        else:
                            logging.error( _("KR60 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if verseStyle != 'v':
                        logging.error( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                    #if altNumber: print( repr(verseStyle), repr(altNumber) ); halt
                    altStuff = ' \\va {}\\va*'.format( altNumber ) if altNumber else ''
                    self.addLine( verseStyle, V + altStuff + ' ' )
                    # Now process the tail (if there's one) which is the verse text
                    if element.tail:
                        vText = element.tail
                        if vText[0]=='\n': vText = vText.lstrip() # Paratext puts cross references on a new line
                        if vText:
                            #print( repr(vText) )
                            self.appendToLastLine( vText )
                elif element.tag == 'char':
                    # Process the attributes first
                    charStyle = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            charStyle = value # This is basically the USFM character marker name
                            #print( "  charStyle", charStyle )
                            assert not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle )
                        else:
                            logging.error( _("QU52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    charLine = "\\{} {} ".format( charStyle, element.text )
                    # Now process the subelements -- chars are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( '{} {}:{} {}'.format( self.BBB, C, V, element.tag ) )
                        if subelement.tag == 'char': # milestone (not a container)
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            subCharStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style': subCharStyle = value
                                elif attrib=='closed':
                                    assert value=='false'
                                    charClosed = False
                                else:
                                    logging.error( _("KS41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            charLine += "\\{} {}".format( subCharStyle, subelement.text )
                            if charClosed: charLine += "\\{}*".format( subCharStyle )
                            #if subelement.tail is not None: print( "  tail1", repr(subelement.tail) )
                            charLine += '' if subelement.tail is None else subelement.tail
                        else:
                            logging.error( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                            self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                    # A character field must be added to the previous field
                    #if element.tail is not None: print( " tail2", repr(element.tail) )
                    charTail = ''
                    if element.tail:
                        charTail = element.tail
                        if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts footnote parts on new lines
                    charLine += "\\{}*{}".format( charStyle, charTail )
                    #if debuggingThisModule: print( "USX.loadParagraph:", C, V, paragraphStyle, charStyle, repr(charLine) )
                    self.appendToLastLine( charLine )
                elif element.tag == 'note':
                    #print( "NOTE", BibleOrgSysGlobals.elementStr( element ) )
                    # Process the attributes first
                    noteStyle = noteCaller = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            noteStyle = value # This is basically the USFM marker name
                            assert noteStyle in ('x','f',)
                        elif attrib=='caller': noteCaller = value # Usually hyphen or a symbol to be used for the note
                        else:
                            logging.error( _("CY38 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if noteCaller=='' and self.BBB=='NUM' and C=='10' and V=='36': noteCaller = '+' # Hack
                    assert noteStyle and noteCaller # both compulsory
                    noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                    if element.text:
                        noteText = element.text.strip()
                        noteLine += noteText
                    # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( C, V, subelement.tag )
                        if subelement.tag == 'char': # milestone (not a container)
                            # Process the attributes first
                            charStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style':
                                    charStyle = value
                                elif attrib=='closed':
                                    assert value=='false'
                                    charClosed = False
                                else:
                                    logging.warning( _("GJ67 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            noteLine += "\\{} {}".format( charStyle, subelement.text )
                            # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                            for sub2element in subelement:
                                sub2location = sub2element.tag + ' ' + sublocation
                                #print( C, V, sub2element.tag )
                                if sub2element.tag == 'char': # milestone (not a container)
                                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location )
                                    # Process the attributes first
                                    char2Style, char2Closed = None, True
                                    for attrib,value in sub2element.items():
                                        if attrib=='style':
                                            char2Style = value
                                        elif attrib=='closed':
                                            assert value=='false'
                                            char2Closed = False
                                        else:
                                            logging.warning( _("VH36 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                                    assert char2Closed
                                    noteLine += "\\{} {}\\{}*{}".format( char2Style, sub2element.text, char2Style, sub2element.tail if sub2element.tail else '' )
                            if charClosed: noteLine += "\\{}*".format( charStyle )
                            if subelement.tail:
                                charTail = subelement.tail
                                if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts cross reference parts on a new line
                                noteLine += charTail
                        elif subelement.tag == 'unmatched': # Used to denote errors in the source text
                            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation )
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            unmmatchedMarker = None
                            for attrib,value in subelement.items():
                                if attrib=='marker':
                                    unmmatchedMarker = value
                                else:
                                    logging.warning( _("NV21 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            self.addPriorityError( 2, C, V, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) )
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                            self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                        if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                    #noteLine += "\\{}*".format( charStyle )
                    noteLine += "\\{}*".format( noteStyle )
                    if element.tail:
                        #if '\n' in element.tail: halt
                        noteTail = element.tail
                        if noteTail[0]=='\n': noteTail = noteTail.lstrip() # Paratext puts multiple cross-references on new lines
                        noteLine += noteTail
                    #print( "NoteLine", repr(noteLine) )
                    self.appendToLastLine( noteLine )
                elif element.tag == 'link': # Used to include extra resources
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    linkStyle = linkDisplay = linkTarget = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            linkStyle = value
                            assert linkStyle in ('jmp',)
                        elif attrib=='display':
                            linkDisplay = value # e.g., "click here"
                        elif attrib=='target':
                            linkTarget = value # e.g., some reference
                        else:
                            logging.warning( _("KW54 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    self.addPriorityError( 3, C, V, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) )
                elif element.tag == 'unmatched': # Used to denote errors in the source text
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    self.addPriorityError( 2, C, V, _("Unmatched element in {}").format( location) )
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, location ) )
                    self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) )
                    for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                    if BibleOrgSysGlobals.debugFlag: halt
        # end of loadParagraph

        C = V = '0'
        loadErrors = []
        lastMarker = None

        if BibleOrgSysGlobals.verbosityLevel > 3: print( "  " + _("Loading {} from {}…").format( filename, folder ) )
        elif BibleOrgSysGlobals.verbosityLevel > 2: print( "  " + _("Loading {}…").format( filename ) )
        self.isOneChapterBook = self.BBB in BibleOrgSysGlobals.BibleBooksCodes.getSingleChapterBooksList()
        self.sourceFilename = filename
        self.sourceFolder = folder
        self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) )
            loadErrors.append( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) )
            self.addPriorityError( 100, C, V, _("Loader parse error in xml file {}: {}").format( filename, err ) )
        if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all

        # Find the main container
        if 'tree' in dir(self) \
        and ( self.tree.tag=='usx' or self.tree.tag=='usfm' ): # Not sure why both are allowable
            location = "USX ({}) file".format( self.tree.tag )
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location )

            # Process the attributes first
            self.schemaLocation = ''
            version = None
            for attrib,value in self.tree.items():
                if attrib=='version': version = value
                else: logging.warning( _("DG84 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if version not in ( None, '2.0' ):
                logging.warning( _("Not sure if we can handle v{} USX files").format( version ) )

            # Now process the data
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'book': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    # Process the attributes
                    idField = bookStyle = None
                    for attrib,value in element.items():
                        if attrib=='id' or attrib=='code':
                            idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode)
                            #if idField != BBB:
                            #    logging.warning( _("Unexpected book code ({}) in {}").format( idField, sublocation ) )
                        elif attrib=='style':
                            bookStyle = value
                        else:
                            logging.warning( _("MD12 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if bookStyle != 'id':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) )
                    idLine = idField
                    if element.text and element.text.strip(): idLine += ' ' + element.text
                    self.addLine( 'id', idLine )
                elif element.tag == 'chapter': # milestone (not a container)
                    V = '0'
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                    # Process the attributes
                    chapterStyle = pubNumber = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            C = value
                        elif attrib=='style':
                            chapterStyle = value
                        elif attrib=='pubnumber':
                            pubNumber = value
                        else:
                            logging.error( _("LY76 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if chapterStyle != 'c':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) )
                    #if pubNumber: print( self.BBB, C, repr(pubNumber) ); halt
                    self.addLine( 'c', C )
                    if pubNumber: self.addLine( 'cp', pubNumber )
                elif element.tag == 'para':
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style
                    if BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( USFMMarker ):
                        #if lastMarker: self.addLine( lastMarker, lastText )
                        #lastMarker, lastText = USFMMarker, text
                        loadParagraph( element, sublocation )
                    elif BibleOrgSysGlobals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker -- append it to the previous line
                        text = element.text
                        if text is None: text = ''
                        if BibleOrgSysGlobals.debugFlag:
                            print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                            #halt # Not checked yet
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.BBB, C, V, USFMMarker ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) )
                        self.addPriorityError( 97, C, V, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) )
                        #lastText += '' if lastText.endswith(' ') else ' ' # Not always good to add a space, but it's their fault!
                        lastText =  '\\' + USFMMarker + ' ' + text
                        #print( "{} {} {} Now have {}:{!r}".format( self.BBB, C, V, lastMarker, lastText ) )
                    else: # the line begins with an unknown USFM Marker
                        try: status = element.attrib['status']
                        except KeyError: status = None
                        text = element.text
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.BBB, C, V, USFMMarker ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) )
                        self.addPriorityError( 100, C, V, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) )
                        if status == 'unknown': # USX exporter already knew it was a bad marker
                            pass # Just drop it completely
                        else:
                            for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                                if USFMMarker.startswith( tryMarker ): # Let's try changing it
                                    if lastMarker: self.addLine( lastMarker, lastText )
                                    lastMarker, lastText = tryMarker, USFMMarker[len(tryMarker):] + ' ' + text
                                    loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to {!r} at beginning of line: {}").format( self.BBB, C, V, USFMMarker, tryMarker, text ) )
                                    logging.warning( _("Changed '\\{}' unknown USFM Marker to {!r} after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.BBB, C, V, text ) )
                                    break
                        # Otherwise, don't bother processing this line -- it'll just cause more problems later on
                else:
                    logging.error( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, sublocation ) )
                    self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) )

        if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
Exemplo n.º 5
0
class OpenSongXMLBible(Bible):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    The source is a single XML file whose root element is <bible>, holding
    <b> (book) elements, which hold <c> (chapter) elements, which hold
    <v> (verse) elements.  Each recognised book is converted into
    USFM-style lines on a BibleBook and stashed on this object.
    """
    treeTag = 'bible'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Constructor: just sets up the XML Bible file converter object.

        sourceFolder and givenName are joined to give the path of the XML
        file; encoding is only remembered here.  Nothing is parsed until
        load() is called.
        """
        if BibleOrgSysGlobals.debugFlag:
            print("OpenSongXMLBible( {}, {}, {} )".format(
                sourceFolder, givenName, encoding))
        # Setup and initialise the base class first
        Bible.__init__(self)
        self.objectNameString = 'OpenSong XML Bible object'
        self.objectTypeString = 'OpenSong'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        self.tree = None  # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')

        # Do a preliminary check on the readability of our file
        # (only reported here -- load() will still try to parse the file)
        if not os.access(self.sourceFilepath, os.R_OK):
            print("OpenSongXMLBible: File {!r} is unreadable".format(
                self.sourceFilepath))

        self.name = self.givenName

    # end of OpenSongXMLBible.__init__

    def load(self):
        """
        Load a single source XML file and load book elements.

        Parses self.sourceFilepath into self.tree, hands every <b> element
        to the book extractor, logs anything unexpected, and finally calls
        self.doPostLoadProcessing().
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Loading {}…").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if BibleOrgSysGlobals.debugFlag:
            assert len(
                self.tree)  # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == OpenSongXMLBible.treeTag:
            location = "XML file"
            BibleOrgSysGlobals.checkXMLNoText(self.tree, location, '4f6h')
            BibleOrgSysGlobals.checkXMLNoTail(self.tree, location, '1wk8')

            for attrib, value in self.tree.items():
                # 'n' (name) and 'sn' (short name) are recognised, but
                #   their values are not currently used
                if attrib not in ("n", "sn"):
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in main element".
                        format(attrib, value))

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == OpenSongXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText(element, sublocation,
                                                      'g3g5')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation,
                                                      'd3f6')
                    self.__validateAndExtractBook(element)
                elif element.tag in ('OT', 'NT'):
                    pass  # Accepted silently and not processed
                else:
                    logging.error("Expected to find {!r} but got {!r}".format(
                        OpenSongXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load {!r} but got {!r}".format(
                OpenSongXMLBible.treeTag, self.tree.tag))
        self.doPostLoadProcessing()

    # end of OpenSongXMLBible.load

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book's 'n' attribute is mapped to a BBB book code, first via
        self.genericBOS and then via the (lazily loaded) BibleBooksNames
        table.  If no code can be found, an error is logged and the book
        is skipped; otherwise a BibleBook is built and stashed.
        """
        global BibleBooksNames

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating OpenSong XML book…"))

        # Process the div attributes first
        bookName = None
        for attrib, value in book.items():
            if attrib == "n":
                bookName = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in book element".format(
                        attrib, value))
        if not bookName:
            logging.error(
                _("OpenSong load can't find a book name"))  # no bookName
            return

        BBB = self.genericBOS.getBBBFromText(
            bookName)  # Booknames are usually in English
        if not BBB:  # wasn't English
            if BibleBooksNames is None:
                BibleBooksNames = BibleBooksNamesSystems().loadData()
            BBB = BibleBooksNames.getBBBFromText(
                bookName)  # Try non-English booknames
        if not BBB:
            logging.error(
                _("OpenSong load doesn't recognize book name: {!r}").
                format(bookName))  # no BBB
            return

        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Validating {} {}…").format(BBB, bookName))
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = 'OpenSong XML Bible Book object'
        thisBook.objectTypeString = 'OpenSong'
        USFMAbbreviation = BibleOrgSysGlobals.BibleBooksCodes.getUSFMAbbreviation(
            BBB)
        # The book name from the XML is used for both the header and the title
        thisBook.addLine(
            'id', '{} imported by {}'.format(USFMAbbreviation.upper(),
                                             ProgNameVersion))
        thisBook.addLine('h', bookName)
        thisBook.addLine('mt1', bookName)
        for element in book:
            if element.tag == OpenSongXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                BibleOrgSysGlobals.checkXMLNoText(
                    element, sublocation, 'j3jd')
                BibleOrgSysGlobals.checkXMLNoTail(
                    element, sublocation, 'al1d')
                self.__validateAndExtractChapter(BBB, thisBook, element)
            else:
                logging.error(
                    "Expected to find {!r} but got {!r}".format(
                        OpenSongXMLBible.chapterTag, element.tag))
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print("  Saving {} into results…".format(BBB))
        self.stashBook(thisBook)

    # end of OpenSongXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        BBB is the book code (used here only in messages), thisBook is the
        BibleBook that receives the 'c', 'v' and 'q1' lines, and chapter is
        the XML chapter element.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML chapter…"))

        # Process the div attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            elif attrib == "VERSES":
                pass  # Recognised, but the value is not currently used
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            chapterNumber = chapterNumber.replace(
                'of Solomon ', '')  # Fix a mistake in the Chinese_SU module
            thisBook.addLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'n' attribute in chapter element for {}".format(BBB))

        sublocation = "verse in {} {}".format(BBB, chapterNumber)
        for element in chapter:
            if element.tag != OpenSongXMLBible.verseTag:
                logging.error("Expected to find {!r} but got {!r}".format(
                    OpenSongXMLBible.verseTag, element.tag))
                continue

            BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'l5ks')
            verseNumber = None
            for attrib, value in element.items():
                if attrib == "n":
                    verseNumber = value
                elif attrib == "t":
                    # NOTE(review): presumably the end of a verse range --
                    #   recognised, but the value is not currently used
                    pass
                else:
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in verse element".
                        format(attrib, value))
            if BibleOrgSysGlobals.debugFlag: assert verseNumber
            if verseNumber is None:
                # Without a number we can't build the 'v' line (the string
                #   concatenation below would raise TypeError), so skip it
                logging.error(
                    "Missing 'n' attribute in verse element for {} {}".format(
                        BBB, chapterNumber))
                continue

            # Assemble the verse text from the element text, any italic
            #   subelements, and the element tail
            vText = element.text or ''
            for subelement in element:
                sub2location = "{} in {}".format(subelement.tag,
                                                 sublocation)
                BibleOrgSysGlobals.checkXMLNoAttributes(
                    subelement, sub2location, 'ks03')
                BibleOrgSysGlobals.checkXMLNoSubelements(
                    subelement, sub2location, 'ks05')
                if subelement.tag == 'i':
                    # ElementTree gives None (not '') for missing text/tail,
                    #   which must not be formatted into the verse as "None"
                    vText += '\\it {}\\it*{}'.format(
                        subelement.text or '', subelement.tail or '')
                else:
                    logging.error(
                        "Expected to find 'i' but got {!r}".format(
                            subelement.tag))
            vText += element.tail or ''
            if not vText:
                logging.warning("{} {}:{} has no text".format(
                    BBB, chapterNumber, verseNumber))
                continue

            # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:  # This is how they represent poetry
                for j, textBit in enumerate(vText.split('\n')):
                    if j == 0:
                        # The verse number goes with the first poetry line
                        thisBook.addLine('q1', '')
                        thisBook.addLine('v',
                                         verseNumber + ' ' + textBit)
                    else:
                        thisBook.addLine('q1', textBit)
            else:  # Just one verse line
                thisBook.addLine('v', verseNumber + ' ' + vText)
Exemplo n.º 6
0
class USFXXMLBible( Bible ):
    """
    Class to load and manipulate USFX Bibles.

    """
    def __init__( self, sourceFolder, givenName=None, encoding='utf-8' ):
        """
        Create the internal USFX Bible object.

        Remembers the parameters, works out a name for the Bible, and then
        scans sourceFolder for USFX XML files.  self.sourceFilename and
        self.sourceFilepath are only set if exactly one such file is found;
        otherwise they are left as None.

        Parameters:
            sourceFolder: the folder which is scanned for the USFX file.
            givenName: optional name -- if empty, the folder name is used
                (or "USFX Bible" if that is empty too).
            encoding: only remembered here (as self.encoding).
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = "USFX XML Bible object"
        self.objectTypeString = "USFX"

        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding # Remember our parameters

        # Now we can set our object variables
        self.name = self.givenName
        if not self.name: self.name = os.path.basename( self.sourceFolder )
        if not self.name: self.name = os.path.basename( self.sourceFolder[:-1] ) # Remove the final slash
        if not self.name: self.name = "USFX Bible"
        if self.name.endswith( '_usfx' ): self.name = self.name[:-5] # Remove end of name for Haiola projects

        # Do a preliminary check on the readability of our folder
        if not os.access( self.sourceFolder, os.R_OK ):
            logging.error( "USFXXMLBible: Folder {!r} is unreadable".format( self.sourceFolder ) )

        # Do a preliminary check on the contents of our folder
        self.sourceFilename = self.sourceFilepath = None
        foundFiles, foundFolders = [], []
        for something in os.listdir( self.sourceFolder ):
            somepath = os.path.join( self.sourceFolder, something )
            if os.path.isdir( somepath ): foundFolders.append( something )
            elif os.path.isfile( somepath ):
                somethingUpper = something.upper()
                somethingUpperExt = os.path.splitext( somethingUpper )[1]
                if any( somethingUpper.endswith( ending ) for ending in filenameEndingsToIgnore ): continue
                if somethingUpperExt[1:] not in extensionsToIgnore: # Compare without the first dot
                    foundFiles.append( something )
            else: logging.error( "Not sure what {!r} is in {}!".format( somepath, self.sourceFolder ) )
        if foundFolders: logging.info( "USFXXMLBible: Surprised to see subfolders in {!r}: {}".format( self.sourceFolder, foundFolders ) )
        if not foundFiles:
            if BibleOrgSysGlobals.verbosityLevel > 0: print( "USFXXMLBible: Couldn't find any files in {!r}".format( self.sourceFolder ) )
            return # No use continuing

        # Peek into each remaining file to see if it looks like USFX XML
        numFound = 0
        lastFilenameFound = None
        for thisFilename in sorted( foundFiles ):
            firstLines = BibleOrgSysGlobals.peekIntoFile( thisFilename, sourceFolder, numLines=3 )
            if not firstLines or len(firstLines)<2: continue
            if not firstLines[0].startswith( ( '<?xml version="1.0"', '\ufeff<?xml version="1.0"' ) ): # second one is the same but with BOM
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "USFXB (unexpected) first line was {!r} in {}".format( firstLines, thisFilename ) )
                continue
            # The root tag may share the line with the XML declaration or be on the following line
            if "<usfx " not in firstLines[0] and "<usfx " not in firstLines[1]:
                continue
            lastFilenameFound = thisFilename
            numFound += 1
        if numFound:
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "USFXXMLBible got", numFound, sourceFolder, lastFilenameFound )
            if numFound == 1: # With more than one candidate we can't tell which file to load
                self.sourceFilename = lastFilenameFound
                self.sourceFilepath = os.path.join( self.sourceFolder, self.sourceFilename )
        elif BibleOrgSysGlobals.verbosityLevel > 2: print( "USFXXMLBible: No USFX files found in {!r}".format( self.sourceFolder ) )
    # end of USFXXMLBible.__init__


    def load( self ):
        """
        Load the XML data file -- we should already know the filepath.

        Parses self.sourceFilepath into self.tree, records the schema location
        and the language code, and passes every book element to self.loadBook().
        If no books end up loaded, the files in self.sourceFolder are scanned
        for ones whose first line starts with a backslash-id marker and these
        are loaded as individual books.

        If no source file was identified, or if the XML can't be parsed, the
        problem is logged as critical and we return without doing anything else.
        Otherwise self.doPostLoadProcessing() is called at the end.
        """
        if BibleOrgSysGlobals.verbosityLevel > 1:
            print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) )

        # The filepath is only set if exactly one USFX file was found in the folder
        if self.sourceFilepath is None:
            logging.critical( "USFXXMLBible.load: no USFX file was identified in {!r}.".format( self.sourceFolder ) )
            return

        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( "USFXXMLBible.load: failed loading the xml file {}: {!r}.".format( self.sourceFilepath, err ) )
            return
        if BibleOrgSysGlobals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        # Find the main (usfx) container
        if self.tree.tag == 'usfx':
            location = "USFX file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )
            # Process the attributes first
            self.schemaLocation = None
            for attrib,value in self.tree.items():
                if attrib.endswith("SchemaLocation"):
                    self.schemaLocation = value
                else:
                    logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) )
            BBB = C = V = None # Only used in the warning message below
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'languageCode':
                    self.languageCode = element.text
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'cff3' )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'des1' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'dwf2' )
                elif element.tag == 'book':
                    self.loadBook( element )
                else:
                    logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) )

        if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here
            if BibleOrgSysGlobals.verbosityLevel > 2:
                print( "USFXXMLBible.load: Didn't find any regularly named USFX files in {!r}".format( self.sourceFolder ) )
            # The constructor's list of files isn't kept, so get the regular files in the folder again
            candidateFilenames = sorted( something for something in os.listdir( self.sourceFolder )
                                            if os.path.isfile( os.path.join( self.sourceFolder, something ) ) )
            for thisFilename in candidateFilenames:
                # Look for BBB in the ID line (which should be the first line in the file)
                thisPath = os.path.join( self.sourceFolder, thisFilename )
                try:
                    with open( thisPath ) as possibleUSXFile: # Automatically closes the file when done
                        firstLine = possibleUSXFile.readline() # We only look at the first line
                except (OSError, UnicodeDecodeError):
                    continue # Unreadable or non-text files can't be candidates
                if not firstLine.startswith( '\\id ' ): continue
                USXId = firstLine[4:].strip()[:3] # Take the first three non-blank characters after the space after id
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Have possible USFX ID {!r}".format( USXId ) )
                BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( USXId )
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "BBB is {!r}".format( BBB ) )
                UBB = USFXXMLBibleBook( self, BBB )
                UBB.load( self.sourceFolder, thisFilename, self.encoding )
                UBB.validateMarkers()
                print( UBB )
                self.books[BBB] = UBB
                # Make up our book name dictionaries while we're at it
                assumedBookNames = UBB.getAssumedBookNames()
                for assumedBookName in assumedBookNames:
                    self.BBBToNameDict[BBB] = assumedBookName
                    assumedBookNameLower = assumedBookName.lower()
                    self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces)
            if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) )
        self.doPostLoadProcessing()
    # end of USFXXMLBible.load


    def loadBook( self, bookElement ):
        """
        Load the book container from the XML data file.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) )
        assert( bookElement.tag == 'book' )
        mainLocation = self.name + " USFX book"

        # Process the attributes first
        bookCode = None
        for attrib,value in bookElement.items():
            if attrib == 'id':
                bookCode = value
            else:
                logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) )
        BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( bookCode )
        mainLocation = "{} USFX {} book".format( self.name, BBB )
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) )
        BibleOrgSysGlobals.checkXMLNoText( self.tree, mainLocation, '4f6h' )
        BibleOrgSysGlobals.checkXMLNoTail( self.tree, mainLocation, '1wk8' )

        # Now create our actual book
        self.thisBook = BibleBook( self, BBB )
        self.thisBook.objectNameString = "USFX XML Bible Book object"
        self.thisBook.objectTypeString = "USFX"

        C = V = '0'
        for element in bookElement:
            #print( "element", repr(element.tag) )
            location = "{} of {} {}:{}".format( element.tag, mainLocation, BBB, C, V )
            if element.tag == 'id':
                idText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'vsg3' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ksq2' )
                for attrib,value in element.items():
                    if attrib == 'id':
                        assert( value == bookCode )
                    else:
                        logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'id', bookCode + ((' '+idText) if idText else '') )
            elif element.tag == 'ide':
                ideText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'jsa0' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ls01' )
                charset = None
                for attrib,value in element.items():
                    if attrib == 'charset': charset = value
                    else:
                        logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'ide', charset + ((' '+ideText) if ideText else '') )
            elif element.tag == 'h':
                hText = element.text
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'dj35' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'hs35' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'hs32' )
                self.thisBook.addLine( 'h', clean(hText) )
            elif element.tag == 'toc':
                tocText = element.text
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ss13' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js13' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems compulsory
                        level = value
                    else:
                        logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'toc'+level, clean(tocText) )
            elif element.tag == 'c':
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' ) # This is a milestone
                for attrib,value in element.items():
                    if attrib == 'id':
                        C, V = value, '0'
                    else:
                        logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'c', C )
            elif element.tag == 's':
                sText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'wxg0' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems optional
                        level = value
                    else:
                        logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                marker = 's'
                if level: marker += level
                self.thisBook.addLine( marker, sText )
                for subelement in element:
                    #print( "subelement", repr(subelement.tag) )
                    sublocation = subelement.tag + " of " + location
                    if subelement.tag == 'f':
                        self.loadFootnote( subelement, sublocation, BBB, C, V )
                    elif subelement.tag == 'x':
                        self.loadCrossreference( subelement, sublocation )
                    elif subelement.tag == 'fig':
                        self.loadFigure( subelement, sublocation )
                    elif subelement.tag == 'table':
                        self.loadTable( subelement, sublocation )
                    elif subelement.tag in ('add','it','bd','bdit','sc',):
                        self.loadCharacterFormatting( subelement, sublocation, BBB, C, V )
                    elif subelement.tag == 'optionalLineBreak':
                        print( "What is loadBook optionalLineBreak?" )
                    else:
                        logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) )
            elif element.tag in ('p','q','d',):
                V = self.loadParagraph( element, location, BBB, C )
            elif element.tag == 'b':
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'nd04' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' )
                self.thisBook.addLine( 'b', '' )
            elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'od01' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'gd92' )
                idField = None
                for attrib,value in element.items():
                    if attrib == 'id': idField = value
                    else:
                        logging.warning( _("dv35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if idField and text is None:
                    text = idField
                else:
                    logging.warning( _("dve4 Unprocessed idField ({}) in {}").format( idField, location ) )
                if text is None:
                    logging.critical( "Why is {} empty at {}".format( marker, location ) )
                assert( text is not None )
                self.thisBook.addLine( marker, text )
            elif element.tag == 'table':
                self.loadTable( element, location )
            elif element.tag == 've': # What's this in Psalms: <c id="4" /><ve /><d>For the Chief Musician; on stringed instruments. A Psalm of David.</d>
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'kds3' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ks29' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'kj24' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js91' )
                #self.thisBook.addLine( 'b', '' )
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring 've' field", BBB, C, V )
            else:
                logging.critical( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) )
                #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
        self.saveBook( self.thisBook )
    # end of USFXXMLBible.loadBook


    def loadParagraph( self, paragraphElement, paragraphLocation, BBB, C ):
        """
        Load one paragraph-level container (the caller dispatches p, q and d elements here)
            from the XML data and add its content to self.thisBook.

        A new line is started for the paragraph itself, using the element tag as the marker
            (replaced by the 'sfm' attribute if given, and suffixed with the 'level' attribute if given).
        The child elements (verse milestones, footnotes, cross-references, figures, tables,
            character formatting, etc.) are then processed in document order.

        Parameters:
            paragraphElement: the XML element for the paragraph.
            paragraphLocation: description of where we are, used in the log messages.
            BBB: book code, passed on to the footnote and character formatting loaders.
            C: current chapter, used in log messages and passed on to the sub-loaders.

        Returns the id of the last verse milestone found inside the paragraph,
            or None if the paragraph contained no verse milestone.
        """
        #if BibleOrgSysGlobals.verbosityLevel > 3:
            #print( _("USFXXMLBible.loadParagraph: Loading {} from {}...").format( self.name, self.sourceFolder ) )

        V = None # No verse milestone seen yet in this paragraph
        pTag, pText = paragraphElement.tag, clean(paragraphElement.text)
        BibleOrgSysGlobals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' )

        # Process the attributes first
        sfm = level = style = None
        for attrib,value in paragraphElement.items():
            if attrib == 'sfm': sfm = value
            elif attrib == 'level': level = value
            elif attrib == 'style': style = value
            else:
                logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) )

        # Work out the marker for the new line
        if sfm: # An explicit sfm attribute overrides the tag (only expected on p elements)
            assert( pTag == 'p' )
            pTag = sfm
        if level: # e.g., q + 2 -> q2
            #assert( pTag == 'q' ) # Could also be mt, etc.
            pTag += level
        if style: # The style attribute is reported but otherwise discarded
            #print( repr(pTag), repr(pText), repr(style) )
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring {!r} style".format( style ) )

        # Start the paragraph line (clean() may have given us None for the text)
        self.thisBook.addLine( pTag, '' if pText is None else pText )

        # Now process the contents of the paragraph in order
        for element in paragraphElement:
            location = element.tag + " of " + paragraphLocation
            #print( "element", repr(element.tag) )
            if element.tag == 'v': # verse milestone
                vTail = clean( element.tail ) # Main verse text
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'crc2' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'lct3' )
                lastV, V = V, None # NOTE: lastV is not used again in this method
                for attrib,value in element.items():
                    if attrib == 'id':
                        V = value
                    else:
                        logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                # The id attribute is compulsory and must be non-empty
                assert( V is not None )
                assert( V )
                self.thisBook.addLine( 'v', V + ((' '+vTail) if vTail else '' ) )
            elif element.tag == 've': # verse end milestone -- we can just ignore this
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'lsc3' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'mfy4' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'bd24' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ks35' )
            elif element.tag == 'fig':
                self.loadFigure( element, location )
            elif element.tag == 'table':
                self.loadTable( element, location )
            elif element.tag == 'f':
                #print( "USFX.loadParagraph Found footnote at", paragraphLocation, C, V, repr(element.text) )
                self.loadFootnote( element, location, BBB, C, V )
            elif element.tag == 'x':
                #print( "USFX.loadParagraph Found xref at", paragraphLocation, C, V, repr(element.text) )
                self.loadCrossreference( element, location )
            elif element.tag in ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting
                self.loadCharacterFormatting( element, location, BBB, C, V )
            elif element.tag == 'cs': # character style -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kf92' )
                sfm = None
                for attrib,value in element.items():
                    if attrib == 'sfm': sfm = value
                    else:
                        logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if sfm not in ('w','ior',): print( "cs sfm got", repr(sfm) ) # Report any values we haven't met before
                # The sfm attribute value is used as the inline (character) marker
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( sfm, text, sfm, (' '+tail) if tail else '' ) )
            elif element.tag in ('cp',): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'kdf0' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'lkj1' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'da13' )
                self.thisBook.addLine( marker, text )
            elif element.tag == 'ref': # encoded reference -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'bd83' )
                target = None
                for attrib,value in element.items():
                    if attrib == 'tgt': target = value
                    else:
                        logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                #if target not in ('w','ior',): print( "ref sfm got", repr(sfm) )
                # The target goes inside the \ref...\ref* pair and the displayed text follows it
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) )
                #print( "Saved", '\\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) )
            elif element.tag == 'optionalLineBreak':
                print( "What is loadParagraph optionalLineBreak?" )
                # NOTE(review): 'halt' is not defined in the visible code -- presumably a deliberate NameError to stop when debugging; confirm
                if BibleOrgSysGlobals.debugFlag: halt
            elif element.tag == 'milestone': # e.g., <milestone sfm="pb" attribute=""/> (pb = explicit page break)
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'jzx2' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ms23' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'dw24' )
                sfm = None
                for attrib,value in element.items():
                    if attrib == 'sfm': sfm = value
                    else:
                        logging.warning( _("mcd2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if sfm not in ('pb',): print( "milestone sfm got", repr(sfm) ) # Report any values we haven't met before
                # The milestone becomes an empty line with the sfm value as its marker
                self.thisBook.addLine( sfm, '' )
            else:
                logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.BBB, C, V, location ) )
        return V
    # end of USFXXMLBible.loadParagraph


    def loadCharacterFormatting( self, element, location, BBB, C, V ):
        """
        Append a character formatting element (e.g., add, it, bd) to the last line
            of the current book as an inline marker pair, i.e., ' \\marker text...\\marker*'.

        Footnote subelements are handed to loadFootnote;
            any other subelement is logged as unprocessed.

        Parameters:
            element: the XML character formatting element (its tag is used as the marker).
            location: description of where we are, used in the log messages.
            BBB, C, V: current book code, chapter and verse,
                used in log messages and passed on to loadFootnote.
        """
        marker, text, tail = element.tag, clean(element.text), clean(element.tail)
        BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'sd12' )
        # The cleaned text can be None (other code in this file guards against that),
        #   e.g., when the element starts directly with a footnote,
        #   so make sure that we never write the string 'None' into the Bible text
        self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, '' if text is None else text ) )
        for subelement in element:
            sublocation = subelement.tag + " of " + location
            #print( "element", repr(element.tag) )
            if subelement.tag == 'f':
                #print( "USFX.loadParagraph Found footnote at", sublocation, C, V, repr(subelement.text) )
                self.loadFootnote( subelement, sublocation, BBB, C, V )
            else:
                logging.warning( _("sf31 Unprocessed {} element after {} {}:{} in {}").format( repr(subelement.tag), self.thisBook.BBB, C, V, location ) )
                # NOTE(review): 'halt' is not defined in the visible code -- presumably a deliberate NameError to stop when debugging; confirm
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
        # Close the marker, followed by any text that came after the element
        self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, (' '+tail) if tail else '' ) )
    # end of USFXXMLBible.loadCharacterFormatting


    def loadFigure( self, element, location ):
        """
        Convert a fig element into an inline \\fig ... \\fig* field
            which is appended to the last line of the current book.

        The element itself must have no text and no attributes.
        Each subelement must be one of the seven known figure fields
            and must have no tail, attributes, or subelements of its own.
        The field values are written out in a fixed order, separated by '|',
            with an empty string for any field that was not supplied.

        Parameters:
            element: the XML fig element.
            location: description of where we are, used by the XML checks.
        """
        BibleOrgSysGlobals.checkXMLNoText( element, location, 'ff36' )
        BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'cf35' )

        # The output order of the fields is fixed
        fieldOrder = ( 'description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference', )
        fieldValues = dict.fromkeys( fieldOrder, '' ) # Everything defaults to empty

        for child in element:
            childLocation = child.tag + " of " + location
            fieldName = child.tag
            cleanedText = clean( child.text )
            assert( fieldName in fieldValues )
            fieldValues[fieldName] = cleanedText if cleanedText is not None else ''
            BibleOrgSysGlobals.checkXMLNoTail( child, childLocation, 'jkf5' )
            BibleOrgSysGlobals.checkXMLNoAttributes( child, childLocation, 'ld18' )
            BibleOrgSysGlobals.checkXMLNoSubelements( child, childLocation, 'hb46' )

        pipedFields = '|'.join( fieldValues[fieldName] for fieldName in fieldOrder )
        trailingText = clean( element.tail )
        suffix = (' '+trailingText) if trailingText else '' # Any text following the figure stays on the same line
        self.thisBook.appendToLastLine( ' \\fig {}\\fig*{}'.format( pipedFields, suffix ) )
    # end of USFXXMLBible.loadFigure


    def loadTable( self, element, location ):
        """
        Load a table element into the current book.

        Each tr (row) subelement starts a new 'tr' line,
            and each of its cells (th, thr, tc, tcr, optionally with a 'level' attribute)
            is appended to that line as ' \\marker text'.
        Any subelement of the table other than tr is logged as unprocessed.

        Parameters:
            element: the XML table element (must have no text, tail, or attributes).
            location: description of where we are, used in the log messages.
        """
        BibleOrgSysGlobals.checkXMLNoText( element, location, 'kg92' )
        BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ka92' )
        BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'ks63' )
        for subelement in element:
            sublocation = subelement.tag + " of " + location
            if subelement.tag == 'tr':
                #print( "table", sublocation )
                self.thisBook.addLine( 'tr', '' )
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'sg32' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'dh82' )
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'mniq' )
                for sub2element in subelement:
                    sub2location = sub2element.tag + " of " + sublocation
                    tag, text = sub2element.tag, clean(sub2element.text)
                    assert( tag in ('th', 'thr', 'tc', 'tcr',) )
                    BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'ah82' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' )
                    level = None
                    for attrib,value in sub2element.items():
                        if attrib == 'level': level = value
                        else:
                            # Report the location of the cell (which owns the attribute), not of the whole table
                            logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                    marker = tag + (level if level else '')
                    # An empty cell gives us None for the text -- don't write the string 'None' into the table
                    self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, '' if text is None else text ) )
            else:
                # We aren't given the chapter and verse here (they used to be referenced
                #   in this message and caused a NameError), so only the book code is reported
                logging.warning( _("kv64 Unprocessed {} element after {} in {}").format( subelement.tag, self.thisBook.BBB, sublocation ) )
    # end of USFXXMLBible.loadTable


    def loadFootnote( self, element, location, BBB, C, V ):
        """
        Handles footnote fields, including xt field.

        The footnote is appended to the last line of the current book in the form
            ' \\f caller text ... \\f*' followed by any text that came after the footnote element.
        The subelements (fr, ft, fq, ref, etc.) are appended in between, in document order.

        Parameters:
            element: the XML f (footnote) element.
            location: description of where we are, used in the log messages.
            BBB, C, V: current book code, chapter and verse,
                used in messages and passed on to loadCharacterFormatting.
        """
        text, tail = clean(element.text), clean(element.tail)
        caller = None
        for attrib,value in element.items():
            if attrib == 'caller':
                caller = value
            else:
                logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
        # Open the footnote
        # NOTE(review): if there's no caller attribute, the string 'None' is written here -- confirm whether that can occur
        self.thisBook.appendToLastLine( ' \\f {}{}'.format( caller, (' '+text) if text else '' ) )
        for subelement in element:
            sublocation = subelement.tag + " of " + location
            marker, fText, fTail = subelement.tag, clean(subelement.text), clean(subelement.tail)
            #print( "USFX.loadFootnote", repr(caller), repr(text), repr(tail), repr(marker), repr(fText), repr(fTail) )
            #if BibleOrgSysGlobals.verbosityLevel > 0 and marker not in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',):
                #print( "USFX.loadFootnote found", repr(caller), repr(marker), repr(fText), repr(fTail) )
            if BibleOrgSysGlobals.debugFlag: assert( marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq','xt',) )
            if marker=='ref': # A reference directly inside the footnote
                assert( fText )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'ls13' )
                target = None
                for attrib,value in subelement.items():
                    if attrib == 'tgt': target = value
                    else:
                        logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                if target: # The target goes inside the \ref...\ref* pair and the displayed text follows it
                    self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, fText ) )
                # NOTE(review): 'halt' is not defined in the visible code -- presumably a deliberate NameError to stop here; confirm
                else: halt
            else: # Some other footnote field
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'dq54' )
                # Write the opening marker and the text of the field
                self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, fText ) )
                if marker=='xt' or marker[0]=='f': # Starts with f, e.g., fr, ft
                    # These fields may contain references and character formatting
                    for sub2element in subelement:
                        sub2location = sub2element.tag + " of " + sublocation
                        marker2, fText2, fTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail)
                        BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'js72' )
                        if marker2 == 'ref':
                            #print( sub2location )
                            if fText2: # Here the displayed text is written first
                                #print( 'ft2', marker2, repr(fText2), repr(fTail2), sub2location )
                                self.thisBook.appendToLastLine( fText2 )
                            target = None
                            for attrib,value in sub2element.items():
                                if attrib == 'tgt': target = value # OSIS style reference, e.g., '1SA.27.8'
                                else:
                                    logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                            if target: # ... followed by the target (no closing \ref* is written in this case)
                                #print( 'tg', marker2, repr(target) )
                                self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) )
                            else:
                                if debuggingThisModule: halt
                        elif marker2 in ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting
                            # NOTE(review): loadCharacterFormatting also appends the tail of the element,
                            #   so fTail2 seems to be appended a second time below -- confirm
                            self.loadCharacterFormatting( sub2element, sub2location, BBB, C, V )
                        else:
                            print( 'Ignored marker2', repr(marker2), BBB, C, V )
                            if debuggingThisModule: halt
                        if fTail2: self.thisBook.appendToLastLine( fTail2 )
                elif marker in ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting
                    # NOTE(review): the opening marker and text were already appended above,
                    #   and loadCharacterFormatting appends them again -- confirm whether this duplication is intended
                    self.loadCharacterFormatting( subelement, sublocation, BBB, C, V )
                else:
                    print( 'Ignored marker', repr(marker), BBB, C, V )
                    halt
            if fTail: # The closing marker is only written if some text follows the subelement
                self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, fTail ) )
        # Close the footnote, followed by any text that came after it
        self.thisBook.appendToLastLine( '\\f*{}'.format( (' '+tail) if tail else '' ) )
    # end of USFXXMLBible.loadFootnote


    def loadCrossreference( self, element, location ):
        """
        Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x>

        The cross-reference is appended to the last line of the current book in the form
            ' \\x caller ... \\x*' followed by any text that came after the cross-reference element.
        The subelements (ref, xo, xt) are appended in between, in document order.

        Parameters:
            element: the XML x (cross-reference) element.
            location: description of where we are, used in the log messages.
        """
        text, tail = clean(element.text), clean(element.tail) # NOTE: text is not used again in this method
        caller = None
        for attrib,value in element.items():
            if attrib == 'caller':
                caller = value
            else:
                logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
        # Open the cross-reference
        # NOTE(review): if there's no caller attribute, the string 'None' is written here -- confirm whether that can occur
        self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) )
        for subelement in element:
            sublocation = subelement.tag + " of " + location
            marker, xText, xTail = subelement.tag, clean(subelement.text), clean(subelement.tail)
            #print( "USFX.loadCrossreference", repr(caller), repr(text), repr(tail), repr(marker), repr(xText), repr(xTail) )
            #if BibleOrgSysGlobals.verbosityLevel > 0 and marker not in ('ref','xo','xt',):
                #print( "USFX.loadCrossreference found", repr(caller), repr(marker), repr(xText), repr(xTail) )
            if BibleOrgSysGlobals.debugFlag: assert( marker in ('ref','xo','xt',) )
            if marker=='ref': # A reference directly inside the cross-reference
                assert( xText )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 's1sd' )
                target = None
                for attrib,value in subelement.items():
                    if attrib == 'tgt': target = value
                    else:
                        logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                if target: # The target goes inside the \ref...\ref* pair and the displayed text follows it
                    self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, xText ) )
                # NOTE(review): 'halt' is not defined in the visible code -- presumably a deliberate NameError to stop here; confirm
                else: halt
            else: # Some other cross-reference field
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'sc35' )
                # Write the opening marker and the text of the field
                self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, xText ) )
                if marker[0] == 'x': # Starts with x, e.g., xo, xt
                    # These fields may only contain references
                    for sub2element in subelement:
                        sub2location = sub2element.tag + " of " + sublocation
                        marker2, xText2, xTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail)
                        BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'fs63' )
                        if marker2=='ref':
                            if xText2: # Here the displayed text is written first
                                #print( 'xt2', marker2, repr(xText2), repr(xTail2), sub2location )
                                self.thisBook.appendToLastLine( xText2 )
                            target = None
                            for attrib,value in sub2element.items():
                                if attrib == 'tgt': target = value
                                else:
                                    logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                            # ... followed by the target (no closing \ref* is written in this case)
                            if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) )
                            else: halt
                        else: halt
                        if xTail2: self.thisBook.appendToLastLine( xTail2 )
                else: halt
            if xTail: # The closing marker is only written if some text follows the subelement
                self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, xTail ) )
        # Close the cross-reference, followed by any text that came after it
        self.thisBook.appendToLastLine( '\\x*{}'.format( (' '+tail) if tail else '' ) )
Exemplo n.º 7
0
class OpenSongXMLBible( Bible ):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    As parsed by the methods below, the file is expected to have
        a 'bible' root element containing 'b' (book) elements with an 'n' (name) attribute,
        each containing 'c' (chapter) elements with an 'n' (number) attribute,
        each containing 'v' (verse) elements with an 'n' (number) attribute and the verse text.
    """
    treeTag = 'bible'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'


    def __init__( self, sourceFolder, givenName, encoding='utf-8' ):
        """
        Constructor: just sets up the XML Bible file converter object.

        Parameters:
            sourceFolder: folder containing the XML file.
            givenName: filename of the XML file inside sourceFolder (also used as the Bible name).
            encoding: remembered in self.encoding (not otherwise used by this class).

        Nothing is loaded here (see load), but a message is printed if the file is unreadable.
        """
        # Setup and initialise the base class first
        if BibleOrgSysGlobals.debugFlag: print( "OpenSongXMLBible( {}, {}, {} )".format( sourceFolder, givenName, encoding ) )
        Bible.__init__( self )
        self.objectNameString = "OpenSong XML Bible object"
        self.objectTypeString = "OpenSong"

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath =  os.path.join( self.sourceFolder, self.givenName )

        self.tree = None # Will hold the XML data

        # Get the data tables that we need for proper checking
        #self.ISOLanguages = ISO_639_3_Languages().loadData()
        self.genericBOS = BibleOrganizationalSystem( "GENERIC-KJV-66-ENG" )

        # Do a preliminary check on the readability of our file
        if not os.access( self.sourceFilepath, os.R_OK ):
            print( "OpenSongXMLBible: File {!r} is unreadable".format( self.sourceFilepath ) )

        self.name = self.givenName
        #if self.name is None:
            #pass
    # end of OpenSongXMLBible.__init__


    def load( self ):
        """
        Load a single source XML file and load book elements.

        Parses self.sourceFilepath, checks that the root element is the expected one,
            and extracts and saves every book element found directly inside it.
        Problems with the structure are logged rather than raised.
        Finishes by calling self.doPostLoadProcessing().
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )
        self.tree = ElementTree().parse( self.sourceFilepath )
        if BibleOrgSysGlobals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == OpenSongXMLBible.treeTag:
            location = "XML file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

            # The name attributes are recognised (so no warning is given) but not currently used
            name = shortName = None
            for attrib,value in self.tree.items():
                if attrib=="n":
                    name = value
                elif attrib=="sn":
                    shortName = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in main element".format( attrib, value ) )

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == OpenSongXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' )
                    self.__validateAndExtractBook( element )
                elif element.tag == 'OT': # Accepted silently, but the contents are not processed
                    pass
                elif element.tag == 'NT': # Accepted silently, but the contents are not processed
                    pass
                else: logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.bookTag, element.tag ) )
        else: logging.error( "Expected to load {!r} but got {!r}".format( OpenSongXMLBible.treeTag, self.tree.tag ) )
        self.doPostLoadProcessing()
    # end of OpenSongXMLBible.load


    def __validateAndExtractBook( self, book ):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book name comes from the 'n' attribute and is converted to a book code
            using self.genericBOS.
        If that succeeds, a new BibleBook is created with id, h and mt1 lines,
            all of the chapters are extracted into it, and the book is saved.
        Otherwise an error is logged and the book is skipped.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating OpenSong XML book...") )

        # Process the div attributes first
        BBB = bookName = None
        for attrib,value in book.items():
            if attrib=="n":
                bookName = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) )
        if bookName:
            BBB = self.genericBOS.getBBB( bookName ) # Booknames are in English
            if BBB:
                if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}...").format( BBB, bookName ) )
                thisBook = BibleBook( self, BBB )
                thisBook.objectNameString = "OpenSong XML Bible Book object"
                thisBook.objectTypeString = "OpenSong"
                #thisBook.sourceFilepath = self.sourceFilepath
                # The XML has no header fields, so create some from the book name
                USFMAbbreviation = BibleOrgSysGlobals.BibleBooksCodes.getUSFMAbbreviation( BBB )
                thisBook.addLine( 'id', '{} imported by {}'.format( USFMAbbreviation.upper(), ProgNameVersion ) )
                thisBook.addLine( 'h', bookName )
                thisBook.addLine( 'mt1', bookName )
                for element in book:
                    if element.tag == OpenSongXMLBible.chapterTag:
                        sublocation = "chapter in {}".format( BBB )
                        BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'j3jd' )
                        BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                        self.__validateAndExtractChapter( BBB, thisBook, element )
                    else: logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.chapterTag, element.tag ) )
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "  Saving {} into results...".format( BBB ) )
                self.saveBook( thisBook )
            else: logging.error( _("OpenSong load doesn't recognize book name: {!r}").format( bookName ) ) # no BBB
        else: logging.error( _("OpenSong load can't find a book name") ) # no bookName
    # end of OpenSongXMLBible.__validateAndExtractBook


    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        Parameters:
            BBB: book code, used in messages.
            thisBook: the book object that the c, v and q1 lines are added to.
            chapter: the XML chapter element.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter...") )

        # Process the div attributes first
        # (VERSES is recognised so that no warning is given, but is not currently used)
        chapterNumber = numVerses = None
        for attrib,value in chapter.items():
            if attrib=="n":
                chapterNumber = value
            elif attrib=="VERSES":
                numVerses = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber:
            #print( BBB, 'c', chapterNumber )
            chapterNumber = chapterNumber.replace( 'of Solomon ', '' ) # Fix a mistake in the Chinese_SU module
            thisBook.addLine( 'c', chapterNumber )
        # (This message used to have no placeholder, so the book code was never displayed)
        else: logging.error( "Missing 'n' attribute in chapter element for {}".format( BBB ) )

        for element in chapter:
            if element.tag == OpenSongXMLBible.verseTag:
                sublocation = "verse in {} {}".format( BBB, chapterNumber )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'l5ks' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5f7h' )
                # (The t attribute is recognised so that no warning is given, but is not currently used)
                verseNumber = toVerseNumber = None
                for attrib,value in element.items():
                    if attrib=="n":
                        verseNumber = value
                    elif attrib=="t":
                        toVerseNumber = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert( verseNumber )
                #thisBook.addLine( 'v', verseNumber )
                vText = element.text
                if not vText: # Nothing is saved at all for an empty verse
                    logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, verseNumber ) )
                if vText: # This is the main text of the verse (follows the verse milestone)
                    #print( "{} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                    if '\n' in vText: # This is how they represent poetry
                        #print( "vText", repr(vText), repr(element.text) )
                        # The first text line gets the verse number (after starting a q1 paragraph),
                        #   and each following text line becomes its own q1 paragraph
                        for j, textBit in enumerate( vText.split( '\n' ) ):
                            if j==0:
                                thisBook.addLine( 'q1', '' )
                                thisBook.addLine( 'v', verseNumber + ' ' + textBit )
                            else: thisBook.addLine( 'q1', textBit )
                    else: # Just one verse line
                        thisBook.addLine( 'v', verseNumber + ' ' + vText )
            else: logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.verseTag, element.tag ) )
Exemplo n.º 8
0
class USFXXMLBible( Bible ):
    """
    Class to load and manipulate USFX Bibles.

    """
    def __init__( self, sourceFolder, givenName=None, encoding='utf-8' ):
        """
        Create the internal USFX Bible object.

        Remembers the parameters, works out a display name, and then scans
            sourceFolder for files whose first line starts with an XML
            declaration and contains a '<usfx ' tag.
        If exactly one such file is found, it is remembered in
            self.sourceFilename and self.sourceFilepath
            (otherwise both are left as None).

        Parameters:
            sourceFolder: the folder expected to contain the USFX XML file.
            givenName: optional name for this Bible -- if empty, a name is
                derived from the folder name.
            encoding: remembered in self.encoding (not otherwise used here).
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = "USFX XML Bible object"
        self.objectTypeString = "USFX"

        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding # Remember our parameters
        # Set these early so that they exist on every early-return path below
        self.sourceFilename = self.sourceFilepath = None

        # Now we can set our object variables
        self.name = self.givenName
        if not self.name: self.name = os.path.basename( self.sourceFolder )
        if not self.name: self.name = os.path.basename( self.sourceFolder[:-1] ) # Remove the final slash
        if not self.name: self.name = "USFX Bible"
        if self.name.endswith( '_usfx' ): self.name = self.name[:-5] # Remove end of name for Haiola projects

        # Do a preliminary check on the readability of our folder
        if not os.access( self.sourceFolder, os.R_OK ):
            logging.error( "USFXXMLBible: Folder '{}' is unreadable".format( self.sourceFolder ) )
            return # No use continuing -- os.listdir() would only raise on an unreadable folder

        # Do a preliminary check on the contents of our folder
        foundFiles, foundFolders = [], []
        for something in os.listdir( self.sourceFolder ):
            somepath = os.path.join( self.sourceFolder, something )
            if os.path.isdir( somepath ): foundFolders.append( something )
            elif os.path.isfile( somepath ):
                somethingUpper = something.upper()
                somethingUpperExt = os.path.splitext( somethingUpper )[1]
                if any( somethingUpper.endswith( ending ) for ending in filenameEndingsToIgnore ): continue
                if somethingUpperExt[1:] not in extensionsToIgnore: # Compare without the first dot
                    foundFiles.append( something )
            else: logging.error( "Not sure what '{}' is in {}!".format( somepath, self.sourceFolder ) )
        if foundFolders: logging.info( "USFXXMLBible: Surprised to see subfolders in '{}': {}".format( self.sourceFolder, foundFolders ) )
        if not foundFiles:
            if Globals.verbosityLevel > 0: print( "USFXXMLBible: Couldn't find any files in '{}'".format( self.sourceFolder ) )
            return # No use continuing

        # Peek into each candidate file to see if it really is a USFX file
        numFound = 0
        looksHopeful = False # Becomes True once we see any file starting with an XML declaration
        lastFilenameFound = None
        for thisFilename in sorted( foundFiles ):
            firstLines = Globals.peekIntoFile( thisFilename, sourceFolder, numLines=3 )
            if not firstLines or len(firstLines)<2: continue
            # The second alternative is the same declaration preceded by a BOM
            if not firstLines[0].startswith( ('<?xml version="1.0"', '\ufeff<?xml version="1.0"',) ):
                if Globals.verbosityLevel > 2: print( "USFXB (unexpected) first line was '{}' in {}".format( firstLines, thisFilename ) )
                continue
            looksHopeful = True
            if "<usfx " not in firstLines[0]:
                continue
            lastFilenameFound = thisFilename
            numFound += 1
        if numFound:
            if Globals.verbosityLevel > 2: print( "USFXXMLBible got", numFound, sourceFolder, lastFilenameFound )
            if numFound == 1: # With several candidates we can't know which one to load
                self.sourceFilename = lastFilenameFound
                self.sourceFilepath = os.path.join( self.sourceFolder, self.sourceFilename )
        elif looksHopeful and Globals.verbosityLevel > 2: print( "    Looked hopeful but no actual files found" )
    # end of USFXXMLBible.__init__


    def load( self ):
        """
        Load the XML data file -- we should already know the filepath.

        Parses self.sourceFilepath, records the schema location and the
            language code from the usfx root element, and hands each
            book element to self.loadBook().
        If no books got loaded, falls back to looking through the files in
            the source folder for one whose first line is an id line.
        Finishes by calling self.doPostLoadProcessing().

        Returns early (after logging a critical error) if no source file is
            known or if the XML cannot be parsed.
        """
        if Globals.verbosityLevel > 1:
            print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) )

        # The filepath is only set by __init__ when exactly one USFX file was found
        if self.sourceFilepath is None:
            logging.critical( "USFXXMLBible.load: no USFX file to load in '{}'".format( self.sourceFolder ) )
            return

        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( "USFXXMLBible.load: failed loading the xml file {}: '{}'.".format( self.sourceFilepath, err ) )
            return
        if Globals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        # Find the main (usfx) container
        if self.tree.tag == 'usfx':
            location = "USFX file"
            Globals.checkXMLNoText( self.tree, location, '4f6h' )
            Globals.checkXMLNoTail( self.tree, location, '1wk8' )
            # Process the attributes first
            self.schemaLocation = None
            for attrib,value in self.tree.items():
                if attrib.endswith("SchemaLocation"):
                    self.schemaLocation = value
                else:
                    logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) )
            BBB = C = V = None # Only used as placeholders in the warning message below
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'languageCode':
                    self.languageCode = element.text
                    Globals.checkXMLNoTail( element, sublocation, 'cff3' )
                    Globals.checkXMLNoAttributes( element, sublocation, 'des1' )
                    Globals.checkXMLNoSubelements( element, sublocation, 'dwf2' )
                elif element.tag == 'book':
                    self.loadBook( element )
                else:
                    logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) )
        else:
            logging.error( "USFXXMLBible.load: expected 'usfx' root element but got {!r} in {}".format( self.tree.tag, self.sourceFilepath ) )

        if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here
            if Globals.verbosityLevel > 2:
                print( "USFXXMLBible.load: Didn't find any regularly named USFX files in '{}'".format( self.sourceFolder ) )
            # List the folder again here, because the file list made by __init__ is local to that method
            for thisFilename in sorted( os.listdir( self.sourceFolder ) ):
                thisPath = os.path.join( self.sourceFolder, thisFilename )
                if not os.path.isfile( thisPath ): continue
                # Look for BBB in the ID line (which should be the first line in the file)
                #   Undecodable bytes are replaced so that stray binary files can't abort the scan
                with open( thisPath, encoding=self.encoding, errors='replace' ) as possibleUSXFile: # Automatically closes the file when done
                    firstLine = possibleUSXFile.readline() # We only look at the first line
                if not firstLine.startswith( '\\id ' ): continue
                USXId = firstLine[4:].strip()[:3] # Take the first three non-blank characters after the space after id
                if Globals.verbosityLevel > 2: print( "Have possible USFX ID '{}'".format( USXId ) )
                BBB = Globals.BibleBooksCodes.getBBBFromUSFM( USXId )
                if Globals.verbosityLevel > 2: print( "BBB is '{}'".format( BBB ) )
                # NOTE(review): USFXXMLBibleBook is not defined in the part of the file visible here -- confirm that it exists
                UBB = USFXXMLBibleBook( self, BBB )
                UBB.load( self.sourceFolder, thisFilename, self.encoding )
                UBB.validateMarkers()
                print( UBB )
                self.books[BBB] = UBB
                # Make up our book name dictionaries while we're at it
                for assumedBookName in UBB.getAssumedBookNames():
                    self.BBBToNameDict[BBB] = assumedBookName
                    assumedBookNameLower = assumedBookName.lower()
                    self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces)
            if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) )
        self.doPostLoadProcessing()
    # end of USFXXMLBible.load


    def loadBook( self, bookElement ):
        """
        Load the book container from the XML data file.

        Creates a new BibleBook in self.thisBook, converts each child of
            bookElement into marker lines on that book (delegating paragraphs,
            tables, notes, etc. to the other load methods), and finally
            passes the book to self.saveBook().

        Parameter:
            bookElement: the XML element for the book (its tag must be 'book').
        """
        if Globals.verbosityLevel > 3:
            print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) )
        assert( bookElement.tag == 'book' )
        mainLocation = self.name + " USFX book"

        # Process the attributes first
        bookCode = None
        for attrib,value in bookElement.items():
            if attrib == 'id':
                bookCode = value
            else:
                logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) )
        BBB = Globals.BibleBooksCodes.getBBBFromUSFM( bookCode )
        mainLocation = "{} USFX {} book".format( self.name, BBB )
        if Globals.verbosityLevel > 2:
            print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) )
        # Check the book element itself here (not the root of the whole file)
        Globals.checkXMLNoText( bookElement, mainLocation, '4f6h' )
        Globals.checkXMLNoTail( bookElement, mainLocation, '1wk8' )

        # Now create our actual book
        self.thisBook = BibleBook( self, BBB )
        self.thisBook.objectNameString = "USFX XML Bible Book object"
        self.thisBook.objectTypeString = "USFX"

        C = V = '0'
        for element in bookElement:
            location = "{} of {} {}:{}".format( element.tag, mainLocation, C, V )
            if element.tag == 'id':
                idText = clean( element.text )
                Globals.checkXMLNoTail( element, location, 'vsg3' )
                Globals.checkXMLNoSubelements( element, location, 'ksq2' )
                for attrib,value in element.items():
                    if attrib == 'id':
                        assert( value == bookCode )
                    else:
                        logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.appendLine( 'id', bookCode + ((' '+idText) if idText else '') )
            elif element.tag == 'ide':
                ideText = clean( element.text )
                Globals.checkXMLNoTail( element, location, 'jsa0' )
                Globals.checkXMLNoSubelements( element, location, 'ls01' )
                charset = None
                for attrib,value in element.items():
                    if attrib == 'charset': charset = value
                    else:
                        logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if charset is None:
                    logging.warning( _("Missing charset attribute in {}").format( location ) )
                # Join only the parts that we actually have (so a missing charset can't cause a TypeError)
                self.thisBook.appendLine( 'ide', ' '.join( bit for bit in (charset, ideText) if bit ) )
            elif element.tag == 'h':
                hText = element.text
                Globals.checkXMLNoTail( element, location, 'dj35' )
                Globals.checkXMLNoAttributes( element, location, 'hs35' )
                Globals.checkXMLNoSubelements( element, location, 'hs32' )
                self.thisBook.appendLine( 'h', clean(hText) )
            elif element.tag == 'toc':
                tocText = element.text
                Globals.checkXMLNoTail( element, location, 'ss13' )
                Globals.checkXMLNoSubelements( element, location, 'js13' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems compulsory
                        level = value
                    else:
                        logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                marker = 'toc'
                if level: marker += level
                else: # Keep going with the bare marker rather than failing on 'toc'+None
                    logging.error( _("Missing level attribute in {}").format( location ) )
                self.thisBook.appendLine( marker, clean(tocText) )
            elif element.tag == 'c':
                Globals.checkXMLNoText( element, location, 'ks35' )
                Globals.checkXMLNoTail( element, location, 'gs35' )
                Globals.checkXMLNoSubelements( element, location, 'kdr3' ) # This is a milestone
                for attrib,value in element.items():
                    if attrib == 'id':
                        C, V = value, '0' # A new chapter resets the verse number
                    else:
                        logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.appendLine( 'c', C )
            elif element.tag == 's':
                sText = clean( element.text )
                Globals.checkXMLNoTail( element, location, 'wxg0' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems optional
                        level = value
                    else:
                        logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                marker = 's'
                if level: marker += level
                self.thisBook.appendLine( marker, sText )
                # Anything nested inside the section heading gets appended to the heading line
                for subelement in element:
                    sublocation = subelement.tag + " of " + location
                    if subelement.tag == 'f':
                        self.loadFootnote( subelement, sublocation )
                    elif subelement.tag == 'x':
                        self.loadCrossreference( subelement, sublocation )
                    elif subelement.tag == 'fig':
                        self.loadFigure( subelement, sublocation )
                    elif subelement.tag == 'table':
                        self.loadTable( subelement, sublocation )
                    elif subelement.tag in ('add','it','bd','bdit','sc',):
                        self.loadCharacterFormatting( subelement, sublocation )
                    elif subelement.tag == 'optionalLineBreak':
                        print( "What is loadBook optionalLineBreak?" )
                    else:
                        logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) )
            elif element.tag in ('p','q','d',):
                V = self.loadParagraph( element, location, C ) # Returns the last verse number seen (or None)
            elif element.tag == 'b':
                Globals.checkXMLNoText( element, location, 'ks35' )
                Globals.checkXMLNoTail( element, location, 'gs35' )
                Globals.checkXMLNoAttributes( element, location, 'nd04' )
                Globals.checkXMLNoSubelements( element, location, 'kdr3' )
                self.thisBook.appendLine( 'b', '' )
            elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                Globals.checkXMLNoTail( element, location, 'od01' )
                Globals.checkXMLNoAttributes( element, location, 'us91' )
                Globals.checkXMLNoSubelements( element, location, 'gd92' )
                self.thisBook.appendLine( marker, text )
            elif element.tag == 'table':
                self.loadTable( element, location )
            else:
                logging.critical( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) )
                # NOTE(review): 'halt' is not defined in the visible code -- presumably
                #   evaluating it is meant to stop a debug run with a NameError -- confirm
                if Globals.debugFlag: halt
        self.saveBook( self.thisBook )
    # end of USFXXMLBible.loadBook


    def loadParagraph( self, paragraphElement, paragraphLocation, C ):
        """
        Load the paragraph (p, q or d) container from the XML data file.

        First starts a new line for the paragraph itself (so that the
            paragraph marker and any text before the first verse are not lost),
            then processes the verse milestones, notes, figures, tables and
            character formatting found inside the paragraph.

        Parameters:
            paragraphElement: the XML element for the paragraph.
            paragraphLocation: description of where we are (used in messages).
            C: the current chapter number string (only used in messages).

        Returns the id of the last verse milestone in this paragraph,
            or None if the paragraph contained no verse milestone.
        """
        V = None
        pTag, pText = paragraphElement.tag, clean( paragraphElement.text )
        Globals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' )

        # Process the attributes first
        sfm = level = style = None
        for attrib,value in paragraphElement.items():
            if attrib == 'sfm':
                sfm = value
            elif attrib == 'level':
                level = value
            elif attrib == 'style':
                style = value # Accepted (so that no warning is given) but not used
            else:
                logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) )

        # Start the paragraph line: an explicit sfm attribute overrides the
        #   element tag as the marker, and any level is added onto the end
        marker = sfm if sfm else pTag
        if level: marker += level
        self.thisBook.appendLine( marker, pText if pText else '' )

        for element in paragraphElement:
            location = element.tag + " of " + paragraphLocation
            if element.tag == 'v': # verse milestone
                vTail = clean( element.tail ) # Main verse text
                Globals.checkXMLNoText( element, location, 'crc2' )
                Globals.checkXMLNoSubelements( element, location, 'lct3' )
                V = None
                for attrib,value in element.items():
                    if attrib == 'id':
                        V = value
                    else:
                        logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                assert( V ) # A verse milestone must have a non-empty id
                self.thisBook.appendLine( 'v', V + ((' '+vTail) if vTail else '' ) )
            elif element.tag == 've': # verse end milestone -- we can just ignore this
                Globals.checkXMLNoText( element, location, 'lsc3' )
                Globals.checkXMLNoTail( element, location, 'mfy4' )
                Globals.checkXMLNoAttributes( element, location, 'bd24' )
                Globals.checkXMLNoSubelements( element, location, 'ks35' )
            elif element.tag == 'fig':
                self.loadFigure( element, location )
            elif element.tag == 'table':
                self.loadTable( element, location )
            elif element.tag == 'f':
                self.loadFootnote( element, location )
            elif element.tag == 'x':
                self.loadCrossreference( element, location )
            elif element.tag in ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting
                self.loadCharacterFormatting( element, location )
            elif element.tag == 'cs': # character style -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                Globals.checkXMLNoSubelements( element, location, 'kf92' )
                csSfm = None # Kept separate from the paragraph-level sfm above
                for attrib,value in element.items():
                    if attrib == 'sfm': csSfm = value
                    else:
                        logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if csSfm not in ('w','ior',): print( "cs sfm got", repr(csSfm) )
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( csSfm, text, csSfm, (' '+tail) if tail else '' ) )
            elif element.tag in ('cp',): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                Globals.checkXMLNoTail( element, location, 'kdf0' )
                Globals.checkXMLNoAttributes( element, location, 'lkj1' )
                Globals.checkXMLNoSubelements( element, location, 'da13' )
                self.thisBook.appendLine( marker, text )
            elif element.tag == 'ref': # encoded reference -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                Globals.checkXMLNoSubelements( element, location, 'bd83' )
                target = None
                for attrib,value in element.items():
                    if attrib == 'tgt': target = value
                    else:
                        logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) )
            elif element.tag == 'optionalLineBreak':
                print( "What is loadParagraph optionalLineBreak?" )
                # NOTE(review): 'halt' is not defined in the visible code -- presumably
                #   evaluating it is meant to stop a debug run with a NameError -- confirm
                if Globals.debugFlag: halt
            elif element.tag == 'milestone':
                print( "What is loadParagraph milestone?" )
                if Globals.debugFlag: halt
            else:
                logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.BBB, C, V, location ) )
        return V
    # end of USFXXMLBible.loadParagraph


    def loadCharacterFormatting( self, element, location ):
        """
        Append a character formatting element to the last line of the current book.

        Writes the opening marker (taken from the element tag) and the
            element text, then any footnotes nested inside the element,
            then the closing marker followed by the element's tail text.

        Parameters:
            element: the XML character formatting element (e.g., add, it, bd).
            location: description of where we are (used in messages).
        """
        marker, text, tail = element.tag, clean(element.text), clean(element.tail)
        Globals.checkXMLNoAttributes( element, location, 'sd12' )
        # An element with no leading text must not put the string 'None' into the line
        self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text if text else '' ) )
        for subelement in element:
            sublocation = subelement.tag + " of " + location
            if subelement.tag == 'f':
                self.loadFootnote( subelement, sublocation )
            else:
                # The chapter and verse numbers are not passed to this method, so '?'
                #   placeholders are used in the message
                logging.warning( _("sf31 Unprocessed {} element after {} {}:{} in {}").format( repr(subelement.tag), self.thisBook.BBB, '?', '?', location ) )
                # NOTE(review): 'halt' is not defined in the visible code -- presumably
                #   evaluating it is meant to stop a debug run with a NameError -- confirm
                if Globals.debugFlag: halt
        self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, (' '+tail) if tail else '' ) )
    # end of USFXXMLBible.loadCharacterFormatting


    def loadFigure( self, element, location ):
        """
        Append a figure element to the last line of the current book.

        Each subelement supplies one of the seven figure fields (any field
            not supplied stays empty); the fields are joined with '|' in a
            fixed order between the opening and closing fig markers, and
            the element's tail text (if any) follows after a space.

        Parameters:
            element: the XML fig element.
            location: description of where we are (used in messages).
        """
        Globals.checkXMLNoText( element, location, 'ff36' )
        Globals.checkXMLNoAttributes( element, location, 'cf35' )

        # The fields in the order that they are written out
        fieldOrder = ('description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference',)
        fieldValues = dict.fromkeys( fieldOrder, '' )
        for child in element:
            childLocation = child.tag + " of " + location
            fieldName = child.tag
            fieldText = clean( child.text )
            assert( fieldName in fieldValues )
            if fieldText is None: fieldValues[fieldName] = ''
            else: fieldValues[fieldName] = fieldText
            Globals.checkXMLNoTail( child, childLocation, 'jkf5' )
            Globals.checkXMLNoAttributes( child, childLocation, 'ld18' )
            Globals.checkXMLNoSubelements( child, childLocation, 'hb46' )

        joinedFields = '|'.join( fieldValues[fieldName] for fieldName in fieldOrder )
        trailingText = clean( element.tail )
        suffix = (' '+trailingText) if trailingText else ''
        self.thisBook.appendToLastLine( ' \\fig {}\\fig*{}'.format( joinedFields, suffix ) )
    # end of USFXXMLBible.loadFigure


    def loadTable( self, element, location ):
        """
        Load a table element into the current book.

        Each tr subelement starts a new (empty) tr line, and each cell
            inside it (th, thr, tc or tcr, with an optional level attribute
            added onto the marker) is appended to that line.

        Parameters:
            element: the XML table element.
            location: description of where we are (used in messages).
        """
        Globals.checkXMLNoText( element, location, 'kg92' )
        Globals.checkXMLNoTail( element, location, 'ka92' )
        Globals.checkXMLNoAttributes( element, location, 'ks63' )
        for subelement in element:
            sublocation = subelement.tag + " of " + location
            if subelement.tag == 'tr':
                self.thisBook.appendLine( 'tr', '' )
                Globals.checkXMLNoText( subelement, sublocation, 'sg32' )
                Globals.checkXMLNoTail( subelement, sublocation, 'dh82' )
                Globals.checkXMLNoAttributes( subelement, sublocation, 'mniq' )
                for sub2element in subelement:
                    sub2location = sub2element.tag + " of " + sublocation
                    tag, text = sub2element.tag, clean(sub2element.text)
                    assert( tag in ('th', 'thr', 'tc', 'tcr',) )
                    Globals.checkXMLNoTail( sub2element, sub2location, 'ah82' )
                    Globals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' )
                    level = None
                    for attrib,value in sub2element.items():
                        if attrib == 'level': level = value
                        else: # Report the cell's own location (not just the table's)
                            logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                    marker = tag + (level if level else '')
                    # An empty cell must not put the string 'None' into the line
                    self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text if text else '' ) )
            else:
                # The chapter and verse numbers are not passed to this method, so '?'
                #   placeholders are used in the message
                logging.warning( _("kv64 Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, self.thisBook.BBB, '?', '?', sublocation ) )
    # end of USFXXMLBible.loadTable


    def loadFootnote( self, element, location ):
        """
        Append a footnote element to the last line of the current book.

        The caller attribute and any leading text follow the opening f
            marker. Each subelement is then written with its own marker:
            a ref needs a tgt attribute and some text, while other markers
            (which must start with f) may only contain empty ref elements.
        The note is closed with the f end marker plus any tail text.

        Parameters:
            element: the XML f element.
            location: description of where we are (used in messages).
        """
        # NOTE(review): 'halt' is not defined in the visible code -- presumably each bare
        #   'halt' below is meant to stop the run with a NameError -- confirm
        noteText, noteTail = clean(element.text), clean(element.tail)
        caller = None
        for attrName,attrValue in element.items():
            if attrName != 'caller':
                logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( attrName, attrValue, location ) )
            else:
                caller = attrValue
        leadingText = (' '+noteText) if noteText else ''
        self.thisBook.appendToLastLine( ' \\f {}{}'.format( caller, leadingText ) )

        for child in element:
            childLocation = child.tag + " of " + location
            marker = child.tag
            childText, childTail = clean(child.text), clean(child.tail)
            if Globals.debugFlag: assert( marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',) )
            if marker != 'ref':
                Globals.checkXMLNoAttributes( child, childLocation, 'dq54' )
                self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, childText ) )
                if marker[0] != 'f': halt
                else: # Starts with f, e.g., fr, ft
                    for grandchild in child:
                        grandchildLocation = grandchild.tag + " of " + childLocation
                        innerMarker = grandchild.tag
                        innerText, innerTail = clean(grandchild.text), clean(grandchild.tail)
                        Globals.checkXMLNoSubelements( grandchild, grandchildLocation, 'js72' )
                        if innerMarker != 'ref': halt
                        else:
                            print( grandchildLocation )
                            assert( not innerText )
                            target = None
                            for attrName,attrValue in grandchild.items():
                                if attrName != 'tgt':
                                    logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( attrName, attrValue, grandchildLocation ) )
                                else: target = attrValue
                            if not target: halt
                            else:
                                self.thisBook.appendToLastLine( ' \\{} {}'.format( innerMarker, target ) )
            else: # A reference directly inside the footnote
                assert( childText )
                Globals.checkXMLNoSubelements( child, childLocation, 'ls13' )
                target = None
                for attrName,attrValue in child.items():
                    if attrName != 'tgt':
                        logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( attrName, attrValue, childLocation ) )
                    else: target = attrValue
                if not target: halt
                else:
                    self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, childText ) )
            if childTail: # Close this marker before the text that follows the subelement
                self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, childTail ) )

        trailingText = (' '+noteTail) if noteTail else ''
        self.thisBook.appendToLastLine( '\\f*{}'.format( trailingText ) )
    # end of USFXXMLBible.loadFootnote


    def loadCrossreference( self, element, location ):
        """
        Append a cross-reference element to the last line of the current book.

        Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x>

        The caller attribute follows the opening x marker. Each subelement
            is then written with its own marker: a ref needs a tgt attribute
            and some text, while other markers (which must start with x)
            may only contain empty ref elements.
        The cross-reference is closed with the x end marker plus any tail text.

        Parameters:
            element: the XML x element.
            location: description of where we are (used in messages).
        """
        # NOTE(review): 'halt' is not defined in the visible code -- presumably each bare
        #   'halt' below is meant to stop the run with a NameError -- confirm
        xrefText, xrefTail = clean(element.text), clean(element.tail) # The leading text is not written out
        caller = None
        for attrName,attrValue in element.items():
            if attrName != 'caller':
                logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrName, attrValue, location ) )
            else:
                caller = attrValue
        self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) )

        for child in element:
            childLocation = child.tag + " of " + location
            marker = child.tag
            childText, childTail = clean(child.text), clean(child.tail)
            if Globals.debugFlag: assert( marker in ('ref','xo','xt',) )
            if marker != 'ref':
                Globals.checkXMLNoAttributes( child, childLocation, 'sc35' )
                self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, childText ) )
                if marker[0] != 'x': halt
                else: # Starts with x, e.g., xo, xt
                    for grandchild in child:
                        grandchildLocation = grandchild.tag + " of " + childLocation
                        innerMarker = grandchild.tag
                        innerText, innerTail = clean(grandchild.text), clean(grandchild.tail)
                        Globals.checkXMLNoSubelements( grandchild, grandchildLocation, 'fs63' )
                        if innerMarker != 'ref': halt
                        else:
                            assert( not innerText )
                            target = None
                            for attrName,attrValue in grandchild.items():
                                if attrName != 'tgt':
                                    logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrName, attrValue, grandchildLocation ) )
                                else: target = attrValue
                            if not target: halt
                            else:
                                self.thisBook.appendToLastLine( ' \\{} {}'.format( innerMarker, target ) )
            else: # A reference directly inside the cross-reference
                assert( childText )
                Globals.checkXMLNoSubelements( child, childLocation, 's1sd' )
                target = None
                for attrName,attrValue in child.items():
                    if attrName != 'tgt':
                        logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrName, attrValue, childLocation ) )
                    else: target = attrValue
                if not target: halt
                else:
                    self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, childText ) )
            if childTail: # Close this marker before the text that follows the subelement
                self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, childTail ) )

        trailingText = (' '+xrefTail) if xrefTail else ''
        self.thisBook.appendToLastLine( '\\x*{}'.format( trailingText ) )
Exemplo n.º 9
0
class OpenSongXMLBible(Bible):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    The source file is parsed by load(), which walks the element tree
        using the tag names below (root -> book -> chapter -> verse).
    """
    # Tag names that the loading code compares element tags against
    treeTag = 'bible'  # Expected tag of the root element
    bookTag = 'b'  # Expected tag of the children of the root element
    chapterTag = 'c'  # Expected tag of the children of a book element
    verseTag = 'v'  # Expected tag of the children of a chapter element

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Set up an OpenSong XML Bible object for one source file.

        Nothing is parsed here: sourceFolder and givenName are joined to
            give the path of the XML file, and encoding is simply remembered.
        """
        if Globals.debugFlag:
            debugTemplate = "OpenSongXMLBible( {}, {}, {} )"
            print(debugTemplate.format(sourceFolder, givenName, encoding))

        # The base class must be initialised before our own fields are set
        Bible.__init__(self)
        self.objectNameString = "OpenSong XML Bible object"
        self.objectTypeString = "OpenSong"

        # Remember where the source data lives
        self.sourceFolder = sourceFolder
        self.givenName = givenName
        self.encoding = encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        # Filled in by load() with the parsed XML
        self.tree = None

        # Reference system used later to turn book names into book codes
        self.genericBOS = BibleOrganizationalSystem("GENERIC-KJV-66-ENG")

        # Early, non-fatal notice if the file can't be read
        fileIsReadable = os.access(self.sourceFilepath, os.R_OK)
        if not fileIsReadable:
            unreadableTemplate = "OpenSongXMLBible: File '{}' is unreadable"
            print(unreadableTemplate.format(self.sourceFilepath))

        self.name = self.givenName

    # end of OpenSongXMLBible.__init__

    def load(self):
        """
        Parse the source XML file and pass each book element on for extraction.

        A root element with an unexpected tag is logged as an error and
            nothing further is processed.
        """
        if Globals.verbosityLevel > 2:
            print(_("Loading {}...").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if Globals.debugFlag:
            assert len(self.tree)  # Fail here if we didn't load anything at all

        # The root element must be the main (bible) container
        rootTag = self.tree.tag
        if rootTag != OpenSongXMLBible.treeTag:
            logging.error("Expected to load '{}' but got '{}'".format(
                OpenSongXMLBible.treeTag, rootTag))
            return

        location = "XML file"
        Globals.checkXMLNoText(self.tree, location, '4f6h')
        Globals.checkXMLNoTail(self.tree, location, '1wk8')

        # "n" (name) and "sn" (short name) are recognised; anything else is reported
        for attrib, value in self.tree.items():
            if attrib in ("n", "sn"):
                continue
            logging.warning(
                "Unprocessed '{}' attribute ({}) in main element".format(
                    attrib, value))

        # Walk the submain (book) containers
        bookLocation = "book in " + location
        for child in self.tree:
            childTag = child.tag
            if childTag == OpenSongXMLBible.bookTag:
                Globals.checkXMLNoText(child, bookLocation, 'g3g5')
                Globals.checkXMLNoTail(child, bookLocation, 'd3f6')
                self.__validateAndExtractBook(child)
            elif childTag in ('OT', 'NT'):
                pass  # Testament elements are accepted but not processed
            else:
                logging.error("Expected to find '{}' but got '{}'".format(
                    OpenSongXMLBible.bookTag, childTag))

    # end of OpenSongXMLBible.load

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book's 'n' attribute (its name) is converted to a book code with
            self.genericBOS.getBBB().  Each chapter subelement is then passed
            to __validateAndExtractChapter() and the finished book is handed
            to self.saveBook().

        A missing book name, or one that getBBB() doesn't recognize, is
            logged as an error and the book is skipped.  Returns None.
        """

        if Globals.verbosityLevel > 3:
            print(_("Validating OpenSong XML book..."))

        # Process the book attributes first
        bookName = None
        for attrib, value in book.items():
            if attrib == "n":
                bookName = value
            else:
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in book element".format(
                        attrib, value))

        # Each error below must only fire in its own failure case,
        #   so bail out early rather than falling through
        if not bookName:
            logging.error(_("OpenSong load can't find a book name"))
            return

        BBB = self.genericBOS.getBBB(bookName)
        if not BBB:
            logging.error(
                _("OpenSong load doesn't recognize book name: '{}'").format(
                    bookName))
            return

        if Globals.verbosityLevel > 2:
            print(_("Validating {} {}...").format(BBB, bookName))
        thisBook = BibleBook(self.name, BBB)
        thisBook.objectNameString = "OpenSong XML Bible Book object"
        thisBook.objectTypeString = "OpenSong"
        for element in book:
            if element.tag == OpenSongXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                Globals.checkXMLNoText(element, sublocation, 'j3jd')
                Globals.checkXMLNoTail(element, sublocation, 'al1d')
                self.__validateAndExtractChapter(BBB, thisBook, element)
            else:
                logging.error(
                    "Expected to find '{}' but got '{}'".format(
                        OpenSongXMLBible.chapterTag, element.tag))
        if Globals.verbosityLevel > 2:
            print("  Saving {} into results...".format(BBB))
        self.saveBook(thisBook)

    # end of OpenSongXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        BBB is the book code (used here only in locations and messages),
            thisBook is the book object that receives the 'c', 'v' and 'q1'
            lines, and chapter is the XML chapter element.

        Problems in the XML (missing chapter or verse number, empty verse,
            unexpected element) are logged and processing continues.
        """

        if Globals.verbosityLevel > 3: print(_("Validating XML chapter..."))

        # Process the chapter attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            elif attrib == "VERSES":
                pass  # Recognised (so no warning is given) but not used
            else:
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            chapterNumber = chapterNumber.replace(
                'of Solomon ', '')  # Fix a mistake in the Chinese_SU module
            thisBook.appendLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'n' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag == OpenSongXMLBible.verseTag:
                sublocation = "verse in {} {}".format(BBB, chapterNumber)
                Globals.checkXMLNoTail(element, sublocation, 'l5ks')
                Globals.checkXMLNoSubelements(element, sublocation, '5f7h')
                verseNumber = None
                for attrib, value in element.items():
                    if attrib == "n":
                        verseNumber = value
                    elif attrib == "t":
                        pass  # Recognised (so no warning is given) but not used
                    else:
                        logging.warning(
                            "Unprocessed '{}' attribute ({}) in verse element".
                            format(attrib, value))
                if Globals.debugFlag: assert verseNumber

                vText = element.text  # The main text of the verse
                if not vText:
                    logging.warning("{} {}:{} has no text".format(
                        BBB, chapterNumber, verseNumber))
                elif verseNumber is None:
                    # Without this, the string concatenation below would
                    #   raise a TypeError on None
                    logging.error(
                        "Missing 'n' attribute in verse element for {} {}".
                        format(BBB, chapterNumber))
                elif '\n' in vText:  # This is how they represent poetry
                    firstBit, *otherBits = vText.split('\n')
                    thisBook.appendLine('q1', '')
                    thisBook.appendLine('v', verseNumber + ' ' + firstBit)
                    for textBit in otherBits:
                        thisBook.appendLine('q1', textBit)
                else:  # Just one verse line
                    thisBook.appendLine('v', verseNumber + ' ' + vText)
            else:
                logging.error("Expected to find '{}' but got '{}'".format(
                    OpenSongXMLBible.verseTag, element.tag))