def loadTable( self, element, location, C=None, V=None ):
    """
    Load a <table> element, appending its rows to the current book.

    Every <tr> child starts a new 'tr' line in self.thisBook.  Every cell
    inside a row (th, thr, tc or tcr) is appended to that line as a
    backslash marker, with the cell's optional 'level' attribute appended
    to the marker name.

    Parameters:
        element: the <table> XML element.
        location: description of where the element was found (used in
            diagnostic messages only).
        C, V: optional chapter and verse identifiers, used only to make the
            "unprocessed element" warning more specific.  They default to
            None (reported as '?') so two-argument calls keep working.
    """
    BibleOrgSysGlobals.checkXMLNoText( element, location, 'kg92' )
    BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ka92' )
    BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'ks63' )

    for subelement in element:
        sublocation = subelement.tag + " of " + location

        if subelement.tag != 'tr':
            # The chapter/verse used to be read from names that do not exist
            #   in this function (NameError); they are now optional parameters.
            logging.warning( _("kv64 Unprocessed {} element after {} {}:{} in {}").format(
                                subelement.tag, self.thisBook.BBB,
                                '?' if C is None else C, '?' if V is None else V, sublocation ) )
            continue

        # Start a new (empty) table row line -- the cells get appended to it
        self.thisBook.addLine( 'tr', '' )
        BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'sg32' )
        BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'dh82' )
        BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'mniq' )

        for sub2element in subelement:
            sub2location = sub2element.tag + " of " + sublocation
            tag, text = sub2element.tag, clean(sub2element.text)
            assert( tag in ('th', 'thr', 'tc', 'tcr',) )
            BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'ah82' )
            BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' )

            level = None
            for attrib,value in sub2element.items():
                if attrib == 'level': level = value
                else: logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )

            marker = tag + (level if level else '')
            # NOTE(review): other loaders in this file guard against clean()
            #   giving None, so an empty cell is written as '' here rather
            #   than as the literal text "None" -- confirm against clean().
            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, '' if text is None else text ) )
def __validateAndExtractChapter(self, BBB, thisBook, chapter):
    """
    Check/validate and extract chapter data from the given XML book record
        finding and saving chapter numbers and
        finding and saving verse elements.

    Parameters:
        BBB: book code, used in diagnostic messages and passed on to the
            verse handler.
        thisBook: the book object that receives the extracted lines
            (through its addLine method).
        chapter: the chapter XML element to process.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print(_("Validating XML chapter..."))

    # Process the chapter attributes first
    chapterNumber = None
    for attrib, value in chapter.items():
        if attrib == "cnumber":
            chapterNumber = value
        else:
            logging.warning(
                "Unprocessed {!r} attribute ({}) in chapter element".format(
                    attrib, value))
    if chapterNumber:
        thisBook.addLine('c', chapterNumber)
    else:
        # The message used to have no {} placeholder (so the book code was
        #   silently dropped) and named the wrong attribute.
        logging.error(
            "Missing 'cnumber' attribute in chapter element for {}".format(
                BBB))

    for element in chapter:
        if element.tag == ZefaniaXMLBible.verseTag:
            self.__validateAndExtractVerse(BBB, chapterNumber, thisBook,
                                           element)
        elif element.tag == ZefaniaXMLBible.captionTag:  # Used in Psalms
            location = "caption in {} {}".format(BBB, chapterNumber)
            BibleOrgSysGlobals.checkXMLNoTail(element, location, 'k5k8')
            BibleOrgSysGlobals.checkXMLNoSubelements(
                element, location, 'd3f5')

            # Handle caption attributes
            vRef = None
            for attrib, value in element.items():
                if attrib == "vref":
                    vRef = value
                    if BibleOrgSysGlobals.debugFlag:
                        assert (vRef == '1')
                else:
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in caption element"
                        .format(attrib, value))
            if BibleOrgSysGlobals.debugFlag:
                assert (vRef)

            vText = element.text
            if vText:  # This is the main text of the caption
                thisBook.addLine('v', '0' + ' ' + vText)  # We save it as verse zero
            else:
                logging.warning("{} {}:{} has no text".format(
                    BBB, chapterNumber, vRef))
        else:
            logging.error("Expected to find {!r} but got {!r}".format(
                ZefaniaXMLBible.verseTag, element.tag))
def loadCrossreference( self, element, location ):
    """
    Load a cross-reference (<x>) element, appending it to the last line
        of self.thisBook as a backslash-x ... backslash-x* sequence.

    Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x>

    Parameters:
        element: the <x> XML element.
        location: description of where the element was found (used in
            diagnostic messages only).

    NOTE(review): `halt` is not defined anywhere in this function.  If it is
        not defined elsewhere in the module either, evaluating it raises
        NameError -- presumably a deliberate hard stop on unexpected
        input; confirm before relying on it.
    """
    text, tail = clean(element.text), clean(element.tail) # NOTE: text is not used below
    caller = None
    for attrib,value in element.items():
        if attrib == 'caller':
            caller = value
        else:
            logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
    # Open the cross-reference (caller is written as-is, even if it is None)
    self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) )
    for subelement in element:
        sublocation = subelement.tag + " of " + location
        marker, xText, xTail = subelement.tag, clean(subelement.text), clean(subelement.tail)
        #print( "USFX.loadCrossreference", repr(caller), repr(text), repr(tail), repr(marker), repr(xText), repr(xTail) )
        #if BibleOrgSysGlobals.verbosityLevel > 0 and marker not in ('ref','xo','xt',):
            #print( "USFX.loadCrossreference found", repr(caller), repr(marker), repr(xText), repr(xTail) )
        if BibleOrgSysGlobals.debugFlag: assert( marker in ('ref','xo','xt',) )
        if marker=='ref':
            # A <ref> directly inside the <x>: written as marker + target,
            #   the closing marker, and then the displayed text
            assert( xText )
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 's1sd' )
            target = None
            for attrib,value in subelement.items():
                if attrib == 'tgt': target = value
                else:
                    logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
            if target:
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, xText ) )
            else: halt
        else:
            BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'sc35' )
            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, xText ) )
            if marker[0] == 'x': # Starts with x, e.g., xo, xt
                # These may contain nested <ref> elements
                for sub2element in subelement:
                    sub2location = sub2element.tag + " of " + sublocation
                    marker2, xText2, xTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail)
                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'fs63' )
                    if marker2=='ref':
                        if xText2:
                            #print( 'xt2', marker2, repr(xText2), repr(xTail2), sub2location )
                            self.thisBook.appendToLastLine( xText2 )
                        target = None
                        for attrib,value in sub2element.items():
                            if attrib == 'tgt': target = value
                            else:
                                logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                        if target:
                            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) )
                        else: halt
                    else: halt
                    # Any text following the nested element is kept
                    if xTail2: self.thisBook.appendToLastLine( xTail2 )
            else: halt
        if xTail:
            # Text after the subelement: written after a closing marker
            self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, xTail ) )
    # Close the cross-reference, keeping any text that followed the <x> element
    self.thisBook.appendToLastLine( '\\x*{}'.format( (' '+tail) if tail else '' ) )
def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
    """
    Check/validate and extract chapter data from the given XML book record
        finding and saving chapter numbers and
        finding and saving verse elements.

    Parameters:
        BBB: book code, used in diagnostic messages only.
        thisBook: the book object that receives the extracted lines
            (through its addLine method).
        chapter: the chapter XML element to process.

    Verse text containing newlines is treated as poetry: each piece goes
        on its own 'q1' line, with the verse number attached to the first.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter...") )

    # Process the chapter attributes first
    chapterNumber = None
    for attrib,value in chapter.items():
        if attrib=="n":
            chapterNumber = value
        elif attrib=="VERSES":
            pass # Recognised (so no warning is given) but the value isn't used
        else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
    if chapterNumber:
        chapterNumber = chapterNumber.replace( 'of Solomon ', '' ) # Fix a mistake in the Chinese_SU module
        thisBook.addLine( 'c', chapterNumber )
    else:
        # The message used to have no {} placeholder, so the book code was silently dropped
        logging.error( "Missing 'n' attribute in chapter element for {}".format( BBB ) )

    for element in chapter:
        if element.tag == OpenSongXMLBible.verseTag:
            sublocation = "verse in {} {}".format( BBB, chapterNumber )
            BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'l5ks' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5f7h' )

            verseNumber = None
            for attrib,value in element.items():
                if attrib=="n":
                    verseNumber = value
                elif attrib=="t":
                    pass # Recognised (so no warning is given) but the value isn't used
                else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
            if BibleOrgSysGlobals.debugFlag: assert( verseNumber )

            vText = element.text
            if not vText:
                logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, verseNumber ) )
            elif '\n' in vText: # This is how they represent poetry
                for j, textBit in enumerate( vText.split( '\n' ) ):
                    if j==0:
                        thisBook.addLine( 'q1', '' )
                        thisBook.addLine( 'v', verseNumber + ' ' + textBit )
                    else:
                        thisBook.addLine( 'q1', textBit )
            else: # Just one verse line
                thisBook.addLine( 'v', verseNumber + ' ' + vText )
        else: logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.verseTag, element.tag ) )
def __validateAndExtractParagraph(self, BBB, chapterNumber, thisBook,
                                  paragraph):
    """
    Validate one paragraph element and copy its content into thisBook.

    A 'p' line is added for the paragraph itself, then every child is
    handled in document order: verse elements go to the verse handler,
    and anything else is reported as an error.  (The caption branch is
    keyed on the caption tag with 'disabled' appended.)
    """
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print(_("Validating XML paragraph…"))

    paragraphLocation = "paragraph in {} {}".format(BBB, chapterNumber)
    BibleOrgSysGlobals.checkXMLNoAttributes(paragraph, paragraphLocation,
                                            'brgw3')
    BibleOrgSysGlobals.checkXMLNoText(paragraph, paragraphLocation, 'brgw3')
    BibleOrgSysGlobals.checkXMLNoTail(paragraph, paragraphLocation, 'brgw3')
    thisBook.addLine('p', '')

    for child in paragraph:
        # Verses are by far the usual case
        if child.tag == HaggaiXMLBible.verseTag:
            self.__validateAndExtractVerse(BBB, chapterNumber, thisBook,
                                           child)
            continue

        # Anything which isn't a (disabled) caption is unexpected
        if child.tag != HaggaiXMLBible.captionTag + 'disabled':
            logging.error("Expected to find {!r} but got {!r}".format(
                HaggaiXMLBible.verseTag, child.tag))
            continue

        # Caption (used in Psalms)
        captionLocation = "caption in {} {}".format(BBB, chapterNumber)
        BibleOrgSysGlobals.checkXMLNoTail(child, captionLocation, 'k5k8')
        BibleOrgSysGlobals.checkXMLNoSubelements(child, captionLocation,
                                                 'd3f5')

        verseRef = None
        for attribName, attribValue in child.items():
            if attribName != "vref":
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in caption element"
                    .format(attribName, attribValue))
                continue
            verseRef = attribValue
            if BibleOrgSysGlobals.debugFlag:
                assert verseRef == '1'
        if BibleOrgSysGlobals.debugFlag:
            assert verseRef

        captionText = child.text
        if captionText:
            # The caption is saved as verse zero
            thisBook.addLine('v', '0 ' + captionText)
        else:
            logging.warning("{} {}:{} has no text".format(
                BBB, chapterNumber, verseRef))
def __validateAndExtractBook( self, book ):
    """
    Validate one book element, extract its content into a new BibleBook
        and stash that book.

    The book code (BBB) is found from the 'bnumber' attribute if there is
        one, otherwise from the 'bname' attribute.  If no book code can be
        found, nothing is extracted or stashed.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print( _("Validating XML book…") )

    # Collect the attributes we know about (and warn about any others)
    known = { 'bnumber':None, 'bname':None, 'bsname':None }
    for attribName,attribValue in book.items():
        if attribName in known:
            known[attribName] = attribValue
        else:
            logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attribName, attribValue ) )
    bookNumber, bookName, bookShortName = known['bnumber'], known['bname'], known['bsname']

    # Work out which book this is
    BBB = None
    if bookNumber:
        try:
            BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
        except KeyError:
            logging.warning( "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it" \
                                                                    .format( bookNumber, bookName, bookShortName ) )
    elif bookName:
        BBB = self.genericBOS.getBBBFromText( bookName )

    if not BBB:
        return # Nothing more that we can do with this one

    if BibleOrgSysGlobals.verbosityLevel > 2:
        print( _("Validating {} {}…").format( BBB, bookName ) )
    newBook = BibleBook( self, BBB )
    newBook.objectNameString = 'Haggai XML Bible Book object'
    newBook.objectTypeString = 'Haggai'

    for child in book:
        childTag = child.tag
        if childTag == HaggaiXMLBible.captionTag:
            # The book caption is saved as the main title
            captionLocation = "caption in {}".format( BBB )
            BibleOrgSysGlobals.checkXMLNoAttributes( child, captionLocation, 'jhl6' )
            BibleOrgSysGlobals.checkXMLNoSubelements( child, captionLocation, 'jk21' )
            BibleOrgSysGlobals.checkXMLNoTail( child, captionLocation, 'kjh6' )
            newBook.addLine( 'mt', child.text )
        elif childTag == HaggaiXMLBible.chapterTag:
            chapterLocation = "chapter in {}".format( BBB )
            BibleOrgSysGlobals.checkXMLNoText( child, chapterLocation, 'j3jd' )
            BibleOrgSysGlobals.checkXMLNoTail( child, chapterLocation, 'al1d' )
            self.__validateAndExtractChapter( BBB, newBook, child )
        else:
            logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.chapterTag, child.tag ) )

    if BibleOrgSysGlobals.verbosityLevel > 2:
        print( " Saving {} into results…".format( BBB ) )
    self.stashBook( newBook )
def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
    """
    Check/validate and extract chapter data from the given XML book record
        finding and saving chapter numbers and
        finding and saving paragraph elements.

    Parameters:
        BBB: book code, used in diagnostic messages and passed on to the
            paragraph/verse handlers.
        thisBook: the book object that receives the extracted lines
            (through its addLine method).
        chapter: the chapter XML element to process.

    Only paragraph elements are actually accepted as children: the verse
        and caption branches are keyed on the tag with 'disabled' appended.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter…") )

    # Process the chapter attributes first
    chapterNumber = None
    for attrib,value in chapter.items():
        if attrib=="cnumber":
            chapterNumber = value
        else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
    if chapterNumber:
        thisBook.addLine( 'c', chapterNumber )
    else:
        # The message used to name an 'n' attribute, but it's "cnumber" that is looked for above
        logging.error( "Missing 'cnumber' attribute in chapter element for {}".format( BBB ) )

    for element in chapter:
        if element.tag == HaggaiXMLBible.paragraphTag:
            self.__validateAndExtractParagraph( BBB, chapterNumber, thisBook, element )
        elif element.tag == HaggaiXMLBible.verseTag+'disabled':
            self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
        elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms
            location = "caption in {} {}".format( BBB, chapterNumber )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'k5k8' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'd3f5' )

            # Handle caption attributes
            vRef = None
            for attrib,value in element.items():
                if attrib=="vref":
                    vRef = value
                    if BibleOrgSysGlobals.debugFlag: assert vRef == '1'
                else: logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) )
            if BibleOrgSysGlobals.debugFlag: assert vRef

            vText = element.text
            if vText: # This is the main text of the caption
                thisBook.addLine( 'v', '0' + ' ' + vText ) # We save it as verse zero
            else:
                logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) )
        else:
            # Paragraphs are what is expected here (the message used to name the verse tag)
            logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.paragraphTag, element.tag ) )
def _validate( self ):
    """
    Check/validate the loaded data.

    Every record in self._XMLtree is expected to be a main element
        (self._mainElementTag) with no text, tail or subelements.  For
        each such element this reports (through logging only):
            compulsory attributes which are missing or blank,
            optional attributes which are present but blank,
            attributes which are neither compulsory nor optional, and
            repeated values in attributes which must be unique
                (each attribute is checked independently of the others).
    Nothing is returned and nothing is changed.
    """
    assert self._XMLtree

    # One set of already-seen values for each must-be-unique attribute
    #   (a set gives a constant-time check for repeats, however many records there are)
    seenValues = { attributeName:set() for attributeName in self._uniqueAttributes }

    for j,element in enumerate(self._XMLtree):
        if element.tag != self._mainElementTag:
            logging.warning( "Unexpected element: {} in record {}".format( element.tag, j ) )
            continue

        BibleOrgSysGlobals.checkXMLNoText( element, element.tag )
        BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
        BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

        # Check compulsory attributes on this main element
        for attributeName in self._compulsoryAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is None:
                logging.error( "Compulsory {!r} attribute is missing from {} element in record {}".format( attributeName, element.tag, j ) )
            elif not attributeValue and attributeName!="type":
                # (elif so that a missing attribute isn't also reported as being blank)
                logging.warning( "Compulsory {!r} attribute is blank on {} element in record {}".format( attributeName, element.tag, j ) )

        # Check optional attributes on this main element
        for attributeName in self._optionalAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is not None and not attributeValue:
                logging.warning( "Optional {!r} attribute is blank on {} element in record {}".format( attributeName, element.tag, j ) )

        # Check for unexpected additional attributes on this main element
        for attributeName,attributeValue in element.items():
            if attributeName not in self._compulsoryAttributes and attributeName not in self._optionalAttributes:
                logging.warning( "Additional {!r} attribute ({!r}) found on {} element in record {}".format( attributeName, attributeValue, element.tag, j ) )

        # Check the attributes that must contain unique information
        #   (in that particular field -- doesn't check across different attributes)
        for attributeName in self._uniqueAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is not None and attributeName!="reference_name":
                if attributeValue in seenValues[attributeName]:
                    logging.error( "Found {!r} data repeated in {!r} field on {} element in record {}".format( attributeValue, attributeName, element.tag, j ) )
                seenValues[attributeName].add( attributeValue )
def loadFigure( self, element, location ):
    """
    Load a figure element, appending it to the last line of
        self.thisBook as a single fig ... fig* entry.

    The cleaned text of each child element is stored under the child's
        tag name, and the seven fields are then written out in a fixed
        order separated by '|' (fields which were not given are empty).
        Any text following the figure element is kept after the
        closing marker.
    """
    BibleOrgSysGlobals.checkXMLNoText( element, location, 'ff36' )
    BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'cf35' )

    # The fields in the order in which they have to be written out
    fieldOrder = ( 'description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference', )
    fieldValues = dict.fromkeys( fieldOrder, '' )

    for child in element:
        childLocation = child.tag + " of " + location
        fieldName = child.tag
        fieldText = clean(child.text)
        assert( fieldName in fieldValues )
        fieldValues[fieldName] = fieldText if fieldText is not None else ''
        BibleOrgSysGlobals.checkXMLNoTail( child, childLocation, 'jkf5' )
        BibleOrgSysGlobals.checkXMLNoAttributes( child, childLocation, 'ld18' )
        BibleOrgSysGlobals.checkXMLNoSubelements( child, childLocation, 'hb46' )

    figureFields = '|'.join( fieldValues[fieldName] for fieldName in fieldOrder )
    trailingText = clean( element.tail )
    suffix = (' '+trailingText) if trailingText else ''
    self.thisBook.appendToLastLine( ' \\fig {}\\fig*{}'.format( figureFields, suffix ) )
def __validateAndExtractParagraph( self, BBB, chapterNumber, thisBook, paragraph ):
    """
    Validate a paragraph element and save its content into thisBook.

    Adds a 'p' line, then walks through the children of the paragraph:
        verse elements are passed on to the verse handler,
        (disabled) caption elements are saved as verse zero, and
        any other element is logged as an error.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print( _("Validating XML paragraph...") )

    whereParagraph = "paragraph in {} {}".format( BBB, chapterNumber )
    BibleOrgSysGlobals.checkXMLNoAttributes( paragraph, whereParagraph, 'brgw3' )
    BibleOrgSysGlobals.checkXMLNoText( paragraph, whereParagraph, 'brgw3' )
    BibleOrgSysGlobals.checkXMLNoTail( paragraph, whereParagraph, 'brgw3' )
    thisBook.addLine( 'p', '' )

    for item in paragraph:
        if item.tag == HaggaiXMLBible.verseTag:
            self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, item )
        elif item.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms
            whereCaption = "caption in {} {}".format( BBB, chapterNumber )
            BibleOrgSysGlobals.checkXMLNoTail( item, whereCaption, 'k5k8' )
            BibleOrgSysGlobals.checkXMLNoSubelements( item, whereCaption, 'd3f5' )

            # Only the vref attribute is expected on a caption
            captionRef = None
            for name,content in item.items():
                if name == "vref":
                    captionRef = content
                    if BibleOrgSysGlobals.debugFlag:
                        assert( captionRef == '1' )
                else:
                    logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( name, content ) )
            if BibleOrgSysGlobals.debugFlag:
                assert( captionRef )

            captionText = item.text
            if captionText:
                # The text of the caption gets saved as verse zero
                thisBook.addLine( 'v', '0 ' + captionText )
            else:
                logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, captionRef ) )
        else:
            logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, item.tag ) )
def __validateAndExtractHeader( self ):
    """
    Extracts information out of the header record, such as:
        <INFORMATION>
        <title>King James Version</title>
        <creator></creator>
        <subject>The Holy Bible</subject>
        <description>In 1604, King James I of England authorized that a new translation of the Bible into English be started. It was finished in 1611, just 85 years after the first translation of the New Testament into English appeared (Tyndale, 1526). The Authorized Version, or King James Version, quickly became the standard for English-speaking Protestants. Its flowing language and prose rhythm has had a profound influence on the literature of the past 300 years.</description>
        <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
        <contributors />
        <date>2009-01-23</date>
        <type>Bible</type>
        <format>Haggai XML Bible Markup Language</format>
        <identifier>kjv</identifier>
        <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
        <language>ENG</language>
        <coverage>provide the Bible to the nations of the world</coverage>
        <rights>We believe that this Bible is found in the Public Domain.</rights>
        </INFORMATION>

    Each recognised element is checked for having no tail, attributes or
        subelements, and then its text is saved on self:
            compulsory fields are always saved (and, in debug mode,
                asserted to be non-empty),
            optional fields are only saved if they have some text,
            'type' is saved as self.documentType,
            'format' is only checked (in debug mode), never saved, and
            'contributor' may occur several times: self.contributor is a
                string for one contributor or a flat list for several.
    Unrecognised elements are logged as errors.
    """
    if BibleOrgSysGlobals.debugFlag: assert self.header
    location = 'Header'
    BibleOrgSysGlobals.checkXMLNoAttributes( self.header, location, 'j4j6' )
    BibleOrgSysGlobals.checkXMLNoText( self.header, location, 'sk4l' )
    BibleOrgSysGlobals.checkXMLNoTail( self.header, location, 'a2d4' )

    # TODO: We probably need to rationalise some of the self.xxx stores
    # Maps each simple element tag to ( name of the attribute of self to save into, whether compulsory )
    simpleFields = {
        'title': ('title',True), 'creator': ('creator',False), 'subject': ('subject',False),
        'description': ('description',True), 'publisher': ('publisher',False),
        'contributors': ('contributors',False), 'date': ('date',True),
        'type': ('documentType',False), 'identifier': ('identifier',True),
        'source': ('source',True), 'language': ('language',True),
        'coverage': ('coverage',False), 'rights': ('rights',False),
        }
    specialTags = ( 'contributor', 'format', )

    for element in self.header:
        tag = element.tag
        if tag not in simpleFields and tag not in specialTags:
            logging.error( "Found unexpected {!r} tag in {}".format( tag, location ) )
            continue

        # All the header elements get the same three checks
        #   (only 'contributor' has its own set of check codes)
        sublocation = "{} in {}".format( tag, location )
        tailCode, attributesCode, subelementsCode = ('alj1d','j3jjd','5gk78') if tag=='contributor' else ('al1d','j3jd','5g78')
        BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, tailCode )
        BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, attributesCode )
        BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, subelementsCode )

        if tag == 'contributor':
            if element.text:
                try: previous = self.contributor
                except AttributeError: # Must be the first (and possibly only) one
                    self.contributor = element.text
                else: # Put multiples into a list
                    # (this used to wrap the existing list inside a new list
                    #   for a third or later contributor, giving [[a,b],c])
                    if isinstance( previous, list ): previous.append( element.text )
                    else: self.contributor = [ previous, element.text ]
        elif tag == 'format':
            if BibleOrgSysGlobals.debugFlag: assert element.text
            if BibleOrgSysGlobals.debugFlag: assert element.text == 'Haggai XML Bible Markup Language'
        else:
            fieldName, compulsory = simpleFields[tag]
            if compulsory:
                if BibleOrgSysGlobals.debugFlag: assert element.text
                setattr( self, fieldName, element.text )
            elif element.text:
                setattr( self, fieldName, element.text )
def loadParagraph( self, paragraphElement, paragraphLocation, BBB, C ):
    """
    Load the paragraph (p or q) container from the XML data file.

    A new line is added to self.thisBook for the paragraph itself, and
        then each child element is handled in document order, either by
        adding further lines or by appending to the last line.

    Parameters:
        paragraphElement: the paragraph XML element.
        paragraphLocation: description of where the element was found
            (used in diagnostic messages only).
        BBB: book code, passed on to the footnote and character
            formatting loaders.
        C: chapter identifier, passed on to those same loaders and used
            in the warning about unprocessed elements.

    Returns:
        The id of the last verse milestone found in this paragraph,
            or None if there wasn't one.

    NOTE(review): `halt` is not defined anywhere in this function.  If it is
        not defined elsewhere in the module either, evaluating it raises
        NameError -- presumably a deliberate hard stop; confirm.
    """
    #if BibleOrgSysGlobals.verbosityLevel > 3:
        #print( _("USFXXMLBible.loadParagraph: Loading {} from {}...").format( self.name, self.sourceFolder ) )

    V = None # No verse milestone seen yet in this paragraph
    pTag, pText = paragraphElement.tag, clean(paragraphElement.text)
    BibleOrgSysGlobals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' )

    # Process the attributes first
    sfm = level = style = None
    for attrib,value in paragraphElement.items():
        if attrib == 'sfm':
            sfm = value
        elif attrib == 'level':
            level = value
        elif attrib == 'style':
            style = value
        else:
            logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) )

    # Work out the marker for the paragraph line:
    #   a sfm attribute replaces the element tag, and a level gets appended
    if sfm:
        assert( pTag == 'p' )
        pTag = sfm
    if level:
        #assert( pTag == 'q' ) # Could also be mt, etc.
        pTag += level
    if style:
        #print( repr(pTag), repr(pText), repr(style) )
        if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring {!r} style".format( style ) )

    self.thisBook.addLine( pTag, '' if pText is None else pText )

    for element in paragraphElement:
        location = element.tag + " of " + paragraphLocation
        #print( "element", repr(element.tag) )
        if element.tag == 'v': # verse milestone
            vTail = clean( element.tail ) # Main verse text
            BibleOrgSysGlobals.checkXMLNoText( element, location, 'crc2' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'lct3' )
            lastV, V = V, None # NOTE: lastV is not used below
            for attrib,value in element.items():
                if attrib == 'id':
                    V = value
                else:
                    logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            assert( V is not None )
            assert( V )
            self.thisBook.addLine( 'v', V + ((' '+vTail) if vTail else '' ) )
        elif element.tag == 've': # verse end milestone -- we can just ignore this
            BibleOrgSysGlobals.checkXMLNoText( element, location, 'lsc3' )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'mfy4' )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'bd24' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ks35' )
        elif element.tag == 'fig':
            self.loadFigure( element, location )
        elif element.tag == 'table':
            self.loadTable( element, location )
        elif element.tag == 'f':
            #print( "USFX.loadParagraph Found footnote at", paragraphLocation, C, V, repr(element.text) )
            self.loadFootnote( element, location, BBB, C, V )
        elif element.tag == 'x':
            #print( "USFX.loadParagraph Found xref at", paragraphLocation, C, V, repr(element.text) )
            self.loadCrossreference( element, location )
        elif element.tag in ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting
            self.loadCharacterFormatting( element, location, BBB, C, V )
        elif element.tag == 'cs': # character style -- seems like a USFX hack
            text, tail = clean(element.text), clean(element.tail)
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kf92' )
            sfm = None # (reuses the name of the paragraph attribute which is finished with by now)
            for attrib,value in element.items():
                if attrib == 'sfm': sfm = value
                else:
                    logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if sfm not in ('w','ior',): print( "cs sfm got", repr(sfm) )
            # Written as an opening marker, the text, a closing marker and then any following text
            self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( sfm, text, sfm, (' '+tail) if tail else '' ) )
        elif element.tag in ('cp',): # Simple single-line paragraph-level markers
            marker, text = element.tag, clean(element.text)
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'kdf0' )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'lkj1' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'da13' )
            self.thisBook.addLine( marker, text )
        elif element.tag == 'ref': # encoded reference -- seems like a USFX hack
            text, tail = clean(element.text), clean(element.tail)
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'bd83' )
            target = None
            for attrib,value in element.items():
                if attrib == 'tgt': target = value
                else:
                    logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            #if target not in ('w','ior',): print( "ref sfm got", repr(sfm) )
            # Here the target goes between the markers and the displayed text follows the closing marker
            self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) )
            #print( "Saved", '\\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) )
        elif element.tag == 'optionalLineBreak':
            print( "What is loadParagraph optionalLineBreak?" )
            if BibleOrgSysGlobals.debugFlag: halt
        elif element.tag == 'milestone': # e.g., <milestone sfm="pb" attribute=""/> (pb = explicit page break)
            BibleOrgSysGlobals.checkXMLNoText( element, location, 'jzx2' )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ms23' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'dw24' )
            sfm = None
            for attrib,value in element.items():
                if attrib == 'sfm': sfm = value
                else:
                    logging.warning( _("mcd2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if sfm not in ('pb',): print( "milestone sfm got", repr(sfm) )
            self.thisBook.addLine( sfm, '' )
        else:
            logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.BBB, C, V, location ) )
    return V
def __validateAndExtractVerse(self, BBB, chapterNumber, thisBook, verse):
    """
    Check/validate one XML verse element and save its content into thisBook.

    The verse text is accumulated from the element's own text plus any
    style subelements (converted to USFM character markers), then written
    with thisBook.addLine().

    Parameters:
        BBB: book code -- only used in location/diagnostic messages here.
        chapterNumber: chapter number -- only used in diagnostic messages here.
        thisBook: object whose addLine( marker, text ) receives the output.
        verse: the XML verse element (ElementTree-style API).

    Lines written to thisBook:
        'v'  -- the verse number, a space, then the accumulated verse text
        'v~' -- the tail text that follows a note subelement
        'm'  -- the (stripped) tail text that follows a line-break subelement

    Problems in the XML are reported through logging rather than raised.
    """
    # Inline CSS styles that map directly onto a USFM character marker
    cssToSFM = {
        "font-style:italic": '\\it',
        "font-style:italic;font-weight:bold": '\\bdit',
        "color:#FF0000": '\\em',
        "font-size: x-small; color:#8B8378": '\\add',
    }

    def convertStyle(styleElement, styleLocation, errorCode, description):
        """
        Return the USFM text for one style element, including its tail text.
        """
        BibleOrgSysGlobals.checkXMLNoSubelements(styleElement, styleLocation, errorCode)
        css = idStyle = None
        for attrib, value in styleElement.items():
            if attrib == "css":
                css = value
            elif attrib == "id":
                idStyle = value
            else:
                logging.warning("Unprocessed {!r} attribute ({}) in {}".format(
                    attrib, value, description))
        if BibleOrgSysGlobals.debugFlag:
            assert css or idStyle

        SFM = cssToSFM.get(css)
        if SFM is None and css is None and idStyle == 'cl:divineName':
            SFM = '\\nd'

        # An empty style element has text None -- treat that as empty text
        sText = (styleElement.text or '').strip()
        sTail = styleElement.tail.strip() if styleElement.tail else ''
        if BibleOrgSysGlobals.debugFlag:
            assert sText

        if SFM is None:
            logging.error("Unknown style (css is {!r}, idStyle is {!r}) in {}".format(
                css, idStyle, styleLocation))
            # 'halt' is a deliberately undefined name: it stops a debugging run
            #   with a NameError (same convention as the other loaders here)
            if BibleOrgSysGlobals.debugFlag:
                halt
            # Use sc for unknown styles (and record which style it was)
            return '\\sc [{}]{}\\sc* '.format(css or idStyle, sText) + sTail
        return SFM + ' ' + sText + SFM + '*' + sTail
    # end of convertStyle

    if BibleOrgSysGlobals.verbosityLevel > 3:
        print(_("Validating XML verse..."))

    location = "verse in {} {}".format(BBB, chapterNumber)
    BibleOrgSysGlobals.checkXMLNoTail(verse, location, 'l5ks')

    # Handle verse attributes
    verseNumber = None
    for attrib, value in verse.items():
        if attrib == "vnumber":
            verseNumber = value
        else:
            logging.warning("Unprocessed {!r} attribute ({}) in verse element".format(
                attrib, value))
    if BibleOrgSysGlobals.debugFlag:
        assert verseNumber
    location = "{}:{}".format(location, verseNumber)  # Get a better location description

    # verse.text is None when the verse starts immediately with a style or note,
    #   so always start from a string that can safely be appended to
    vText = verse.text.strip() if verse.text else ''

    # Handle verse subelements (notes, styled portions, and line breaks)
    for subelement in verse:
        if subelement.tag == ZefaniaXMLBible.noteTag:
            sublocation = "note in " + location
            noteType = None
            for attrib, value in subelement.items():
                if attrib == "type":
                    noteType = value
                else:
                    logging.warning("Unprocessed {!r} attribute ({}) in note subelement".format(
                        attrib, value))
            if noteType not in ('n-studynote', 'x-studynote',):
                logging.warning("Unexpected {} note type in {}".format(noteType, BBB))
            if BibleOrgSysGlobals.debugFlag:
                assert noteType
            # NOTE: the note's own text (subelement.text) is not saved (for now)
            nTail = subelement.tail
            if nTail:
                if '\n' in nTail:
                    print("ZefaniaXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format(
                        BBB, chapterNumber, verseNumber, nTail))
                    nTail = nTail.replace('\n', ' ')
                thisBook.addLine('v~', nTail)
            for subsubelement in subelement:
                if subsubelement.tag == ZefaniaXMLBible.styleTag:
                    vText += convertStyle(subsubelement, "style in " + sublocation,
                                          'fyt4', 'style subsubelement')
                else:
                    logging.error("Expected to find {} but got {!r} in {}".format(
                        ZefaniaXMLBible.styleTag, subsubelement.tag, sublocation))

        elif subelement.tag == ZefaniaXMLBible.styleTag:
            vText += convertStyle(subelement, "style in " + location,
                                  'f5gh', 'style subelement')

        elif subelement.tag == ZefaniaXMLBible.breakTag:
            sublocation = "line break in " + location
            BibleOrgSysGlobals.checkXMLNoText(subelement, sublocation, 'c1d4')
            BibleOrgSysGlobals.checkXMLNoSubelements(subelement, sublocation, 'g4g8')
            art = None
            for attrib, value in subelement.items():
                if attrib == "art":
                    art = value
                else:
                    logging.warning("Unprocessed {!r} attribute ({}) in break subelement".format(
                        attrib, value))
            if BibleOrgSysGlobals.debugFlag:
                assert art == 'x-nl'
            # Flush what we have so far, then start a new 'm' line with the text after the break
            if vText:
                thisBook.addLine('v', verseNumber + ' ' + vText)
                vText = ''
            thisBook.addLine('m', subelement.tail.strip() if subelement.tail else '')

        else:
            logging.error("Expected to find NOTE or STYLE but got {!r} in {}".format(
                subelement.tag, location))

    if vText:  # This is the main text of the verse (follows the verse milestone)
        if '\n' in vText:
            print("ZefaniaXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format(
                BBB, chapterNumber, verseNumber, vText))
            vText = vText.replace('\n', ' ')
        thisBook.addLine('v', verseNumber + ' ' + vText)
def _validateSystem( self, punctuationTree, systemName ):
    """
    Check the records in punctuationTree against the expectations stored on self,
        logging (not raising) every problem found.

    Each top-level element whose tag is in self.mainElementTags is checked for:
        * compulsory / optional / unexpected attributes
            (self.compulsoryAttributes, self.optionalAttributes)
        * compulsory / optional / unexpected subelements
            (self.compulsoryElements, self.optionalElements)
        * repeated values in the fields that must be unique
            (self.uniqueAttributes, self.uniqueElements)
    Any other top-level element is reported as unexpected.

    Parameters:
        punctuationTree: non-empty iterable of XML elements (asserted).
        systemName: accepted for the caller's benefit; not used by the checks.

    Returns None.
    """
    assert punctuationTree

    # Values already seen in each field that must hold unique data
    #   (uniqueness is per field -- it's not checked across different fields)
    seenAttributeValues = { attributeName: set() for attributeName in self.uniqueAttributes }
    seenElementTexts = { elementName: set() for elementName in self.uniqueElements }

    for k,element in enumerate(punctuationTree):
        if element.tag not in self.mainElementTags:
            logging.warning( _("Unexpected element: {} in record {}").format( element.tag, k ) )
            continue

        # The messages below identify the record by an ID, but no ID was ever
        #   fetched (so they raised NameError) -- the element tag is used as the identifier
        ID = element.tag

        BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
        if not self.compulsoryAttributes and not self.optionalAttributes:
            BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
        if not self.compulsoryElements and not self.optionalElements:
            BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

        # Check compulsory attributes on this main element
        for attributeName in self.compulsoryAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is None:
                logging.error( _("Compulsory {!r} attribute is missing from {} element in record {}").format( attributeName, element.tag, k ) )
            elif not attributeValue: # present but blank (a missing one is only reported once, above)
                logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

        # Check optional attributes on this main element
        for attributeName in self.optionalAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is not None and not attributeValue:
                logging.warning( _("Optional {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

        # Check for unexpected additional attributes on this main element
        for attributeName,attributeValue in element.items():
            if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {}").format( attributeName, attributeValue, element.tag, k ) )

        # Check the attributes that must contain unique information
        for attributeName in self.uniqueAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is not None:
                if attributeValue in seenAttributeValues[attributeName]:
                    logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {}").format( attributeValue, attributeName, element.tag, k ) )
                seenAttributeValues[attributeName].add( attributeValue )

        # Check compulsory elements
        for elementName in self.compulsoryElements:
            foundElement = element.find( elementName )
            if foundElement is None:
                logging.error( _("Compulsory {!r} element is missing in record with ID {!r} (record {})").format( elementName, ID, k ) )
            elif not foundElement.text: # only look at the text if the element actually exists
                logging.warning( _("Compulsory {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

        # Check optional elements
        for elementName in self.optionalElements:
            foundElement = element.find( elementName )
            if foundElement is not None and not foundElement.text:
                logging.warning( _("Optional {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

        # Check for unexpected additional elements
        for subelement in element:
            if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                logging.warning( _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})").format( subelement.tag, subelement.text, ID, k ) )

        # Check the elements that must contain unique information
        for elementName in self.uniqueElements:
            foundElement = element.find( elementName )
            if foundElement is not None:
                text = foundElement.text
                if text in seenElementTexts[elementName]:
                    logging.error( _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})").format( text, elementName, ID, k ) )
                seenElementTexts[elementName].add( text )
def importDataToPython( self ):
    """
    Loads (and pivots) the data (not including the header) into suitable Python containers
        to use in a Python program.
    (Of course, you can just use the elementTree in self._XMLtree if you prefer.)

    Returns a 2-tuple (dataList, dataDict) which is also cached on self, so that
        a second call returns the cached containers without any reprocessing:

        dataList: one entry per source record
            ( sourceReference, sourceComponent, parsedSourceReference, actualLinksList )
          where each entry in actualLinksList is
            ( targetReference, targetComponent, parsedTargetReference, linkType )
          and parsedTargetReference is None if FlexibleVersesKey rejected the reference.

        dataDict: maps each verse key given by getIncludedVerses() to a list containing
            the forward entries (same 4-tuples as in dataList) for source verses, and
            ( targetReference, targetComponent, parsedTargetReference,
                [ (sourceReference, sourceComponent, parsedSourceReference, reverseLinkType) ] )
            for target verses.

    Raises TypeError if a reference is inconsistent with its declared component type,
        and ValueError for a link type that has no reverse link type.
    """
    reverseLinkTypes = {
        'TSK': 'TSKQuoted',
        'QuotedOTReference': 'OTReferenceQuoted',
        'AlludedOTReference': 'OTReferenceAlluded',
        'PossibleOTReference': 'OTReferencePossible',
        }

    def checkComponent( component, reference ):
        """
        Sanity check: a 'Verse' reference must be accepted by SimpleVerseKey
            but anything else (a section or verse range) is expected to be rejected by it.
        """
        if component == 'Verse':
            SimpleVerseKey( reference )
            return
        try: SimpleVerseKey( reference, ignoreParseErrors=True )
        except TypeError: return # This should happen coz it should fail the SVK
        logging.error( "{} {!r} failed!".format( component, reference ) )
        raise TypeError( "{} reference {!r} was unexpectedly accepted as a single verse".format( component, reference ) )
    # end of checkComponent

    assert self._XMLtree
    if self.__DataList: # We've already done an import/restructuring -- no need to repeat it
        return self.__DataList, self.__DataDict

    # Step 1: pull the raw strings out of the XML
    rawRefLinkList = []
    actualLinkCount = 0
    for element in self._XMLtree:
        # Get these first for helpful error messages
        sourceReference = element.find('sourceReference').text
        sourceComponent = element.find('sourceComponent').text
        assert sourceComponent in ('Section','Verses','Verse',)

        BibleOrgSysGlobals.checkXMLNoText( element, sourceReference, 'kls1' )
        BibleOrgSysGlobals.checkXMLNoAttributes( element, sourceReference, 'kd21' )
        BibleOrgSysGlobals.checkXMLNoTail( element, sourceReference, 'so20' )

        actualRawLinksList = []
        for subelement in element:
            if subelement.tag in ( 'sourceReference','sourceComponent',): # already processed these
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sourceReference, 'ls12' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sourceReference, 'ks02' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, sourceReference, 'sqw1' )
            elif subelement.tag == 'BibleReferenceLink':
                BibleOrgSysGlobals.checkXMLNoText( subelement, sourceReference, 'haw9' )
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sourceReference, 'hs19' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, sourceReference, 'jsd9' )
                targetReference = subelement.find('targetReference').text
                targetComponent = subelement.find('targetComponent').text
                assert targetComponent in ('Section','Verses','Verse',)
                linkType = subelement.find('linkType').text
                assert linkType in reverseLinkTypes
                actualRawLinksList.append( (targetReference,targetComponent,linkType,) )
                actualLinkCount += 1
            else: # Don't silently lose data that we don't understand
                logging.warning( "Unprocessed {!r} subelement in {}".format( subelement.tag, sourceReference ) )
        rawRefLinkList.append( (sourceReference,sourceComponent,actualRawLinksList,) )

    if BibleOrgSysGlobals.verbosityLevel > 1:
        print( " {} raw links loaded (with {} actual raw link entries)".format( len(rawRefLinkList), actualLinkCount ) )

    # Step 2: check and parse the references
    myRefLinkList = []
    actualLinkCount = 0
    # NOTE(review): BOS isn't used below -- presumably left over or kept for a side-effect; confirm before removing
    BOS = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' )

    for j,(sourceReference,sourceComponent,actualRawLinksList) in enumerate( rawRefLinkList ):
        checkComponent( sourceComponent, sourceReference ) # Just do some testing first
        parsedSourceReference = FlexibleVersesKey( sourceReference ) # Now do the actual parsing
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print( j, sourceComponent, sourceReference, parsedSourceReference )

        actualLinksList = []
        for targetReference,targetComponent,linkType in actualRawLinksList:
            checkComponent( targetComponent, targetReference ) # Just do some testing first
            try: parsedTargetReference = FlexibleVersesKey( targetReference )
            except TypeError:
                print( " Temporarily ignored {!r} (TypeError from FlexibleVersesKey)".format( targetReference ) )
                parsedTargetReference = None
            if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                print( ' ', targetComponent, targetReference, parsedTargetReference )
            actualLinksList.append( (targetReference,targetComponent,parsedTargetReference,linkType,) )
            actualLinkCount += 1

        myRefLinkList.append( (sourceReference,sourceComponent,parsedSourceReference,actualLinksList,) )

    if BibleOrgSysGlobals.verbosityLevel > 1:
        print( " {} links processed (with {} actual link entries)".format( len(rawRefLinkList), actualLinkCount ) )
    self.__DataList = myRefLinkList

    # Step 3: put it into my dictionaries for easy access
    # This part should be customized or added to for however you need to process the data

    # Create a link dictionary (by verse key)
    myRefLinkDict = {}
    for sourceReference,sourceComponent,parsedSourceReference,actualLinksList in myRefLinkList:
        for verseRef in parsedSourceReference.getIncludedVerses():
            assert isinstance( verseRef, SimpleVerseKey )
            myRefLinkDict.setdefault( verseRef, [] ).append( (sourceReference,sourceComponent,parsedSourceReference,actualLinksList,) )
    originalLinks = len( myRefLinkDict )
    print( " {} verse links added to dictionary (includes filling out spans)".format( originalLinks ) )

    # Add the reversed links (by verse key) to the same dictionary
    for sourceReference,sourceComponent,parsedSourceReference,actualLinksList in myRefLinkList:
        for targetReference,targetComponent,parsedTargetReference,linkType in actualLinksList:
            if parsedTargetReference is None: continue # This one couldn't be parsed above
            try: reverseLinkType = reverseLinkTypes[linkType]
            except KeyError: # Have a new linkType!
                raise ValueError( "No reverse link type known for {!r}".format( linkType ) ) from None
            for verseRef in parsedTargetReference.getIncludedVerses():
                assert isinstance( verseRef, SimpleVerseKey )
                myRefLinkDict.setdefault( verseRef, [] ).append(
                    (targetReference,targetComponent,parsedTargetReference,[(sourceReference,sourceComponent,parsedSourceReference,reverseLinkType)]) )
    totalLinks = len( myRefLinkDict )
    reverseLinks = totalLinks - originalLinks
    print( " {} reverse links added to dictionary to give {} total".format( reverseLinks, totalLinks ) )
    self.__DataDict = myRefLinkDict

    # Let's find the most number of references for a verse
    mostReferences = totalReferences = 0
    mostVerseRef = None # Stays None if the dictionary ended up empty
    for verseRef, entryList in self.__DataDict.items():
        numRefs = len( entryList )
        if numRefs > mostReferences: mostReferences, mostVerseRef = numRefs, verseRef
        totalReferences += numRefs
    print( " {} maximum links for any one reference ({})".format( mostReferences,
                    mostVerseRef.getShortText() if mostVerseRef is not None else None ) )
    print( " {} total links for all references".format( totalReferences ) )

    return self.__DataList, self.__DataDict
def __validateAndExtractChapter(self, BBB, thisBook, chapter):
    """
    Check/validate and extract chapter data from the given XML chapter element,
        saving the chapter number and then the text of each verse element.

    Parameters:
        BBB: book code -- only used in diagnostic messages here.
        thisBook: object whose addLine( marker, text ) receives the output.
        chapter: the XML chapter element (ElementTree-style API).

    Lines written to thisBook:
        'c'  -- the chapter number (from the 'n' attribute)
        'v'  -- the verse number, a space, then the verse text
        'q1' -- used when the verse text contains newlines: an empty 'q1' line
                precedes the 'v' line, and each following text line is a 'q1' line

    Problems in the XML are reported through logging rather than raised.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print(_("Validating XML chapter…"))

    # Process the chapter attributes first
    chapterNumber = None
    for attrib, value in chapter.items():
        if attrib == "n":
            chapterNumber = value
        elif attrib == "VERSES":
            pass  # Recognised (so no warning is given) but the verse count isn't used
        else:
            logging.warning("Unprocessed {!r} attribute ({}) in chapter element".format(
                attrib, value))
    if chapterNumber:
        # Fix a mistake in the Chinese_SU module
        chapterNumber = chapterNumber.replace('of Solomon ', '')
        thisBook.addLine('c', chapterNumber)
    else:
        logging.error("Missing 'n' attribute in chapter element for {}".format(BBB))

    for element in chapter:
        if element.tag != OpenSongXMLBible.verseTag:
            logging.error("Expected to find {!r} but got {!r}".format(
                OpenSongXMLBible.verseTag, element.tag))
            continue

        sublocation = "verse in {} {}".format(BBB, chapterNumber)
        BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'l5ks')

        verseNumber = None
        for attrib, value in element.items():
            if attrib == "n":
                verseNumber = value
            elif attrib == "t":
                # NOTE(review): presumably the end verse of a range -- it's recognised
                #   (so no warning is given) but not saved anywhere; confirm intent
                pass
            else:
                logging.warning("Unprocessed {!r} attribute ({}) in verse element".format(
                    attrib, value))
        if BibleOrgSysGlobals.debugFlag:
            assert verseNumber

        # Collect the verse text, including any italicised portions
        vText = element.text if element.text else ''
        for subelement in element:
            sub2location = "{} in {}".format(subelement.tag, sublocation)
            BibleOrgSysGlobals.checkXMLNoAttributes(subelement, sub2location, 'ks03')
            BibleOrgSysGlobals.checkXMLNoSubelements(subelement, sub2location, 'ks05')
            if subelement.tag == 'i':
                # text and tail are None when absent -- they mustn't be
                #   written into the verse as the literal text 'None'
                vText += '\\it {}\\it*{}'.format(subelement.text or '',
                                                 subelement.tail or '')
            else:
                logging.error("Expected to find 'i' but got {!r}".format(subelement.tag))
        vText += element.tail if element.tail else ''

        if not vText:
            logging.warning("{} {}:{} has no text".format(BBB, chapterNumber, verseNumber))
            continue

        # This is the main text of the verse (follows the verse milestone)
        if '\n' in vText:  # This is how they represent poetry
            for j, textBit in enumerate(vText.split('\n')):
                if j == 0:
                    thisBook.addLine('q1', '')
                    thisBook.addLine('v', verseNumber + ' ' + textBit)
                else:
                    thisBook.addLine('q1', textBit)
        else:  # Just one verse line
            thisBook.addLine('v', verseNumber + ' ' + vText)
def loadParagraph( paragraphXML, paragraphlocation ): """ Load a paragraph from the USX XML. Uses (and updates) c,v information from the containing function. """ nonlocal c, v # Process the attributes first paragraphStyle = None for attrib,value in paragraphXML.items(): if attrib=='style': paragraphStyle = value # This is basically the USFM marker name else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) # Now process the paragraph text (or write a paragraph marker anyway) self.addLine( paragraphStyle, paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' ) # Now process the paragraph subelements for element in paragraphXML: location = element.tag + ' ' + paragraphlocation #print( "USXXMLBibleBook.load", c, v, element.tag, location ) if element.tag == 'verse': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) # Process the attributes first verseStyle = None for attrib,value in element.items(): if attrib=='number': v = value elif attrib=='style': verseStyle = value else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if verseStyle != 'v': logging.warning( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) ) self.addLine( verseStyle, v + ' ' ) # Now process the tail (if there's one) which is the verse text if element.tail: vText = element.tail.strip() if vText: #print( repr(vText) ) self.appendToLastLine( vText ) elif element.tag == 'char': # Process the attributes first charStyle = None for attrib,value in element.items(): if attrib=='style': charStyle = value # This is basically the USFM character marker name #print( " charStyle", charStyle ) assert( not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle ) ) else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) charLine = "\\{} {} 
".format( charStyle, element.text ) # Now process the subelements -- chars are one of the few multiply embedded fields in USX for subelement in element: sublocation = subelement.tag + ' ' + location #print( c, v, element.tag ) if subelement.tag == 'char': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation ) # Process the attributes first subCharStyle, charClosed = None, True for attrib,value in subelement.items(): if attrib=='style': subCharStyle = value elif attrib=='closed': assert( value=='false' ) charClosed = False else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) charLine += "\\{} {}".format( subCharStyle, subelement.text ) if charClosed: charLine += "\\{}*".format( subCharStyle ) charLine += '' if subelement.tail is None else subelement.tail.strip() else: logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) ) self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) ) # A character field must be added to the previous field charLine += "\\{}*{}".format( charStyle, '' if element.tail is None else element.tail.strip() ) if debuggingThisModule: print( "USX.loadParagraph:", c, v, paragraphStyle, charStyle, repr(charLine) ) self.appendToLastLine( charLine ) elif element.tag == 'note': BibleOrgSysGlobals.checkXMLNoText( element, location ) # Process the attributes first noteStyle = noteCaller = None for attrib,value in element.items(): if attrib=='style': noteStyle = value # This is basically the USFM marker name assert( noteStyle in ('x','f',) ) elif attrib=='caller': noteCaller = value # Usually hyphen or a symbol to be used for the note else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) assert( noteStyle and noteCaller ) # both compulsory noteLine = "\\{} {} ".format( noteStyle, noteCaller ) # Now process the 
subelements -- notes are one of the few multiply embedded fields in USX for subelement in element: sublocation = subelement.tag + ' ' + location #print( c, v, element.tag ) if subelement.tag == 'char': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation ) # Process the attributes first charStyle, charClosed = None, True for attrib,value in subelement.items(): if attrib=='style': charStyle = value elif attrib=='closed': assert( value=='false' ) charClosed = False else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) noteLine += "\\{} {}".format( charStyle, subelement.text ) if charClosed: noteLine += "\\{}*".format( charStyle ) noteLine += '' if subelement.tail is None else subelement.tail.strip() elif subelement.tag == 'unmatched': # Used to denote errors in the source text BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation ) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation ) # Process the attributes first unmmatchedMarker = None for attrib,value in subelement.items(): if attrib=='marker': unmmatchedMarker = value else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) self.addPriorityError( 2, c, v, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) ) else: logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) ) self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) ) if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail #noteLine += "\\{}*".format( charStyle ) noteLine += "\\{}*".format( noteStyle ) if element.tail: noteText = element.tail.strip() noteLine += noteText self.appendToLastLine( noteLine ) elif element.tag == 'link': # Used to include extra resources 
BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoTail( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) # Process the attributes first linkStyle = linkDisplay = linkTarget = None for attrib,value in element.items(): if attrib=='style': linkStyle = value assert( linkStyle in ('jmp',) ) elif attrib=='display': linkDisplay = value # e.g., "click here" elif attrib=='target': linkTarget = value # e.g., some reference else: logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.addPriorityError( 3, c, v, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) ) elif element.tag == 'unmatched': # Used to denote errors in the source text BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoTail( element, location ) BibleOrgSysGlobals.checkXMLNoAttributes( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) self.addPriorityError( 2, c, v, _("Unmatched element in {}").format( location) ) else: logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, c, v, location ) ) self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) ) for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] ) if BibleOrgSysGlobals.debugFlag: halt
def load( self, filename, folder=None, encoding='utf-8' ):
    """
    Load a single source USX XML file and extract the information.

    Parameters:
        filename: name of the USX XML file to load.
        folder: optional folder which is joined in front of filename.
        encoding: accepted in the signature but not used by this function
            (the file is parsed by ElementTree).

    The file is only processed if its root element is 'usx' or 'usfm'.
    The book, chapter and para elements found under the root are converted
    to lines with self.addLine() / self.appendToLastLine(). Unexpected
    content is logged and/or recorded with self.addPriorityError(), and any
    load errors that were collected are saved in
    self.errorDictionary['Load Errors'].
    """

    def loadParagraph( paragraphXML, paragraphlocation ):
        """
        Load a paragraph from the USX XML.

        Adds a line for the paragraph itself and then handles its verse,
        char, note, link and unmatched subelements.

        Uses (and updates) c,v information from the containing function.
        """
        nonlocal c, v

        # Process the attributes first
        paragraphStyle = None
        for attrib,value in paragraphXML.items():
            if attrib=='style':
                paragraphStyle = value # This is basically the USFM marker name
            else:
                # Must report paragraphlocation here: the per-element 'location'
                #   is a local that isn't assigned until the subelement loop below
                logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, paragraphlocation ) )

        # Now process the paragraph text (or write a paragraph marker anyway)
        self.addLine( paragraphStyle, paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' )

        # Now process the paragraph subelements
        for element in paragraphXML:
            location = element.tag + ' ' + paragraphlocation
            if element.tag == 'verse': # milestone (not a container)
                BibleOrgSysGlobals.checkXMLNoText( element, location )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                # Process the attributes first
                verseStyle = None
                for attrib,value in element.items():
                    if attrib=='number':
                        v = value
                    elif attrib=='style':
                        verseStyle = value
                    else:
                        logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if verseStyle != 'v':
                    logging.warning( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                self.addLine( verseStyle, v + ' ' )
                # Now process the tail (if there's one) which is the verse text
                if element.tail:
                    vText = element.tail.strip()
                    if vText:
                        self.appendToLastLine( vText )
            elif element.tag == 'char':
                # Process the attributes first
                charStyle = None
                for attrib,value in element.items():
                    if attrib=='style':
                        charStyle = value # This is basically the USFM character marker name
                        assert( not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle ) )
                    else:
                        logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                charLine = "\\{} {} ".format( charStyle, element.text )
                # Now process the subelements -- chars are one of the few multiply embedded fields in USX
                for subelement in element:
                    sublocation = subelement.tag + ' ' + location
                    if subelement.tag == 'char': # milestone (not a container)
                        BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                        # Process the attributes first
                        subCharStyle, charClosed = None, True
                        for attrib,value in subelement.items():
                            if attrib=='style':
                                subCharStyle = value
                            elif attrib=='closed':
                                assert( value=='false' )
                                charClosed = False
                            else:
                                logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                        charLine += "\\{} {}".format( subCharStyle, subelement.text )
                        if charClosed: charLine += "\\{}*".format( subCharStyle )
                        charLine += '' if subelement.tail is None else subelement.tail.strip()
                    else:
                        logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) )
                        self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                # A character field must be added to the previous field
                charLine += "\\{}*{}".format( charStyle, '' if element.tail is None else element.tail.strip() )
                if debuggingThisModule:
                    print( "USX.loadParagraph:", c, v, paragraphStyle, charStyle, repr(charLine) )
                self.appendToLastLine( charLine )
            elif element.tag == 'note':
                BibleOrgSysGlobals.checkXMLNoText( element, location )
                # Process the attributes first
                noteStyle = noteCaller = None
                for attrib,value in element.items():
                    if attrib=='style':
                        noteStyle = value # This is basically the USFM marker name
                        assert( noteStyle in ('x','f',) )
                    elif attrib=='caller':
                        noteCaller = value # Usually hyphen or a symbol to be used for the note
                    else:
                        logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                assert( noteStyle and noteCaller ) # both compulsory
                noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                for subelement in element:
                    sublocation = subelement.tag + ' ' + location
                    if subelement.tag == 'char': # milestone (not a container)
                        BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                        # Process the attributes first
                        charStyle, charClosed = None, True
                        for attrib,value in subelement.items():
                            if attrib=='style':
                                charStyle = value
                            elif attrib=='closed':
                                assert( value=='false' )
                                charClosed = False
                            else:
                                logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                        noteLine += "\\{} {}".format( charStyle, subelement.text )
                        if charClosed: noteLine += "\\{}*".format( charStyle )
                        noteLine += '' if subelement.tail is None else subelement.tail.strip()
                    elif subelement.tag == 'unmatched': # Used to denote errors in the source text
                        BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation )
                        BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                        # Process the attributes first
                        unmmatchedMarker = None
                        for attrib,value in subelement.items():
                            if attrib=='marker':
                                unmmatchedMarker = value
                            else:
                                logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                        self.addPriorityError( 2, c, v, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) )
                    else:
                        logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) )
                        self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                        # Don't lose any text that follows the subelement that we skipped
                        if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                noteLine += "\\{}*".format( noteStyle )
                if element.tail:
                    noteText = element.tail.strip()
                    noteLine += noteText
                self.appendToLastLine( noteLine )
            elif element.tag == 'link': # Used to include extra resources
                BibleOrgSysGlobals.checkXMLNoText( element, location )
                BibleOrgSysGlobals.checkXMLNoTail( element, location )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                # Process the attributes first
                linkStyle = linkDisplay = linkTarget = None
                for attrib,value in element.items():
                    if attrib=='style':
                        linkStyle = value
                        assert( linkStyle in ('jmp',) )
                    elif attrib=='display':
                        linkDisplay = value # e.g., "click here"
                    elif attrib=='target':
                        linkTarget = value # e.g., some reference
                    else:
                        logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                # Links are not converted -- they are only recorded as a priority error
                self.addPriorityError( 3, c, v, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) )
            elif element.tag == 'unmatched': # Used to denote errors in the source text
                BibleOrgSysGlobals.checkXMLNoText( element, location )
                BibleOrgSysGlobals.checkXMLNoTail( element, location )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                self.addPriorityError( 2, c, v, _("Unmatched element in {}").format( location) )
            else:
                logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, c, v, location ) )
                self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
                # Show the last few lines that were loaded to help locate the problem
                for x in range(max(0,len(self)-10),len(self)):
                    print( x, self._rawLines[x] )
                # NOTE(review): 'halt' is not defined in this function -- presumably a deliberate
                #   way of stopping with a NameError when debugging; confirm
                if BibleOrgSysGlobals.debugFlag: halt
    # end of loadParagraph

    if BibleOrgSysGlobals.verbosityLevel > 2: print( " " + _("Loading {}...").format( filename ) )
    self.isOneChapterBook = self.BBB in BibleOrgSysGlobals.BibleBooksCodes.getSingleChapterBooksList()
    self.sourceFilename = filename
    self.sourceFolder = folder
    self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
    self.tree = ElementTree().parse( self.sourceFilepath )
    assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

    c = v = '0'
    loadErrors = []
    lastMarker = None

    # Find the main container
    if self.tree.tag=='usx' or self.tree.tag=='usfm': # Not sure why both are allowable
        location = "USX ({}) file".format( self.tree.tag )
        BibleOrgSysGlobals.checkXMLNoText( self.tree, location )
        BibleOrgSysGlobals.checkXMLNoTail( self.tree, location )

        # Process the attributes first
        self.schemaLocation = ''
        version = None
        for attrib,value in self.tree.items():
            if attrib=='version':
                version = value
            else: # Only warn about the attributes that we really didn't process
                logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
        if version not in ( None, '2.0' ):
            logging.warning( _("Not sure if we can handle v{} USX files").format( version ) )

        # Now process the data
        for element in self.tree:
            sublocation = element.tag + " " + location
            if element.tag == 'book': # milestone (not a container)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                # Process the attributes
                idField = bookStyle = None
                for attrib,value in element.items():
                    if attrib=='id' or attrib=='code':
                        idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode)
                    elif attrib=='style':
                        bookStyle = value
                    else:
                        logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                if bookStyle != 'id':
                    logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) )
                idLine = idField
                if element.text and element.text.strip(): idLine += ' ' + element.text
                self.addLine( 'id', idLine )
            elif element.tag == 'chapter': # milestone (not a container)
                v = '0' # Each new chapter restarts the verse reference
                BibleOrgSysGlobals.checkXMLNoText( element, sublocation )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                # Process the attributes
                chapterStyle = None
                for attrib,value in element.items():
                    if attrib=='number':
                        c = value
                    elif attrib=='style':
                        chapterStyle = value
                    else:
                        logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                if chapterStyle != 'c':
                    logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) )
                self.addLine( 'c', c )
            elif element.tag == 'para':
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style
                if BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( USFMMarker ):
                    loadParagraph( element, sublocation )
                elif BibleOrgSysGlobals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker
                    text = element.text
                    if text is None: text = ''
                    if BibleOrgSysGlobals.debugFlag:
                        print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, c, v, USFMMarker, text ) )
                    if text:
                        loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, c, v, USFMMarker, text ) )
                        logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, c, v, text ) )
                    else: # no text
                        loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.BBB, c, v, USFMMarker ) )
                        logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, c, v ) )
                    self.addPriorityError( 97, c, v, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) )
                    # NOTE(review): this only updates the pending lastText -- nothing is
                    #   appended to the previous line here; confirm that's intended
                    lastText = '\\' + USFMMarker + ' ' + text
                else: # the line begins with an unknown USFM Marker
                    # Normalise a missing text to '' so that it can be safely concatenated below
                    text = element.text
                    if text is None: text = ''
                    if text:
                        loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.BBB, c, v, USFMMarker, text ) )
                        logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, c, v, text ) )
                    else: # no text
                        loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.BBB, c, v, USFMMarker ) )
                        logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, c, v ) )
                    self.addPriorityError( 100, c, v, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) )
                    # NOTE(review): sortedNLMarkers isn't defined in this function -- presumably
                    #   a module-level list of newline markers; confirm
                    # NOTE(review): a line saved in lastMarker/lastText is only written when the
                    #   next unknown marker is changed (never at the end of the file); confirm
                    for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                        if USFMMarker.startswith( tryMarker ): # Let's try changing it
                            if lastMarker: self.addLine( lastMarker, lastText )
                            lastMarker, lastText = tryMarker, USFMMarker[len(tryMarker):] + ' ' + text
                            loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to {!r} at beginning of line: {}").format( self.BBB, c, v, USFMMarker, tryMarker, text ) )
                            logging.warning( _("Changed '\\{}' unknown USFM Marker to {!r} after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.BBB, c, v, text ) )
                            break
                    # Otherwise, don't bother processing this line -- it'll just cause more problems later on
            else:
                logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, c, v, sublocation ) )
                self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )

    if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
def __validate( self ):
    """
    Check/validate the loaded data.

    Goes through each element of self._XMLtree and, for those with the
    main element tag, checks the compulsory, optional and unexpected
    attributes and subelements, plus the attributes/elements that must
    contain unique values. Elements with any other tag are reported as
    unexpected.

    Problems are reported through the logging module (and whatever the
    BibleOrgSysGlobals.checkXML... functions do) -- nothing is returned.
    """
    assert self._XMLtree

    # One list per unique element/attribute name to remember the values already seen
    uniqueDict = {}
    for elementName in self._uniqueElements: uniqueDict["Element_"+elementName] = []
    for attributeName in self._uniqueAttributes: uniqueDict["Attribute_"+attributeName] = []

    for j,element in enumerate(self._XMLtree):
        if element.tag == self._mainElementTag:
            BibleOrgSysGlobals.checkXMLNoText( element, element.tag )
            BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
            if not self._compulsoryAttributes and not self._optionalAttributes:
                BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
            if not self._compulsoryElements and not self._optionalElements:
                BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

            # Check compulsory attributes on this main element
            for attributeName in self._compulsoryAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is None:
                    logging.error( _("Compulsory {!r} attribute is missing from {} element in record {}").format( attributeName, element.tag, j ) )
                elif not attributeValue: # Present but empty -- a missing attribute is only reported once (above)
                    logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, j ) )

            # Check optional attributes on this main element
            for attributeName in self._optionalAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is not None and not attributeValue:
                    logging.warning( _("Optional {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, j ) )

            # Check for unexpected additional attributes on this main element
            for attributeName in element.keys():
                attributeValue = element.get( attributeName )
                if attributeName not in self._compulsoryAttributes and attributeName not in self._optionalAttributes:
                    logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {}").format( attributeName, attributeValue, element.tag, j ) )

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self._uniqueAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is not None:
                    if attributeValue in uniqueDict["Attribute_"+attributeName]:
                        logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {}").format( attributeValue, attributeName, element.tag, j ) )
                    uniqueDict["Attribute_"+attributeName].append( attributeValue )

            # Get the marker to use as a record ID
            #   (left as None if there's no marker subelement, so that the checks
            #    below can still report the problem rather than failing right here)
            markerElement = element.find( "marker" )
            marker = None if markerElement is None else markerElement.text

            # Check compulsory elements
            for elementName in self._compulsoryElements:
                foundElement = element.find( elementName )
                if foundElement is None:
                    logging.error( _("Compulsory {!r} element is missing in record with marker {!r} (record {})").format( elementName, marker, j ) )
                elif not foundElement.text:
                    logging.warning( _("Compulsory {!r} element is blank in record with marker {!r} (record {})").format( elementName, marker, j ) )

            # Check optional elements
            for elementName in self._optionalElements:
                foundElement = element.find( elementName )
                if foundElement is not None and not foundElement.text:
                    logging.warning( _("Optional {!r} element is blank in record with marker {!r} (record {})").format( elementName, marker, j ) )

            # Check for unexpected additional elements
            for subelement in element:
                if subelement.tag not in self._compulsoryElements and subelement.tag not in self._optionalElements:
                    logging.warning( _("Additional {!r} element ({!r}) found in record with marker {!r} (record {})").format( subelement.tag, subelement.text, marker, j ) )

            # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
            for elementName in self._uniqueElements:
                foundElement = element.find( elementName )
                if foundElement is not None:
                    text = foundElement.text
                    if text in uniqueDict["Element_"+elementName]:
                        logging.error( _("Found {!r} data repeated in {!r} element in record with marker {!r} (record {})").format( text, elementName, marker, j ) )
                    uniqueDict["Element_"+elementName].append( text )
        else:
            logging.warning( _("Unexpected element: {} in record {}").format( element.tag, j ) )
        if element.tail is not None and element.tail.strip():
            logging.error( _("Unexpected {!r} tail data after {} element in record {}").format( element.tail, element.tag, j ) )
    if self._XMLtree.tail is not None and self._XMLtree.tail.strip():
        logging.error( _("Unexpected {!r} tail data after {} element").format( self._XMLtree.tail, self._XMLtree.tag ) )
def _validateSystem(self, punctuationTree, systemName):
    """
    Check the elements of the given punctuation system XML tree.

    For each element of punctuationTree whose tag is in
    self.mainElementTags, checks the compulsory, optional and unexpected
    attributes and subelements, plus the attributes/elements that must
    contain unique values. Elements with any other tag are reported as
    unexpected.

    The systemName parameter is accepted but not used by this function.

    Problems are reported through the logging module (and whatever the
    BibleOrgSysGlobals.checkXML... functions do) -- nothing is returned.
    """
    assert punctuationTree

    # One list per unique element/attribute name to remember the values already seen
    uniqueDict = {}
    for elementName in self.uniqueElements:
        uniqueDict["Element_" + elementName] = []
    for attributeName in self.uniqueAttributes:
        uniqueDict["Attribute_" + attributeName] = []

    for k, element in enumerate(punctuationTree):
        if element.tag in self.mainElementTags:
            # The messages below need something to identify the record:
            #   no separate ID is read anywhere in here, so the tag is used
            ID = element.tag

            BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
            if not self.compulsoryAttributes and not self.optionalAttributes:
                BibleOrgSysGlobals.checkXMLNoAttributes(element, element.tag)
            if not self.compulsoryElements and not self.optionalElements:
                BibleOrgSysGlobals.checkXMLNoSubelements(element, element.tag)

            # Check compulsory attributes on this main element
            for attributeName in self.compulsoryAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is None:
                    logging.error(
                        _("Compulsory {!r} attribute is missing from {} element in record {}"
                          ).format(attributeName, element.tag, k))
                elif not attributeValue:  # Present but empty
                    logging.warning(
                        _("Compulsory {!r} attribute is blank on {} element in record {}"
                          ).format(attributeName, element.tag, k))

            # Check optional attributes on this main element
            for attributeName in self.optionalAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None and not attributeValue:
                    logging.warning(
                        _("Optional {!r} attribute is blank on {} element in record {}"
                          ).format(attributeName, element.tag, k))

            # Check for unexpected additional attributes on this main element
            for attributeName in element.keys():
                attributeValue = element.get(attributeName)
                if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                    logging.warning(
                        _("Additional {!r} attribute ({!r}) found on {} element in record {}"
                          ).format(attributeName, attributeValue, element.tag, k))

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self.uniqueAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None:
                    if attributeValue in uniqueDict["Attribute_" + attributeName]:
                        logging.error(
                            _("Found {!r} data repeated in {!r} field on {} element in record {}"
                              ).format(attributeValue, attributeName, element.tag, k))
                    uniqueDict["Attribute_" + attributeName].append(attributeValue)

            # Check compulsory elements
            for elementName in self.compulsoryElements:
                foundElement = element.find(elementName)
                if foundElement is None:
                    logging.error(
                        _("Compulsory {!r} element is missing in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))
                elif not foundElement.text:  # Can only be blank if it's actually there
                    logging.warning(
                        _("Compulsory {!r} element is blank in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))

            # Check optional elements
            for elementName in self.optionalElements:
                foundElement = element.find(elementName)
                if foundElement is not None and not foundElement.text:
                    logging.warning(
                        _("Optional {!r} element is blank in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))

            # Check for unexpected additional elements
            for subelement in element:
                if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                    logging.warning(
                        _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})"
                          ).format(subelement.tag, subelement.text, ID, k))

            # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
            for elementName in self.uniqueElements:
                foundElement = element.find(elementName)
                if foundElement is not None:
                    text = foundElement.text
                    if text in uniqueDict["Element_" + elementName]:
                        logging.error(
                            _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})"
                              ).format(text, elementName, ID, k))
                    uniqueDict["Element_" + elementName].append(text)
        else:
            logging.warning(
                _("Unexpected element: {} in record {}").format(
                    element.tag, k))
def __validateSystem( self, systemName ):
    """
    Checks for basic formatting/content errors in a Bible book name system.

    systemName is the key into self.__XMLSystems of the system to check.

    First checks that the system's language code has three characters.
    Then, for each element of the system's tree whose tag is in
    self.mainElementTags, checks the compulsory, optional and unexpected
    attributes and subelements, plus the attributes/elements that must
    contain unique values -- using the lists for that particular tag
    (selected by the position of the tag in self.mainElementTags).
    Elements with any other tag are reported as unexpected.

    Problems are reported through the logging module (and whatever the
    BibleOrgSysGlobals.checkXML... functions do) -- nothing is returned.
    """
    assert systemName
    assert self.__XMLSystems[systemName]['tree']

    if len(self.__XMLSystems[systemName]["languageCode"]) != 3:
        logging.error( _("Couldn't find 3-letter language code in {!r} book names system").format( systemName ) )

    # One list per main tag index and unique element/attribute name to remember the values already seen
    uniqueDict = {}
    for index in range( len(self.mainElementTags) ):
        for elementName in self.uniqueElements[index]: uniqueDict["Element_"+str(index)+"_"+elementName] = []
        for attributeName in self.uniqueAttributes[index]: uniqueDict["Attribute_"+str(index)+"_"+attributeName] = []

    for k,element in enumerate(self.__XMLSystems[systemName]['tree']):
        if element.tag in self.mainElementTags:
            BibleOrgSysGlobals.checkXMLNoText( element, element.tag )
            BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
            # NOTE(review): these two tests look at the outer (per-tag) sequences
            #   rather than at the entry for this element's tag -- confirm that's intended
            if not self.compulsoryAttributes and not self.optionalAttributes:
                BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
            if not self.compulsoryElements and not self.optionalElements:
                BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

            index = self.mainElementTags.index( element.tag ) # Selects the lists for this kind of element

            # Check compulsory attributes on this main element
            for attributeName in self.compulsoryAttributes[index]:
                attributeValue = element.get( attributeName )
                if attributeValue is None:
                    logging.error( _("Compulsory {!r} attribute is missing from {} element in record {} in {}").format( attributeName, element.tag, k, systemName ) )
                elif not attributeValue: # Present but empty
                    logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {} in {}").format( attributeName, element.tag, k, systemName ) )

            # Check optional attributes on this main element
            for attributeName in self.optionalAttributes[index]:
                attributeValue = element.get( attributeName )
                if attributeValue is not None and not attributeValue:
                    logging.warning( _("Optional {!r} attribute is blank on {} element in record {} in {}").format( attributeName, element.tag, k, systemName ) )

            # Check for unexpected additional attributes on this main element
            for attributeName in element.keys():
                attributeValue = element.get( attributeName )
                if attributeName not in self.compulsoryAttributes[index] and attributeName not in self.optionalAttributes[index]:
                    logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {} in {}").format( attributeName, attributeValue, element.tag, k, systemName ) )

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self.uniqueAttributes[index]:
                attributeValue = element.get( attributeName )
                if attributeValue is not None:
                    if attributeValue in uniqueDict["Attribute_"+str(index)+"_"+attributeName]:
                        logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {} in {}").format( attributeValue, attributeName, element.tag, k, systemName ) )
                    uniqueDict["Attribute_"+str(index)+"_"+attributeName].append( attributeValue )

            # Check compulsory elements
            for elementName in self.compulsoryElements[index]:
                foundElement = element.find( elementName )
                if foundElement is None:
                    logging.error( _("Compulsory {!r} element is missing (record {}) in {}").format( elementName, k, systemName ) )
                elif not foundElement.text: # Can only be blank if it's actually there
                    logging.warning( _("Compulsory {!r} element is blank (record {}) in {}").format( elementName, k, systemName ) )

            # Check optional elements
            for elementName in self.optionalElements[index]:
                foundElement = element.find( elementName )
                if foundElement is not None and not foundElement.text:
                    logging.warning( _("Optional {!r} element is blank (record {}) in {}").format( elementName, k, systemName ) )

            # Check for unexpected additional elements
            for subelement in element:
                if subelement.tag not in self.compulsoryElements[index] and subelement.tag not in self.optionalElements[index]:
                    logging.warning( _("Additional {!r} element ({!r}) found (record {}) in {} {}").format( subelement.tag, subelement.text, k, systemName, element.tag ) )

            # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
            for elementName in self.uniqueElements[index]:
                foundElement = element.find( elementName )
                if foundElement is not None:
                    text = foundElement.text
                    if text in uniqueDict["Element_"+str(index)+"_"+elementName]:
                        # Repeated values are only reported at info level for BibleDivisionNames elements
                        myLogging = logging.info if element.tag == 'BibleDivisionNames' else logging.error
                        myLogging( _("Found {!r} data repeated in {!r} element (record {}) in {}").format( text, elementName, k, systemName ) )
                    uniqueDict["Element_"+str(index)+"_"+elementName].append( text )
        else:
            logging.warning( _("Unexpected element: {} in record {} in {}").format( element.tag, k, systemName ) )
def loadFootnote( self, element, location, BBB, C, V ):
    """
    Handles footnote fields, including xt field.

    Appends the footnote to the last line of self.thisBook: first the
    opening \\f marker with the caller and any text, then each of the
    subelements (with their own subelements), and finally the closing
    \\f* marker followed by any tail text.

    BBB, C and V are passed on to self.loadCharacterFormatting() and
    are used in the 'Ignored marker' messages.
    """
    # The subelement tags that get handed over to loadCharacterFormatting()
    characterFormattingMarkers = ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',)

    noteText = clean(element.text)
    noteTail = clean(element.tail)

    # Only the caller attribute is expected on the footnote itself
    noteCaller = None
    for attributeName,attributeValue in element.items():
        if attributeName != 'caller':
            logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( attributeName, attributeValue, location ) )
        else:
            noteCaller = attributeValue
    openingText = (' '+noteText) if noteText else ''
    self.thisBook.appendToLastLine( ' \\f {}{}'.format( noteCaller, openingText ) )

    for child in element:
        childLocation = child.tag + " of " + location
        marker = child.tag
        childText, childTail = clean(child.text), clean(child.tail)
        if BibleOrgSysGlobals.debugFlag:
            assert( marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq','xt',) )

        if marker != 'ref':
            BibleOrgSysGlobals.checkXMLNoAttributes( child, childLocation, 'dq54' )
            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, childText ) )
            if marker=='xt' or marker[0]=='f': # Starts with f, e.g., fr, ft
                for grandchild in child:
                    grandchildLocation = grandchild.tag + " of " + childLocation
                    marker2 = grandchild.tag
                    grandchildText, grandchildTail = clean(grandchild.text), clean(grandchild.tail)
                    BibleOrgSysGlobals.checkXMLNoSubelements( grandchild, grandchildLocation, 'js72' )
                    if marker2 == 'ref':
                        # Here the text is written before the target
                        if grandchildText:
                            self.thisBook.appendToLastLine( grandchildText )
                        refTarget = None
                        for attributeName,attributeValue in grandchild.items():
                            if attributeName != 'tgt':
                                logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( attributeName, attributeValue, grandchildLocation ) )
                            else:
                                refTarget = attributeValue # OSIS style reference, e.g., '1SA.27.8'
                        if refTarget:
                            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, refTarget ) )
                        elif debuggingThisModule:
                            halt
                    elif marker2 in characterFormattingMarkers:
                        self.loadCharacterFormatting( grandchild, grandchildLocation, BBB, C, V )
                    else:
                        print( 'Ignored marker2', repr(marker2), BBB, C, V )
                        if debuggingThisModule:
                            halt
                    if grandchildTail:
                        self.thisBook.appendToLastLine( grandchildTail )
            elif marker in characterFormattingMarkers:
                self.loadCharacterFormatting( child, childLocation, BBB, C, V )
            else:
                print( 'Ignored marker', repr(marker), BBB, C, V )
                halt
        else: # it's a ref directly inside the footnote
            assert( childText )
            BibleOrgSysGlobals.checkXMLNoSubelements( child, childLocation, 'ls13' )
            refTarget = None
            for attributeName,attributeValue in child.items():
                if attributeName != 'tgt':
                    logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( attributeName, attributeValue, childLocation ) )
                else:
                    refTarget = attributeValue
            if not refTarget:
                halt
            else:
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, refTarget, marker, childText ) )

        # A closing marker is only written for the subelement if there's some tail text after it
        if childTail:
            self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, childTail ) )

    closingText = (' '+noteTail) if noteTail else ''
    self.thisBook.appendToLastLine( '\\f*{}'.format( closingText ) )
def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
    """
    Check/validate one XML verse element and save its text into thisBook.

    BBB and chapterNumber are only used for messages.
    thisBook receives the output through addLine(): a 'v' line holding
        the verse number and text, plus an 'm' line for the text after
        each line break element.
    verse is the XML verse element; its number is read from the
        'vnumber' attribute.

    Styled portions are embedded in the verse text as character fields
        (\\it, \\bdit, \\em, \\add, \\nd, or \\sc for unknown styles).
        The text of note elements is currently not saved.
    """
    def styleToSFM( css, idStyle ):
        """
        Return the backslash marker for a recognised css/id style, else None.
        """
        if css in ('font-style:italic','font-style:italic;'): return '\\it'
        if css == "font-style:italic;font-weight:bold": return '\\bdit'
        if css == "color:#FF0000": return '\\em'
        if css == "font-size: x-small; color:#8B8378": return '\\add'
        if css is None and idStyle == 'cl:divineName': return '\\nd'
        return None
    # end of styleToSFM

    def getStyleAttributes( styleElement, description ):
        """
        Return (css, idStyle) for a style element, warning about anything else.
        """
        css = idStyle = None
        for attrib,value in styleElement.items():
            if attrib == 'css': css = value
            elif attrib == 'id': idStyle = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in style {}".format( attrib, value, description ) )
        return css, idStyle
    # end of getStyleAttributes

    def styledText( SFM, css, sText ):
        """
        Wrap sText in the given character marker (or in \\sc if the style was unknown).
        """
        if SFM: return SFM+' ' + sText + SFM+'*'
        return '\\sc ' + '['+(css if css else '')+']' + sText + '\\sc* ' # Use sc for unknown styles
    # end of styledText

    if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") )

    location = "verse in {} {}".format( BBB, chapterNumber )
    BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

    # Handle verse attributes
    verseNumber = None
    for attrib,value in verse.items():
        if attrib == 'vnumber': verseNumber = value
        else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
    if BibleOrgSysGlobals.debugFlag: assert verseNumber
    location = "{}:{}".format( location, verseNumber ) # Get a better location description

    # The text is None if the verse starts immediately with a style or note,
    #   so always start from a string because the subelement handling below appends to it
    vText = verse.text.strip() if verse.text else ''

    # Handle verse subelements (notes and styled portions)
    for subelement in verse:
        if subelement.tag == ZefaniaXMLBible.noteTag:
            sublocation = "note in " + location
            noteType = None
            for attrib,value in subelement.items():
                if attrib == 'type': noteType = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
            if noteType not in ('n-studynote','x-studynote',):
                logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) )
            if BibleOrgSysGlobals.debugFlag: assert noteType
            # XXXXXXXXXXXXXXXXXXXXXXXXXX Losing data here (for now) -- the note's own text is not saved
            nTail = subelement.tail
            if nTail:
                if '\n' in nTail:
                    print( "ZefaniaXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) )
                    nTail = nTail.replace( '\n', ' ' )
                # NOTE(review): this line is added before the verse's own 'v' line
                #   (which is only written at a line break or at the end) -- confirm that this is intended
                thisBook.addLine( 'v~', nTail )
            for sub2element in subelement:
                if sub2element.tag == ZefaniaXMLBible.styleTag:
                    sub2location = "style in " + sublocation
                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'fyt4' )
                    css, idStyle = getStyleAttributes( sub2element, 'sub2element' )
                    if BibleOrgSysGlobals.debugFlag: assert css or idStyle
                    SFM = styleToSFM( css, idStyle )
                    if SFM is None:
                        logging.error( "Ignored1 css is {!r} idStyle is {!r}".format( css, idStyle ) )
                        if BibleOrgSysGlobals.debugFlag: halt
                    # Text and tail can both be None (same guards as for styles directly in the verse)
                    sText = sub2element.text.strip() if sub2element.text else ''
                    sTail = sub2element.tail
                    if BibleOrgSysGlobals.debugFlag: assert sText
                    vText += styledText( SFM, css, sText )
                    if sTail: vText += sTail.strip()
                else:
                    logging.error( "Expected to find {} but got {!r} in {}".format( ZefaniaXMLBible.styleTag, sub2element.tag, sublocation ) )

        elif subelement.tag == ZefaniaXMLBible.styleTag:
            sublocation = "style in " + location
            css, idStyle = getStyleAttributes( subelement, 'subelement' )
            if BibleOrgSysGlobals.debugFlag: assert css or idStyle
            SFM = styleToSFM( css, idStyle )
            if SFM is None:
                logging.error( "Ignored2 css is {!r} idStyle is {!r}".format( css, idStyle ) )
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
            for sub2element in subelement:
                if sub2element.tag == ZefaniaXMLBible.grTag:
                    sub2location = "gr in " + sublocation
                    BibleOrgSysGlobals.checkXMLNoAttributes( sub2element, sub2location, 'ks12' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'dl36' )
                    BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'js24' )
                    grText = sub2element.text.strip() if sub2element.text else ''
                    logging.error( "Unfinished to process 'gr' {!r} sub2element ({}) in style subelement".format( grText, sublocation ) )
                else:
                    logging.error( "Expected to find 'gr' but got {!r} in {}".format( sub2element.tag, sublocation ) )
            sText = subelement.text.strip() if subelement.text else ''
            sTail = subelement.tail.strip() if subelement.tail else None
            if BibleOrgSysGlobals.debugFlag and debuggingThisModule: assert sText
            vText += styledText( SFM, css, sText )
            if sTail: vText += sTail

        elif subelement.tag == ZefaniaXMLBible.breakTag:
            sublocation = "line break in " + location
            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' )
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' )
            art = None
            for attrib,value in subelement.items():
                if attrib == 'art': art = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
            if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl'
            # Save what we have collected so far, then start a new line for the text after the break
            if vText:
                if '\n' in vText:
                    logging.warning( "ZefaniaXMLBible.__validateAndExtractVerse_a: newline in vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                    vText = vText.replace( '\n', ' ' )
                thisBook.addLine( 'v', verseNumber + ' ' + vText )
                vText = ''
            breakText = subelement.tail.strip() if subelement.tail else ''
            if '\n' in breakText:
                logging.warning( "ZefaniaXMLBible.__validateAndExtractVerse: newline in breakText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, breakText ) )
                breakText = breakText.replace( '\n', ' ' )
            thisBook.addLine( 'm', breakText )

        elif subelement.tag == ZefaniaXMLBible.divTag:
            sublocation = "div break in " + location
            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'ld46' )
            BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'kx10' )
            BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'las9' )
            for sub2element in subelement:
                if sub2element.tag == 'NOTE':
                    sub2location = "NOTE in " + sublocation
                    BibleOrgSysGlobals.checkXMLNoAttributes( sub2element, sub2location, 'lc35' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'ks27' )
                    BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'ksd1' )
                    noteText = sub2element.text.strip() if sub2element.text else ''
                    vText += '\\f {}\\f*'.format( noteText )
                else:
                    logging.error( "Expected to find 'NOTE' but got {!r} in {}".format( sub2element.tag, sublocation ) )

        elif subelement.tag == ZefaniaXMLBible.grTag:
            sublocation = "gr in " + location
            BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'ksd2' )
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'ls10' )
            BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'cg27' )
            grText = subelement.text.strip() if subelement.text else ''
            logging.error( "Unfinished to process 'gr' {!r} subelement ({}) in style subelement".format( grText, location ) )

        else:
            logging.error( "Expected to find NOTE or STYLE or BREAK or DIV but got {!r} in {}".format( subelement.tag, location ) )

    if vText: # This is the main text of the verse (follows the verse milestone)
        if '\n' in vText:
            logging.warning( "ZefaniaXMLBible.__validateAndExtractVerse_b: newline in vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
            vText = vText.replace( '\n', ' ' )
        thisBook.addLine( 'v', verseNumber + ' ' + vText )
def __validateSystem(self, systemName):
    """
    Check for basic formatting/content errors in a Bible book name system.

    Walks the records of the already-loaded XML tree for systemName and
    logs (never raises for) the following problems on each main element:
    missing or blank compulsory attributes/elements, blank optional ones,
    unexpected additional attributes/elements, and repeated values in
    attributes/elements that are required to be unique.

    The per-tag expectations are read from self.mainElementTags and the
    parallel lists self.compulsoryAttributes, self.optionalAttributes,
    self.uniqueAttributes, self.compulsoryElements, self.optionalElements
    and self.uniqueElements (all indexed by the position of the tag in
    self.mainElementTags).
    """
    assert systemName
    systemTree = self.__XMLSystems[systemName]['tree']
    assert systemTree

    if len(self.__XMLSystems[systemName]["languageCode"]) != 3:
        logging.error(
            _("Couldn't find 3-letter language code in {!r} book names system"
              ).format(systemName))
    # (Checking the language code against the ISO-639-3 list is currently disabled)

    # One list of already-seen values per unique element/attribute of each main tag
    uniqueDict = {}
    for index in range(len(self.mainElementTags)):
        for elementName in self.uniqueElements[index]:
            uniqueDict["Element_" + str(index) + "_" + elementName] = []
        for attributeName in self.uniqueAttributes[index]:
            uniqueDict["Attribute_" + str(index) + "_" + attributeName] = []

    for k, element in enumerate(systemTree):
        if element.tag not in self.mainElementTags:
            logging.warning(
                _("Unexpected element: {} in record {} in {}").format(
                    element.tag, k, systemName))
            continue

        BibleOrgSysGlobals.checkXMLNoText(element, element.tag)
        BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
        if not self.compulsoryAttributes and not self.optionalAttributes:
            BibleOrgSysGlobals.checkXMLNoAttributes(element, element.tag)
        if not self.compulsoryElements and not self.optionalElements:
            BibleOrgSysGlobals.checkXMLNoSubelements(element, element.tag)

        index = self.mainElementTags.index(element.tag)
        compulsoryAttributes = self.compulsoryAttributes[index]
        optionalAttributes = self.optionalAttributes[index]
        compulsoryElements = self.compulsoryElements[index]
        optionalElements = self.optionalElements[index]

        # Check compulsory attributes on this main element
        for attributeName in compulsoryAttributes:
            attributeValue = element.get(attributeName)
            if attributeValue is None:
                logging.error(
                    _("Compulsory {!r} attribute is missing from {} element in record {} in {}"
                      ).format(attributeName, element.tag, k, systemName))
            elif not attributeValue:  # Present but empty (not also reported as missing)
                logging.warning(
                    _("Compulsory {!r} attribute is blank on {} element in record {} in {}"
                      ).format(attributeName, element.tag, k, systemName))

        # Check optional attributes on this main element
        for attributeName in optionalAttributes:
            attributeValue = element.get(attributeName)
            if attributeValue is not None and not attributeValue:
                logging.warning(
                    _("Optional {!r} attribute is blank on {} element in record {} in {}"
                      ).format(attributeName, element.tag, k, systemName))

        # Check for unexpected additional attributes on this main element
        for attributeName, attributeValue in element.items():
            if (attributeName not in compulsoryAttributes
                    and attributeName not in optionalAttributes):
                logging.warning(
                    _("Additional {!r} attribute ({!r}) found on {} element in record {} in {}"
                      ).format(attributeName, attributeValue, element.tag,
                               k, systemName))

        # Check the attributes that must contain unique information
        #   (in that particular field -- doesn't check across different attributes)
        for attributeName in self.uniqueAttributes[index]:
            attributeValue = element.get(attributeName)
            if attributeValue is not None:
                seenValues = uniqueDict["Attribute_" + str(index) + "_" + attributeName]
                if attributeValue in seenValues:
                    logging.error(
                        _("Found {!r} data repeated in {!r} field on {} element in record {} in {}"
                          ).format(attributeValue, attributeName,
                                   element.tag, k, systemName))
                seenValues.append(attributeValue)

        # Check compulsory elements
        for elementName in compulsoryElements:
            # Must compare with None: an element without children is falsy
            subelement = element.find(elementName)
            if subelement is None:
                logging.error(
                    _("Compulsory {!r} element is missing (record {}) in {}"
                      ).format(elementName, k, systemName))
            elif not subelement.text:  # Only look at the text if the element exists
                logging.warning(
                    _("Compulsory {!r} element is blank (record {}) in {}"
                      ).format(elementName, k, systemName))

        # Check optional elements
        for elementName in optionalElements:
            subelement = element.find(elementName)
            if subelement is not None and not subelement.text:
                logging.warning(
                    _("Optional {!r} element is blank (record {}) in {}"
                      ).format(elementName, k, systemName))

        # Check for unexpected additional elements
        for subelement in element:
            if (subelement.tag not in compulsoryElements
                    and subelement.tag not in optionalElements):
                logging.warning(
                    _("Additional {!r} element ({!r}) found (record {}) in {} {}"
                      ).format(subelement.tag, subelement.text, k,
                               systemName, element.tag))

        # Check the elements that must contain unique information
        #   (in that particular element -- doesn't check across different elements)
        for elementName in self.uniqueElements[index]:
            subelement = element.find(elementName)
            if subelement is not None:
                text = subelement.text
                seenValues = uniqueDict["Element_" + str(index) + "_" + elementName]
                if text in seenValues:
                    # Repeats are only informational for division names
                    myLogging = logging.info if element.tag == 'BibleDivisionNames' else logging.error
                    myLogging(
                        _("Found {!r} data repeated in {!r} element (record {}) in {}"
                          ).format(text, elementName, k, systemName))
                seenValues.append(text)
def load( self ):
    """
    Load a single source XML file and load book elements.

    Parses self.sourceFilepath into self.tree, stores the revision,
        title, font and copyright text in self.suppliedMetadata['VerseView'],
        and passes each book element (with a running book number,
        starting at 1) to __validateAndExtractBook().
    Unexpected elements (or an unexpected root tag) are logged as errors.
    Finishes by calling applySuppliedMetadata() and doPostLoadProcessing().
    """
    def checkSimpleElement( element, sublocation ):
        """
        Do the checks shared by all of the simple text-only header elements.
        """
        BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'jk86' )
        BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'hjk7' )
        BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'bh09' )
    # end of checkSimpleElement

    if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}…").format( self.sourceFilepath ) )
    self.tree = ElementTree().parse( self.sourceFilepath )
    if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all

    if self.suppliedMetadata is None: self.suppliedMetadata = {}
    self.suppliedMetadata['VerseView'] = {}
    metadata = self.suppliedMetadata['VerseView'] # Same dictionary -- just a shorter name

    # Find the main (bible) container
    if self.tree.tag == VerseViewXMLBible.treeTag:
        location = "VerseView XML file"
        BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
        BibleOrgSysGlobals.checkXMLNoAttributes( self.tree, location, 'js24' )
        BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

        # Find the submain (various info and then book) containers
        bookNumber = 0
        for element in self.tree:
            if element.tag == VerseViewXMLBible.filenameTag:
                checkSimpleElement( element, "filename in " + location )
                # (The filename text itself is not saved)
            elif element.tag == VerseViewXMLBible.revisionTag:
                checkSimpleElement( element, "revision in " + location )
                metadata['Revision'] = element.text
            elif element.tag == VerseViewXMLBible.titleTag:
                checkSimpleElement( element, "title in " + location )
                metadata['Title'] = element.text
            elif element.tag == VerseViewXMLBible.fontTag:
                checkSimpleElement( element, "font in " + location )
                metadata['Font'] = element.text
            elif element.tag == VerseViewXMLBible.copyrightTag:
                checkSimpleElement( element, "copyright in " + location )
                metadata['Copyright'] = element.text
            elif element.tag == VerseViewXMLBible.sizefactorTag:
                checkSimpleElement( element, "sizefactor in " + location )
                if BibleOrgSysGlobals.debugFlag: assert element.text == '1'
            elif element.tag == VerseViewXMLBible.bookTag:
                sublocation = "book in " + location
                BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' )
                bookNumber += 1
                self.__validateAndExtractBook( element, bookNumber )
            else:
                logging.error( "xk15 Expected to find {!r} but got {!r}".format( VerseViewXMLBible.bookTag, element.tag ) )
    else:
        logging.error( "Expected to load {!r} but got {!r}".format( VerseViewXMLBible.treeTag, self.tree.tag ) )

    if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
        # These are all compulsory so they should all exist, but use get()
        #   so that a faulty file gives None here rather than a KeyError
        print( "Revision is {!r}".format( metadata.get( 'Revision' ) ) )
        print( "Title is {!r}".format( metadata.get( 'Title' ) ) )
        print( "Font is {!r}".format( metadata.get( 'Font' ) ) )
        print( "Copyright is {!r}".format( metadata.get( 'Copyright' ) ) )

    self.applySuppliedMetadata( 'VerseView' ) # Copy some to self.settingsDict
    self.doPostLoadProcessing()
def load( self, filename, folder=None, encoding='utf-8' ):
    """
    Load a single source USX XML file and extract the information.

    filename (joined to folder, if one is given) is parsed into self.tree.
        The encoding parameter is only used in a debug message here.
    The book, chapter and para elements are converted to marker/text
        lines with self.addLine() and self.appendToLastLine().
    Problems are logged, recorded with self.addPriorityError(), and the
        collected messages are saved in self.errorDictionary['Load Errors'].
    """
    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
        print( exp("load( {}, {}, {} )").format( filename, folder, encoding ) )

    def loadParagraph( paragraphXML, paragraphlocation ):
        """
        Load a paragraph from the USX XML.
        In this context, paragraph means heading and intro lines,
            as well as paragraphs of verses.

        Uses (and updates) C,V information from the containing function.
        """
        nonlocal C, V

        # Process the attributes first
        paragraphStyle = None
        for attrib,value in paragraphXML.items():
            if attrib=='style':
                paragraphStyle = value # This is basically the USFM marker name
            else:
                # Must use paragraphlocation here: the local 'location' is not assigned until the loop below
                logging.warning( _("CH46 Unprocessed {} attribute ({}) in {}").format( attrib, value, paragraphlocation ) )

        # Now process the paragraph text (or write a paragraph marker anyway)
        paragraphText = paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else ''
        if version is None: paragraphText = paragraphText.rstrip() # Don't need to strip extra spaces in v2
        self.addLine( paragraphStyle, paragraphText )

        # Now process the paragraph subelements
        for element in paragraphXML:
            location = element.tag + ' ' + paragraphlocation
            if element.tag == 'verse': # milestone (not a container)
                BibleOrgSysGlobals.checkXMLNoText( element, location )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                # Process the attributes first
                verseStyle = altNumber = None
                for attrib,value in element.items():
                    if attrib=='number': V = value
                    elif attrib=='style': verseStyle = value
                    elif attrib=='altnumber': altNumber = value
                    else:
                        logging.error( _("KR60 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if verseStyle != 'v':
                    logging.error( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                altStuff = ' \\va {}\\va*'.format( altNumber ) if altNumber else ''
                self.addLine( verseStyle, V + altStuff + ' ' )
                # Now process the tail (if there's one) which is the verse text
                if element.tail:
                    vText = element.tail
                    if vText[0]=='\n': vText = vText.lstrip() # Paratext puts cross references on a new line
                    if vText:
                        self.appendToLastLine( vText )

            elif element.tag == 'char':
                # Process the attributes first
                charStyle = None
                for attrib,value in element.items():
                    if attrib=='style':
                        charStyle = value # This is basically the USFM character marker name
                        assert not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle )
                    else:
                        logging.error( _("QU52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                charLine = "\\{} {} ".format( charStyle, element.text )
                # Now process the subelements -- chars are one of the few multiply embedded fields in USX
                for subelement in element:
                    sublocation = subelement.tag + ' ' + location
                    if subelement.tag == 'char': # milestone (not a container)
                        BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                        # Process the attributes first
                        subCharStyle, charClosed = None, True
                        for attrib,value in subelement.items():
                            if attrib=='style': subCharStyle = value
                            elif attrib=='closed':
                                assert value=='false'
                                charClosed = False
                            else:
                                logging.error( _("KS41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                        charLine += "\\{} {}".format( subCharStyle, subelement.text )
                        if charClosed: charLine += "\\{}*".format( subCharStyle )
                        charLine += '' if subelement.tail is None else subelement.tail
                    else:
                        logging.error( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                        self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                # A character field must be added to the previous field
                charTail = ''
                if element.tail:
                    charTail = element.tail
                    if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts footnote parts on new lines
                charLine += "\\{}*{}".format( charStyle, charTail )
                self.appendToLastLine( charLine )

            elif element.tag == 'note':
                # Process the attributes first
                noteStyle = noteCaller = None
                for attrib,value in element.items():
                    if attrib=='style':
                        noteStyle = value # This is basically the USFM marker name
                        assert noteStyle in ('x','f',)
                    elif attrib=='caller': noteCaller = value # Usually hyphen or a symbol to be used for the note
                    else:
                        logging.error( _("CY38 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if noteCaller=='' and self.BBB=='NUM' and C=='10' and V=='36': noteCaller = '+' # Hack
                assert noteStyle and noteCaller # both compulsory
                noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                if element.text:
                    noteText = element.text.strip()
                    noteLine += noteText
                # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                for subelement in element:
                    sublocation = subelement.tag + ' ' + location
                    if subelement.tag == 'char': # milestone (not a container)
                        # Process the attributes first
                        charStyle, charClosed = None, True
                        for attrib,value in subelement.items():
                            if attrib=='style': charStyle = value
                            elif attrib=='closed':
                                assert value=='false'
                                charClosed = False
                            else:
                                logging.warning( _("GJ67 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                        noteLine += "\\{} {}".format( charStyle, subelement.text )
                        # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                        for sub2element in subelement:
                            sub2location = sub2element.tag + ' ' + sublocation
                            if sub2element.tag == 'char': # milestone (not a container)
                                BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location )
                                # Process the attributes first
                                char2Style, char2Closed = None, True
                                for attrib,value in sub2element.items():
                                    if attrib=='style': char2Style = value
                                    elif attrib=='closed':
                                        assert value=='false'
                                        char2Closed = False
                                    else:
                                        logging.warning( _("VH36 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                                assert char2Closed
                                noteLine += "\\{} {}\\{}*{}".format( char2Style, sub2element.text, char2Style, sub2element.tail if sub2element.tail else '' )
                        if charClosed: noteLine += "\\{}*".format( charStyle )
                        # The tail of a char field is handled here (and only here, so that it can't be added twice)
                        if subelement.tail:
                            charTail = subelement.tail
                            if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts cross reference parts on a new line
                            noteLine += charTail
                        continue

                    if subelement.tag == 'unmatched': # Used to denote errors in the source text
                        BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation )
                        BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                        # Process the attributes first
                        unmmatchedMarker = None
                        for attrib,value in subelement.items():
                            if attrib=='marker': unmmatchedMarker = value
                            else:
                                logging.warning( _("NV21 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                        self.addPriorityError( 2, C, V, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) )
                    else:
                        logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                        self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                    # Don't lose any real text which follows a subelement that we couldn't process
                    if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                noteLine += "\\{}*".format( noteStyle )
                if element.tail:
                    noteTail = element.tail
                    if noteTail[0]=='\n': noteTail = noteTail.lstrip() # Paratext puts multiple cross-references on new lines
                    noteLine += noteTail
                self.appendToLastLine( noteLine )

            elif element.tag == 'link': # Used to include extra resources
                BibleOrgSysGlobals.checkXMLNoText( element, location )
                BibleOrgSysGlobals.checkXMLNoTail( element, location )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                # Process the attributes first
                linkStyle = linkDisplay = linkTarget = None
                for attrib,value in element.items():
                    if attrib=='style':
                        linkStyle = value
                        assert linkStyle in ('jmp',)
                    elif attrib=='display': linkDisplay = value # e.g., "click here"
                    elif attrib=='target': linkTarget = value # e.g., some reference
                    else:
                        logging.warning( _("KW54 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.addPriorityError( 3, C, V, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) )

            elif element.tag == 'unmatched': # Used to denote errors in the source text
                BibleOrgSysGlobals.checkXMLNoText( element, location )
                BibleOrgSysGlobals.checkXMLNoTail( element, location )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                self.addPriorityError( 2, C, V, _("Unmatched element in {}").format( location) )

            else:
                logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, location ) )
                self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) )
                # Show the last few lines that were loaded to help locate the problem
                for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                if BibleOrgSysGlobals.debugFlag: halt
    # end of loadParagraph

    C = V = '0'
    loadErrors = []
    lastMarker = None

    if BibleOrgSysGlobals.verbosityLevel > 3: print( "  " + _("Loading {} from {}…").format( filename, folder ) )
    elif BibleOrgSysGlobals.verbosityLevel > 2: print( "  " + _("Loading {}…").format( filename ) )
    self.isOneChapterBook = self.BBB in BibleOrgSysGlobals.BibleBooksCodes.getSingleChapterBooksList()
    self.sourceFilename = filename
    self.sourceFolder = folder
    self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
    try: self.tree = ElementTree().parse( self.sourceFilepath )
    except ParseError as err:
        logging.critical( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) )
        loadErrors.append( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) )
        self.addPriorityError( 100, C, V, _("Loader parse error in xml file {}: {}").format( filename, err ) )
    if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all

    # Find the main container
    if 'tree' in dir(self) \
    and ( self.tree.tag=='usx' or self.tree.tag=='usfm' ): # Not sure why both are allowable
        location = "USX ({}) file".format( self.tree.tag )
        BibleOrgSysGlobals.checkXMLNoText( self.tree, location )
        BibleOrgSysGlobals.checkXMLNoTail( self.tree, location )

        # Process the attributes first
        self.schemaLocation = ''
        version = None # Also read by loadParagraph() above
        for attrib,value in self.tree.items():
            if attrib=='version': version = value
            else:
                logging.warning( _("DG84 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
        if version not in ( None, '2.0' ):
            logging.warning( _("Not sure if we can handle v{} USX files").format( version ) )

        # Now process the data
        for element in self.tree:
            sublocation = element.tag + " " + location
            if element.tag == 'book': # milestone (not a container)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                # Process the attributes
                idField = bookStyle = None
                for attrib,value in element.items():
                    if attrib=='id' or attrib=='code':
                        idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode)
                    elif attrib=='style': bookStyle = value
                    else:
                        logging.warning( _("MD12 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                if bookStyle != 'id':
                    logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) )
                idLine = idField
                if element.text and element.text.strip(): idLine += ' ' + element.text
                self.addLine( 'id', idLine )

            elif element.tag == 'chapter': # milestone (not a container)
                V = '0'
                BibleOrgSysGlobals.checkXMLNoText( element, sublocation )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                # Process the attributes
                chapterStyle = pubNumber = None
                for attrib,value in element.items():
                    if attrib=='number': C = value
                    elif attrib=='style': chapterStyle = value
                    elif attrib=='pubnumber': pubNumber = value
                    else:
                        logging.error( _("LY76 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                if chapterStyle != 'c':
                    logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) )
                self.addLine( 'c', C )
                if pubNumber: self.addLine( 'cp', pubNumber )

            elif element.tag == 'para':
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style
                if BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( USFMMarker ):
                    loadParagraph( element, sublocation )
                elif BibleOrgSysGlobals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker
                    text = element.text
                    if text is None: text = ''
                    if BibleOrgSysGlobals.debugFlag:
                        print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                    if text:
                        loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                        logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) )
                    else: # no text
                        loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.BBB, C, V, USFMMarker ) )
                        logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) )
                    self.addPriorityError( 97, C, V, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) )
                    # NOTE(review): lastText is only ever written out together with lastMarker (see below),
                    #   so this text appears not to reach the book -- confirm whether it should be appended to the last line
                    lastText = '\\' + USFMMarker + ' ' + text
                else: # the line begins with an unknown USFM Marker
                    try: status = element.attrib['status']
                    except KeyError: status = None
                    text = element.text if element.text is not None else '' # Gets concatenated below
                    if text:
                        loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                        logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) )
                    else: # no text
                        loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.BBB, C, V, USFMMarker ) )
                        logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) )
                    self.addPriorityError( 100, C, V, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) )
                    if status == 'unknown': # USX exporter already knew it was a bad marker
                        pass # Just drop it completely
                    else:
                        for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                            if USFMMarker.startswith( tryMarker ): # Let's try changing it
                                if lastMarker: self.addLine( lastMarker, lastText )
                                lastMarker, lastText = tryMarker, USFMMarker[len(tryMarker):] + ' ' + text
                                loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to {!r} at beginning of line: {}").format( self.BBB, C, V, USFMMarker, tryMarker, text ) )
                                logging.warning( _("Changed '\\{}' unknown USFM Marker to {!r} after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.BBB, C, V, text ) )
                                break
                        # Otherwise, don't bother processing this line -- it'll just cause more problems later on

            else:
                logging.error( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, sublocation ) )
                self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) )

    if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
def __validateSystem( self, bookOrderTree, systemName ):
    """
    Do a semi-automatic check of the XML file validity.

    Every record in bookOrderTree whose tag equals self.mainElementTag is
        checked against the compulsory, optional and unique attribute/element
        name lists held on self, and its "id" attribute is compared with an
        ascending sequence starting at 1.
    Records with any other tag are only reported.

    All findings are reported through the logging module; nothing is returned.

    Parameters:
        bookOrderTree: iterable of XML record elements (must be truthy).
        systemName: only used to identify the system in some log messages.

    Note that int() is applied to the "id" attribute without any protection,
        so a record with a missing or non-numeric id raises from here.
    """
    assert bookOrderTree

    # One list of already-seen values for each field that must be unique
    uniqueDict = {}
    for elementName in self.uniqueElements: uniqueDict["Element_"+elementName] = []
    for attributeName in self.uniqueAttributes: uniqueDict["Attribute_"+attributeName] = []

    expectedID = 1
    for k,element in enumerate( bookOrderTree ):
        if element.tag != self.mainElementTag:
            logging.warning( _("Unexpected element: {} in record {}").format( element.tag, k ) )
            continue

        BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
        if not self.compulsoryAttributes and not self.optionalAttributes: BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
        if not self.compulsoryElements and not self.optionalElements: BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

        # Check ascending ID field
        ID = element.get( "id" )
        intID = int( ID )
        if intID != expectedID:
            logging.error( _("ID numbers out of sequence in record {} (got {} when expecting {}) for {}").format( k, intID, expectedID, systemName ) )
        expectedID += 1

        # Check that this is unique
        if element.text:
            if element.text in uniqueDict:
                logging.error( _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {}) for {}").format( element.text, element.tag, ID, k, systemName ) )
            uniqueDict[element.text] = None

        # Check compulsory attributes on this main element
        for attributeName in self.compulsoryAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is None:
                logging.error( _("Compulsory {!r} attribute is missing from {} element in record {}").format( attributeName, element.tag, k ) )
            elif not attributeValue: # present but empty -- a missing attribute is not also reported as blank
                logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

        # Check optional attributes on this main element
        for attributeName in self.optionalAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is not None and not attributeValue:
                logging.warning( _("Optional {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

        # Check for unexpected additional attributes on this main element
        for attributeName in element.keys():
            attributeValue = element.get( attributeName )
            if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {}").format( attributeName, attributeValue, element.tag, k ) )

        # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
        for attributeName in self.uniqueAttributes:
            attributeValue = element.get( attributeName )
            if attributeValue is not None:
                if attributeValue in uniqueDict["Attribute_"+attributeName]:
                    logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {}").format( attributeValue, attributeName, element.tag, k ) )
                uniqueDict["Attribute_"+attributeName].append( attributeValue )

        # Check compulsory elements
        for elementName in self.compulsoryElements:
            foundElement = element.find( elementName )
            if foundElement is None:
                logging.error( _("Compulsory {!r} element is missing in record with ID {!r} (record {})").format( elementName, ID, k ) )
            elif not foundElement.text: # only look at the text when the element really exists
                logging.warning( _("Compulsory {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

        # Check optional elements
        for elementName in self.optionalElements:
            foundElement = element.find( elementName )
            if foundElement is not None and not foundElement.text:
                logging.warning( _("Optional {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

        # Check for unexpected additional elements
        for subelement in element:
            if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                logging.warning( _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})").format( subelement.tag, subelement.text, ID, k ) )

        # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
        for elementName in self.uniqueElements:
            foundElement = element.find( elementName )
            if foundElement is not None:
                text = foundElement.text
                if text in uniqueDict["Element_"+elementName]:
                    logging.error( _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})").format( text, elementName, ID, k ) )
                uniqueDict["Element_"+elementName].append( text )
def validateEntry( self, entry ):
    """
    Check/validate the given Strongs Greek lexicon entry.

    The children of the entry are processed in document order.
    The results are collected in a dictionary which may contain:
        'word': (greek, translit, beta) from the first greek element
        'pronunciation': the strongs attribute of an early pronunciation element
        'see': list of 'G'/'H' prefixed Strongs numbers from see elements
        'Entry': the flattened remaining text, with strongsref markup rewritten
            as <StrongsRef>G123</StrongsRef> / <StrongsRef>H123</StrongsRef>
    and that dictionary is saved in self.StrongsEntries, keyed by the text
        of the strongs child element.

    Nothing is returned. Unexpected attributes and elements are logged.
    """
    if BibleOrgSysGlobals.debugFlag: assert( entry.tag == "entry" )
    BibleOrgSysGlobals.checkXMLNoText( entry, entry.tag, "na19" )
    BibleOrgSysGlobals.checkXMLNoTail( entry, entry.tag, "kaq9" )

    # Process the entry attributes first
    strongs5 = None
    for attrib,value in entry.items():
        if attrib == "strongs":
            strongs5 = value
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Validating {} entry...".format( strongs5 ) )
        else: logging.warning( "Unprocessed {!r} attribute ({}) in main entry element".format( attrib, value ) )
    if BibleOrgSysGlobals.debugFlag: assert( len(strongs5)==5 and strongs5.isdigit() )

    entryResults = {}
    entryString = ""
    gettingEssentials = True # Stays True until a second greek element is found
    for j, element in enumerate( entry ):
        #print( strongs5, j, element.tag, repr(entryString) )
        if element.tag == "strongs":
            if BibleOrgSysGlobals.debugFlag: assert( gettingEssentials and j==0 and element.text )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag, "md3d" )
            # Entry 2717 and entries 3203..3302 are exempt from the no-tail check.
            # (This used to be written as 3203 > n > 3302 which can never be true,
            #   so the check was silently skipped for every entry.)
            if strongs5 != '02717' and not (3203 <= int(strongs5) <= 3302):
                BibleOrgSysGlobals.checkXMLNoTail( element, element.tag, "f3g7" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag, "m56g" )
            strongs = element.text
            if BibleOrgSysGlobals.debugFlag: assert( strongs5.endswith( strongs ) )
            if element.tail and element.tail.strip(): entryString += element.tail.strip()
        elif element.tag == "greek":
            location = "greek in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "jke0" )
            #BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "df35" )
            # Process the attributes
            translit = greek = beta = None
            for attrib,value in element.items():
                if attrib=="translit": translit = value
                elif attrib=="unicode": greek = value
                elif attrib=="BETA": beta = value
                else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
            if BibleOrgSysGlobals.debugFlag: assert( greek and translit and beta )
            if 'word' not in entryResults: # This is the first/main entry
                if BibleOrgSysGlobals.debugFlag: assert( gettingEssentials and j==1 )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                entryResults['word'] = (greek, translit, beta)
            else: # A later greek element just becomes part of the entry text
                #print( "Have multiple greek entries in " + strongs5 )
                if BibleOrgSysGlobals.debugFlag: assert( j > 2 )
                gettingEssentials = False
                entryString += ' ' + BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ) #.replace( '\n', '' )
        elif element.tag == "pronunciation":
            location = "pronunciation in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
            # Process the attributes
            pronunciation = None
            for attrib,value in element.items():
                if attrib=="strongs": pronunciation = value
                else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
            if gettingEssentials:
                #BibleOrgSysGlobals.checkXMLNoTail( element, location, "kd02" )
                if BibleOrgSysGlobals.debugFlag:
                    assert( j == 2 )
                    assert( pronunciation )
                    assert( 'pronunciation' not in entryResults )
                entryResults['pronunciation'] = pronunciation
            else: # Only the tail text of a later pronunciation element is kept
                if BibleOrgSysGlobals.debugFlag: assert( j>2 and not gettingEssentials )
                if element.tail and element.tail.strip(): entryString += element.tail.strip().replace( '\n', '' )
        elif element.tag == "strongs_derivation":
            location = "strongs_derivation in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
            derivation = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            #print( strongs5, "derivation", repr(derivation) )
            if BibleOrgSysGlobals.debugFlag: assert( derivation and '\t' not in derivation and '\n' not in derivation )
            entryString += derivation
        elif element.tag == "strongs_def":
            location = "strongs_def in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, "jd28" )
            definition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            #print( strongs5, "definition", repr(definition) )
            if BibleOrgSysGlobals.debugFlag: assert( definition and '\t' not in definition and '\n' not in definition )
            entryString += definition
        elif element.tag == "kjv_def":
            location = "kjv_def in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
            #BibleOrgSysGlobals.checkXMLNoTail( element, location, "8s2s" )
            #BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "dvb2" )
            KJVdefinition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            #print( strongs5, "KJVdefinition", repr(KJVdefinition), repr(entryString) )
            if BibleOrgSysGlobals.debugFlag: assert( KJVdefinition and '\t' not in KJVdefinition and '\n' not in KJVdefinition )
            entryString += KJVdefinition
        elif element.tag == "strongsref":
            location = "strongsref in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "kls2" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "ks24" )
            strongsRef = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            if BibleOrgSysGlobals.debugFlag: assert( strongsRef and '\t' not in strongsRef and '\n' not in strongsRef )
            # Raw strings so that \d is a regex escape rather than an invalid string escape
            strongsRef = re.sub( r'<language="GREEK" strongs="(\d{1,5})">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
            strongsRef = re.sub( r'<strongs="(\d{1,5})" language="GREEK">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
            #print( strongs5, "strongsRef", repr(strongsRef) )
            entryString += ' ' + strongsRef
        elif element.tag == "see":
            location = "see in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, "kd02" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
            # Process the attributes
            seeLanguage = seeStrongsNumber = None
            for attrib,value in element.items():
                if attrib == "language": seeLanguage = value
                elif attrib == "strongs": seeStrongsNumber = value # Note: No leading zeroes here
                else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
            if BibleOrgSysGlobals.debugFlag:
                assert( seeLanguage and seeStrongsNumber and seeStrongsNumber.isdigit() )
                assert( seeLanguage in ('GREEK','HEBREW',) )
            entryResults.setdefault( 'see', [] ).append( ('G' if seeLanguage=='GREEK' else 'H') + seeStrongsNumber )
        else: logging.error( "2d4f Unprocessed {!r} element ({}) in entry".format( element.tag, element.text ) )

    if entryString:
        #print( strongs5, "entryString", repr(entryString) )
        if BibleOrgSysGlobals.debugFlag: assert( '\t' not in entryString and '\n' not in entryString )
        # Rewrite the empty strongsref elements (either attribute order, either language)
        entryString = re.sub( r'<strongsref language="GREEK" strongs="(\d{1,5})"></strongsref>', r'<StrongsRef>G\1</StrongsRef>', entryString )
        entryString = re.sub( r'<strongsref strongs="(\d{1,5})" language="GREEK"></strongsref>', r'<StrongsRef>G\1</StrongsRef>', entryString )
        entryString = re.sub( r'<strongsref language="HEBREW" strongs="(\d{1,5})"></strongsref>', r'<StrongsRef>H\1</StrongsRef>', entryString )
        entryString = re.sub( r'<strongsref strongs="(\d{1,5})" language="HEBREW"></strongsref>', r'<StrongsRef>H\1</StrongsRef>', entryString )
        if BibleOrgSysGlobals.debugFlag: assert( 'strongsref' not in entryString )
        entryResults['Entry'] = entryString
    #print( "entryResults", entryResults )
    # NOTE(review): strongs is only bound by a strongs child element,
    #   so an entry without one raises a NameError here -- confirm that is acceptable
    self.StrongsEntries[strongs] = entryResults
def importDataToPython(self):
    """
    Loads (and pivots) the data (not including the header) into suitable
    Python containers to use in a Python program.
    (Of course, you can just use the elementTree in self._XMLtree if you prefer.)

    Returns a 2-tuple:
        the list of (sourceReference, sourceComponent, parsedSourceReference,
            actualLinksList) entries, and
        a dictionary which maps each included verse key to a list of such
            entries -- the forward links plus one reversed entry per target
            verse.

    The result is cached on the object, so a second call returns the
    containers made by the first call without parsing again.

    Raises TypeError if a reference that is not marked as a single 'Verse'
    is nevertheless accepted by SimpleVerseKey, and ValueError for a link
    type which has no reverse link type.
    """
    assert (self._XMLtree)
    if self.__DataList:  # We've already done an import/restructuring -- no need to repeat it
        return self.__DataList, self.__DataDict

    validComponents = ('Section', 'Verses', 'Verse',)
    reverseLinkTypes = {
        'TSK': 'TSKQuoted',
        'QuotedOTReference': 'OTReferenceQuoted',
        'AlludedOTReference': 'OTReferenceAlluded',
        'PossibleOTReference': 'OTReferencePossible',
    }

    def checkReference(component, reference):
        """
        Only a 'Verse' component should be acceptable to SimpleVerseKey.
        """
        if component == 'Verse':
            SimpleVerseKey(reference)
            return
        try:
            SimpleVerseKey(reference, ignoreParseErrors=True)
        except TypeError:
            return  # This should happen coz it should fail the SVK
        logging.error("{} {!r} failed!".format(component, reference))
        raise TypeError
    # end of checkReference

    # Step 1: Extract the raw strings from the XML
    rawRefLinkList = []
    actualLinkCount = 0
    for element in self._XMLtree:
        # Get these first for helpful error messages
        sourceReference = element.find('sourceReference').text
        sourceComponent = element.find('sourceComponent').text
        assert (sourceComponent in validComponents)

        BibleOrgSysGlobals.checkXMLNoText(element, sourceReference, 'kls1')
        BibleOrgSysGlobals.checkXMLNoAttributes(element, sourceReference, 'kd21')
        BibleOrgSysGlobals.checkXMLNoTail(element, sourceReference, 'so20')

        actualRawLinksList = []
        for subelement in element:
            if subelement.tag in ('sourceReference', 'sourceComponent',):  # already processed these
                BibleOrgSysGlobals.checkXMLNoAttributes(subelement, sourceReference, 'ls12')
                BibleOrgSysGlobals.checkXMLNoSubelements(subelement, sourceReference, 'ks02')
                BibleOrgSysGlobals.checkXMLNoTail(subelement, sourceReference, 'sqw1')
            elif subelement.tag == 'BibleReferenceLink':
                BibleOrgSysGlobals.checkXMLNoText(subelement, sourceReference, 'haw9')
                BibleOrgSysGlobals.checkXMLNoAttributes(subelement, sourceReference, 'hs19')
                BibleOrgSysGlobals.checkXMLNoTail(subelement, sourceReference, 'jsd9')
                targetReference = subelement.find('targetReference').text
                targetComponent = subelement.find('targetComponent').text
                assert (targetComponent in validComponents)
                linkType = subelement.find('linkType').text
                assert (linkType in reverseLinkTypes)
                actualRawLinksList.append((targetReference, targetComponent, linkType,))
                actualLinkCount += 1
        rawRefLinkList.append((sourceReference, sourceComponent, actualRawLinksList,))

    if BibleOrgSysGlobals.verbosityLevel > 1:
        print(" {} raw links loaded (with {} actual raw link entries)".format(
            len(rawRefLinkList), actualLinkCount))

    # Step 2: Parse the reference strings
    myRefLinkList = []
    actualLinkCount = 0
    BOS = BibleOrganizationalSystem("GENERIC-KJV-66-ENG")  # Not used below but still created as before
    for j, (sourceReference, sourceComponent, actualRawLinksList) in enumerate(rawRefLinkList):
        checkReference(sourceComponent, sourceReference)  # Just do some testing first
        parsedSourceReference = FlexibleVersesKey(sourceReference)
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print(j, sourceComponent, sourceReference, parsedSourceReference)

        actualLinksList = []
        for targetReference, targetComponent, linkType in actualRawLinksList:
            checkReference(targetComponent, targetReference)  # Just do some testing first
            try:
                parsedTargetReference = FlexibleVersesKey(targetReference)
            except TypeError:
                print(" Temporarily ignored {!r} (TypeError from FlexibleVersesKey)"
                      .format(targetReference))
                parsedTargetReference = None
            if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                print(' ', targetComponent, targetReference, parsedTargetReference)
            actualLinksList.append((targetReference, targetComponent,
                                    parsedTargetReference, linkType,))
            actualLinkCount += 1
        myRefLinkList.append((sourceReference, sourceComponent,
                              parsedSourceReference, actualLinksList,))

    if BibleOrgSysGlobals.verbosityLevel > 1:
        print(" {} links processed (with {} actual link entries)".format(
            len(rawRefLinkList), actualLinkCount))
    self.__DataList = myRefLinkList

    # Now put it into my dictionaries for easy access
    # This part should be customized or added to for however you need to process the data

    # Step 3: Create a link dictionary (by verse key)
    myRefLinkDict = {}
    for sourceReference, sourceComponent, parsedSourceReference, actualLinksList in myRefLinkList:
        for verseRef in parsedSourceReference.getIncludedVerses():
            assert (isinstance(verseRef, SimpleVerseKey))
            myRefLinkDict.setdefault(verseRef, []).append(
                (sourceReference, sourceComponent, parsedSourceReference, actualLinksList,))
    originalLinks = len(myRefLinkDict)
    print(" {} verse links added to dictionary (includes filling out spans)"
          .format(originalLinks))

    # Step 4: Add the reversed links to the same dictionary (by verse key)
    for sourceReference, sourceComponent, parsedSourceReference, actualLinksList in myRefLinkList:
        for targetReference, targetComponent, parsedTargetReference, linkType in actualLinksList:
            if parsedTargetReference is None:
                continue  # This target couldn't be parsed above
            for verseRef in parsedTargetReference.getIncludedVerses():
                assert (isinstance(verseRef, SimpleVerseKey))
                if linkType not in reverseLinkTypes:  # Have a new linkType!
                    raise ValueError(
                        "No reverse link type is known for linkType {!r}".format(linkType))
                reverseLinkType = reverseLinkTypes[linkType]
                myRefLinkDict.setdefault(verseRef, []).append(
                    (targetReference, targetComponent, parsedTargetReference,
                     [(sourceReference, sourceComponent, parsedSourceReference, reverseLinkType)]))
    totalLinks = len(myRefLinkDict)
    reverseLinks = totalLinks - originalLinks
    print(" {} reverse links added to dictionary to give {} total".format(
        reverseLinks, totalLinks))
    self.__DataDict = myRefLinkDict

    # Let's find the most number of references for a verse
    mostReferences = totalReferences = 0
    mostVerseRef = None  # Stays None if the dictionary is empty
    for verseRef, entryList in myRefLinkDict.items():
        numRefs = len(entryList)
        if numRefs > mostReferences:
            mostReferences, mostVerseRef = numRefs, verseRef
        totalReferences += numRefs
    mostVerseText = None if mostVerseRef is None else mostVerseRef.getShortText()
    print(" {} maximum links for any one reference ({})".format(
        mostReferences, mostVerseText))
    print(" {} total links for all references".format(totalReferences))

    return self.__DataList, self.__DataDict
def __validateSystem(self, bookOrderTree, systemName):
    """
    Do a semi-automatic check of the XML file validity.

    Each record of bookOrderTree with the tag self.mainElementTag has its
    "id" attribute compared with an ascending sequence starting at 1, and
    its attributes and subelements compared with the compulsory, optional
    and unique name lists held on self. Records with another tag are only
    reported as unexpected.

    Problems are reported with the logging module; there is no return value.

    Parameters:
        bookOrderTree: iterable of XML record elements (must be truthy).
        systemName: only used to identify the system in some log messages.

    The "id" attribute is passed straight to int(), so a record whose id
    is missing or not numeric raises from here.
    """
    assert bookOrderTree

    # Values seen so far, for every field that is required to be unique
    uniqueDict = {}
    for elementName in self.uniqueElements:
        uniqueDict["Element_" + elementName] = []
    for attributeName in self.uniqueAttributes:
        uniqueDict["Attribute_" + attributeName] = []

    expectedID = 1
    for k, element in enumerate(bookOrderTree):
        if element.tag == self.mainElementTag:
            BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
            if not self.compulsoryAttributes and not self.optionalAttributes:
                BibleOrgSysGlobals.checkXMLNoAttributes(element, element.tag)
            if not self.compulsoryElements and not self.optionalElements:
                BibleOrgSysGlobals.checkXMLNoSubelements(element, element.tag)

            # Check ascending ID field
            ID = element.get("id")
            intID = int(ID)
            if intID != expectedID:
                logging.error(
                    _("ID numbers out of sequence in record {} (got {} when expecting {}) for {}"
                      ).format(k, intID, expectedID, systemName))
            expectedID += 1

            # Check that this is unique
            if element.text:
                if element.text in uniqueDict:
                    logging.error(
                        _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {}) for {}"
                          ).format(element.text, element.tag, ID, k, systemName))
                uniqueDict[element.text] = None

            # Check compulsory attributes on this main element
            for attributeName in self.compulsoryAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is None:
                    logging.error(
                        _("Compulsory {!r} attribute is missing from {} element in record {}"
                          ).format(attributeName, element.tag, k))
                elif not attributeValue:
                    # Only an attribute which is present can be blank
                    logging.warning(
                        _("Compulsory {!r} attribute is blank on {} element in record {}"
                          ).format(attributeName, element.tag, k))

            # Check optional attributes on this main element
            for attributeName in self.optionalAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None and not attributeValue:
                    logging.warning(
                        _("Optional {!r} attribute is blank on {} element in record {}"
                          ).format(attributeName, element.tag, k))

            # Check for unexpected additional attributes on this main element
            for attributeName in element.keys():
                attributeValue = element.get(attributeName)
                if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                    logging.warning(
                        _("Additional {!r} attribute ({!r}) found on {} element in record {}"
                          ).format(attributeName, attributeValue, element.tag, k))

            # Check the attributes that must contain unique information
            # (in that particular field -- doesn't check across different attributes)
            for attributeName in self.uniqueAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None:
                    seenValues = uniqueDict["Attribute_" + attributeName]
                    if attributeValue in seenValues:
                        logging.error(
                            _("Found {!r} data repeated in {!r} field on {} element in record {}"
                              ).format(attributeValue, attributeName, element.tag, k))
                    seenValues.append(attributeValue)

            # Check compulsory elements
            for elementName in self.compulsoryElements:
                child = element.find(elementName)
                if child is None:
                    logging.error(
                        _("Compulsory {!r} element is missing in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))
                elif not child.text:
                    # The text can only be inspected when the element was found
                    logging.warning(
                        _("Compulsory {!r} element is blank in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))

            # Check optional elements
            for elementName in self.optionalElements:
                child = element.find(elementName)
                if child is not None and not child.text:
                    logging.warning(
                        _("Optional {!r} element is blank in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))

            # Check for unexpected additional elements
            for subelement in element:
                if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                    logging.warning(
                        _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})"
                          ).format(subelement.tag, subelement.text, ID, k))

            # Check the elements that must contain unique information
            # (in that particular element -- doesn't check across different elements)
            for elementName in self.uniqueElements:
                child = element.find(elementName)
                if child is not None:
                    text = child.text
                    seenTexts = uniqueDict["Element_" + elementName]
                    if text in seenTexts:
                        logging.error(
                            _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})"
                              ).format(text, elementName, ID, k))
                    seenTexts.append(text)
        else:
            logging.warning(
                _("Unexpected element: {} in record {}").format(
                    element.tag, k))
def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
    """
    Check/validate and extract verse data from the given XML book record
        finding and saving verse elements.

    The verse text is built up as a USFM string:
        notes become \\f ... \\f* footnotes,
        styled portions are wrapped in a character marker chosen from
            their fs attribute (italic, super or emphasis), and
        a line break writes out the text collected so far and then starts
            an 'm' line holding the tail text of the break.
    The text is saved into thisBook as a 'v' line.
    Text which is collected after that 'v' line has already been written
        (i.e., after a line break) is appended to the last line instead.

    Nothing is returned.
    Raises ValueError for a style with an unrecognised fs attribute.

    NOTE(review): this assumes that thisBook has an appendToLastLine( text )
        method like the book object used by the table loader -- please confirm.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") )

    location = "verse in {} {}".format( BBB, chapterNumber )
    BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

    # Handle verse attributes
    verseNumber = None
    for attrib,value in verse.items():
        if attrib=="vnumber":
            verseNumber = value
        else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
    if BibleOrgSysGlobals.debugFlag: assert verseNumber
    location = "{}:{}".format( location, verseNumber ) # Get a better location description

    styleMarkers = { 'italic':'\\it', 'super':'\\bdit', 'emphasis':'\\em' }
    verseLineSaved = False # Set once the 'v' line has been given to thisBook

    def saveVerseText( text ):
        """ Save the text as the 'v' line, or append it if that line was already saved. """
        nonlocal verseLineSaved
        if verseLineSaved: # Can't start a second 'v' line for the same verse
            thisBook.appendToLastLine( text )
        else:
            thisBook.addLine( 'v', verseNumber + ' ' + text )
            verseLineSaved = True
    # end of saveVerseText

    def getStyledText( styleElement, styleLocation, description ):
        """ Returns the USFM for one style element, i.e., its marked-up text plus its stripped tail. """
        fs = None
        for attrib,value in styleElement.items():
            if attrib=='fs': fs = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in {}".format( attrib, value, description ) )
        if BibleOrgSysGlobals.debugFlag: assert fs
        if fs not in styleMarkers:
            raise ValueError( "Unknown fs style {!r} in {}".format( fs, styleLocation ) )
        SFM = styleMarkers[fs]
        sText = (styleElement.text or '').strip() # A style element might have no text at all
        if BibleOrgSysGlobals.debugFlag: assert sText
        styledText = SFM+' ' + sText + SFM+'*'
        if styleElement.tail: styledText += styleElement.tail.strip()
        return styledText
    # end of getStyledText

    vText = '' if verse.text is None else verse.text.strip()
    # NB: vText is empty here if a verse starts immediately with a style or note

    # Handle verse subelements (notes and styled portions)
    for subelement in verse:
        if subelement.tag == HaggaiXMLBible.noteTag:
            sublocation = "note in " + location
            noteType = None
            for attrib,value in subelement.items():
                if attrib=="type": noteType = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
            if noteType and noteType not in ('variant',):
                logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) )
            nText, nTail = subelement.text, subelement.tail
            if noteType: vText += "\\f + \\fk {} \\ft {}\\f*".format( noteType, nText )
            else: vText += "\\f + \\ft {}\\f*".format( nText )
            if nTail:
                if '\n' in nTail:
                    print( "HaggaiXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) )
                    nTail = nTail.replace( '\n', ' ' )
                vText += nTail
            for subsubelement in subelement: # Styled portions inside the note
                if subsubelement.tag == HaggaiXMLBible.styleTag:
                    subsublocation = "style in " + sublocation
                    BibleOrgSysGlobals.checkXMLNoSubelements( subsubelement, subsublocation, 'fyt4' )
                    vText += getStyledText( subsubelement, subsublocation, "style subsubelement" )
                else: logging.error( "Expected to find {} but got {!r} in {}".format( HaggaiXMLBible.styleTag, subsubelement.tag, sublocation ) )
        elif subelement.tag == HaggaiXMLBible.styleTag:
            sublocation = "style in " + location
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'f5gh' )
            vText += getStyledText( subelement, sublocation, "style subelement" )
        elif subelement.tag == HaggaiXMLBible.breakTag:
            sublocation = "line break in " + location
            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' )
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' )
            art = None
            for attrib,value in subelement.items():
                if attrib=="art": art = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
            if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl'
            if vText: # Write out what we have so far, before the new paragraph
                saveVerseText( vText )
                vText = ''
            thisBook.addLine( 'm', subelement.tail.strip() if subelement.tail else '' )
        else: logging.error( "Expected to find NOTE or STYLE but got {!r} in {}".format( subelement.tag, location ) )

    if vText: # This is the main text of the verse (follows the verse milestone)
        if '\n' in vText:
            print( "HaggaiXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
            vText = vText.replace( '\n', ' ' )
        saveVerseText( vText )
def load(self):
    """
    Load a single source XML file and load book elements.

    Parses self.sourceFilepath into self.XMLTree and resets
    self.suppliedMetadata['VerseView'] to an empty dict.  If the root tag
    is VerseViewXMLBible.treeTag, the text of the revision, title, font and
    copyright elements is stored in that dict, and every book element is
    passed (with a running count starting at 1) to
    __validateAndExtractBook.  Any other child element, or an unexpected
    root tag, is logged as an error.

    Finishes by calling applySuppliedMetadata('VerseView') and
    doPostLoadProcessing().
    """
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print(_("Loading {}…").format(self.sourceFilepath))
    self.XMLTree = ElementTree().parse(self.sourceFilepath)
    if BibleOrgSysGlobals.debugFlag:
        assert len(self.XMLTree)  # Fail here if we didn't load anything at all

    if self.suppliedMetadata is None:
        self.suppliedMetadata = {}
    self.suppliedMetadata['VerseView'] = {}

    # Header elements that carry only text all get the same three checks.
    #   tag -> (label used in the location string,
    #           metadata key to store the text under, or None if not stored)
    textOnlyFields = {
        VerseViewXMLBible.filenameTag: ('filename', None),
        VerseViewXMLBible.revisionTag: ('revision', 'Revision'),
        VerseViewXMLBible.titleTag: ('title', 'Title'),
        VerseViewXMLBible.fontTag: ('font', 'Font'),
        VerseViewXMLBible.copyrightTag: ('copyright', 'Copyright'),
        VerseViewXMLBible.sizefactorTag: ('sizefactor', None),
    }

    # Find the main (bible) container
    if self.XMLTree.tag == VerseViewXMLBible.treeTag:
        location = "VerseView XML file"
        BibleOrgSysGlobals.checkXMLNoText(self.XMLTree, location, '4f6h')
        BibleOrgSysGlobals.checkXMLNoAttributes(self.XMLTree, location, 'js24')
        BibleOrgSysGlobals.checkXMLNoTail(self.XMLTree, location, '1wk8')

        # Find the submain (various info and then book) containers
        bookNumber = 0
        for element in self.XMLTree:
            if element.tag in textOnlyFields:
                label, metadataKey = textOnlyFields[element.tag]
                sublocation = label + " in " + location
                BibleOrgSysGlobals.checkXMLNoAttributes(element, sublocation, 'jk86')
                BibleOrgSysGlobals.checkXMLNoSubelements(element, sublocation, 'hjk7')
                BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'bh09')
                if metadataKey is not None:
                    self.suppliedMetadata['VerseView'][metadataKey] = element.text
                elif element.tag == VerseViewXMLBible.sizefactorTag:
                    # The size factor is not stored, only sanity-checked when debugging
                    if BibleOrgSysGlobals.debugFlag:
                        assert element.text == '1'
            elif element.tag == VerseViewXMLBible.bookTag:
                sublocation = "book in " + location
                BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'g3g5')
                BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'd3f6')
                bookNumber += 1
                self.__validateAndExtractBook(element, bookNumber)
            else:
                logging.error("xk15 Expected to find {!r} but got {!r}".format(
                    VerseViewXMLBible.bookTag, element.tag))
    else:
        logging.error("Expected to load {!r} but got {!r}".format(
            VerseViewXMLBible.treeTag, self.XMLTree.tag))

    if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
        # Use .get() here: if the root tag was wrong or a header element is
        #   absent, a purely diagnostic print must not abort the load with
        #   a KeyError that only shows up in debug/verbose mode.
        metadata = self.suppliedMetadata['VerseView']
        print("Revision is {!r}".format(metadata.get('Revision')))
        print("Title is {!r}".format(metadata.get('Title')))
        print("Font is {!r}".format(metadata.get('Font')))
        print("Copyright is {!r}".format(metadata.get('Copyright')))

    self.applySuppliedMetadata('VerseView')  # Copy some to self.settingsDict
    self.doPostLoadProcessing()
def loadParagraph( paragraphXML, paragraphlocation ): """ Load a paragraph from the USX XML. In this context, paragraph means heading and intro lines, as well as paragraphs of verses. Uses (and updates) C,V information from the containing function. """ nonlocal C, V # Process the attributes first paragraphStyle = None for attrib,value in paragraphXML.items(): if attrib=='style': paragraphStyle = value # This is basically the USFM marker name else: logging.warning( _("CH46 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) # Now process the paragraph text (or write a paragraph marker anyway) paragraphText = paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' if version is None: paragraphText = paragraphText.rstrip() # Don't need to strip extra spaces in v2 self.addLine( paragraphStyle, paragraphText ) # Now process the paragraph subelements for element in paragraphXML: location = element.tag + ' ' + paragraphlocation #print( "USXXMLBibleBook.load", C, V, element.tag, location ) if element.tag == 'verse': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) # Process the attributes first verseStyle = altNumber = None for attrib,value in element.items(): if attrib=='number': V = value elif attrib=='style': verseStyle = value elif attrib=='altnumber': altNumber = value else: logging.error( _("KR60 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if verseStyle != 'v': logging.error( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) ) #if altNumber: print( repr(verseStyle), repr(altNumber) ); halt altStuff = ' \\va {}\\va*'.format( altNumber ) if altNumber else '' self.addLine( verseStyle, V + altStuff + ' ' ) # Now process the tail (if there's one) which is the verse text if element.tail: vText = element.tail if vText[0]=='\n': vText = vText.lstrip() # Paratext puts cross references 
on a new line if vText: #print( repr(vText) ) self.appendToLastLine( vText ) elif element.tag == 'char': # Process the attributes first charStyle = None for attrib,value in element.items(): if attrib=='style': charStyle = value # This is basically the USFM character marker name #print( " charStyle", charStyle ) assert not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle ) else: logging.error( _("QU52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) charLine = "\\{} {} ".format( charStyle, element.text ) # Now process the subelements -- chars are one of the few multiply embedded fields in USX for subelement in element: sublocation = subelement.tag + ' ' + location #print( '{} {}:{} {}'.format( self.BBB, C, V, element.tag ) ) if subelement.tag == 'char': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation ) # Process the attributes first subCharStyle, charClosed = None, True for attrib,value in subelement.items(): if attrib=='style': subCharStyle = value elif attrib=='closed': assert value=='false' charClosed = False else: logging.error( _("KS41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) charLine += "\\{} {}".format( subCharStyle, subelement.text ) if charClosed: charLine += "\\{}*".format( subCharStyle ) #if subelement.tail is not None: print( " tail1", repr(subelement.tail) ) charLine += '' if subelement.tail is None else subelement.tail else: logging.error( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) ) self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) ) # A character field must be added to the previous field #if element.tail is not None: print( " tail2", repr(element.tail) ) charTail = '' if element.tail: charTail = element.tail if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts footnote parts on new lines charLine += "\\{}*{}".format( 
charStyle, charTail ) #if debuggingThisModule: print( "USX.loadParagraph:", C, V, paragraphStyle, charStyle, repr(charLine) ) self.appendToLastLine( charLine ) elif element.tag == 'note': #print( "NOTE", BibleOrgSysGlobals.elementStr( element ) ) # Process the attributes first noteStyle = noteCaller = None for attrib,value in element.items(): if attrib=='style': noteStyle = value # This is basically the USFM marker name assert noteStyle in ('x','f',) elif attrib=='caller': noteCaller = value # Usually hyphen or a symbol to be used for the note else: logging.error( _("CY38 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if noteCaller=='' and self.BBB=='NUM' and C=='10' and V=='36': noteCaller = '+' # Hack assert noteStyle and noteCaller # both compulsory noteLine = "\\{} {} ".format( noteStyle, noteCaller ) if element.text: noteText = element.text.strip() noteLine += noteText # Now process the subelements -- notes are one of the few multiply embedded fields in USX for subelement in element: sublocation = subelement.tag + ' ' + location #print( C, V, subelement.tag ) if subelement.tag == 'char': # milestone (not a container) # Process the attributes first charStyle, charClosed = None, True for attrib,value in subelement.items(): if attrib=='style': charStyle = value elif attrib=='closed': assert value=='false' charClosed = False else: logging.warning( _("GJ67 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) noteLine += "\\{} {}".format( charStyle, subelement.text ) # Now process the subelements -- notes are one of the few multiply embedded fields in USX for sub2element in subelement: sub2location = sub2element.tag + ' ' + sublocation #print( C, V, sub2element.tag ) if sub2element.tag == 'char': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location ) # Process the attributes first char2Style, char2Closed = None, True for attrib,value in sub2element.items(): if 
attrib=='style': char2Style = value elif attrib=='closed': assert value=='false' char2Closed = False else: logging.warning( _("VH36 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) assert char2Closed noteLine += "\\{} {}\\{}*{}".format( char2Style, sub2element.text, char2Style, sub2element.tail if sub2element.tail else '' ) if charClosed: noteLine += "\\{}*".format( charStyle ) if subelement.tail: charTail = subelement.tail if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts cross reference parts on a new line noteLine += charTail elif subelement.tag == 'unmatched': # Used to denote errors in the source text BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation ) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation ) # Process the attributes first unmmatchedMarker = None for attrib,value in subelement.items(): if attrib=='marker': unmmatchedMarker = value else: logging.warning( _("NV21 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) self.addPriorityError( 2, C, V, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) ) else: logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) ) self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) ) if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail #noteLine += "\\{}*".format( charStyle ) noteLine += "\\{}*".format( noteStyle ) if element.tail: #if '\n' in element.tail: halt noteTail = element.tail if noteTail[0]=='\n': noteTail = noteTail.lstrip() # Paratext puts multiple cross-references on new lines noteLine += noteTail #print( "NoteLine", repr(noteLine) ) self.appendToLastLine( noteLine ) elif element.tag == 'link': # Used to include extra resources BibleOrgSysGlobals.checkXMLNoText( element, location ) 
BibleOrgSysGlobals.checkXMLNoTail( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) # Process the attributes first linkStyle = linkDisplay = linkTarget = None for attrib,value in element.items(): if attrib=='style': linkStyle = value assert linkStyle in ('jmp',) elif attrib=='display': linkDisplay = value # e.g., "click here" elif attrib=='target': linkTarget = value # e.g., some reference else: logging.warning( _("KW54 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.addPriorityError( 3, C, V, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) ) elif element.tag == 'unmatched': # Used to denote errors in the source text BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoTail( element, location ) BibleOrgSysGlobals.checkXMLNoAttributes( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) self.addPriorityError( 2, C, V, _("Unmatched element in {}").format( location) ) else: logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, location ) ) self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) ) for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] ) if BibleOrgSysGlobals.debugFlag: halt
def __validateAndExtractVerse(self, BBB, chapterNumber, thisBook, verse):
    """
    Check/validate and extract verse data from the given XML book record
        finding and saving verse elements.

    The verse number is read from the 'n' attribute of the verse element
    (any other attribute is logged as unprocessed).  If the verse has text,
    it is stripped, newlines are replaced by spaces, and it is added to
    thisBook as a 'v' line of the form "<verseNumber> <text>".

    Parameters:
        BBB: book code, used in location and diagnostic messages
        chapterNumber: chapter number, used in location and diagnostic messages
        thisBook: object whose addLine( marker, text ) receives the verse
        verse: the verse XML element
    """
    if BibleOrgSysGlobals.debugFlag and debuggingThisModule and BibleOrgSysGlobals.verbosityLevel > 3:
        print(_("Validating XML verse…"))

    location = "verse in {} {}".format(BBB, chapterNumber)
    # Only the verse's own text is extracted below -- subelements and tail
    #   are handed to the BibleOrgSysGlobals checks.
    BibleOrgSysGlobals.checkXMLNoSubelements(verse, location, 'sg20')
    BibleOrgSysGlobals.checkXMLNoTail(verse, location, 'l5ks')

    # Handle verse attributes
    verseNumber = None
    for attrib, value in verse.items():
        if attrib == "n":
            verseNumber = value
        else:
            logging.warning(
                "Unprocessed {!r} attribute ({}) in verse element".format(
                    attrib, value))
    if BibleOrgSysGlobals.debugFlag:
        assert verseNumber
    location = "{}:{}".format(location, verseNumber)  # Get a better location description

    vText = '' if verse.text is None else verse.text
    if vText:
        vText = vText.strip()

    if vText:  # This is the main text of the verse
        if verseNumber is None:
            # Without a number the 'v' line can't be built (the string
            #   concatenation below would raise TypeError), so report it
            #   and leave this verse out.
            logging.error(
                "Missing 'n' attribute in {} -- ignoring verse text {!r}".format(
                    location, vText))
            return
        if '\n' in vText:
            print(
                "VerseViewXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}"
                .format(BBB, chapterNumber, verseNumber, vText))
            vText = vText.replace('\n', ' ')
        thisBook.addLine('v', verseNumber + ' ' + vText)
def __validate(self):
    """
    Check/validate the loaded data.

    Walks the children of self._XMLtree.  For each child whose tag is
    self._mainElementTag it checks:
        * compulsory attributes/elements are present and not blank
        * optional attributes/elements are not blank when present
        * there are no additional (unlisted) attributes or elements
        * values listed in self._uniqueAttributes / self._uniqueElements
            are not repeated in another record (checked per field only,
            not across different fields)
    Children with any other tag, and any non-blank tail text, are reported.

    Problems are only reported through the logging module; nothing is
    returned.  The text of each record's sourceComponent element is used
    as the record ID in messages.
    """
    # Use explicit checks rather than the truth value of an Element (which
    #   is deprecated) -- we need a loaded tree with at least one child.
    assert self._XMLtree is not None and len(self._XMLtree)

    uniqueDict = {}
    for elementName in self._uniqueElements:
        uniqueDict["Element_" + elementName] = []
    for attributeName in self._uniqueAttributes:
        uniqueDict["Attribute_" + attributeName] = []

    for j, element in enumerate(self._XMLtree):
        if element.tag == self._mainElementTag:
            BibleOrgSysGlobals.checkXMLNoText(element, element.tag)
            BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
            if not self._compulsoryAttributes and not self._optionalAttributes:
                BibleOrgSysGlobals.checkXMLNoAttributes(element, element.tag)
            if not self._compulsoryElements and not self._optionalElements:
                BibleOrgSysGlobals.checkXMLNoSubelements(element, element.tag)

            # Check compulsory attributes on this main element
            for attributeName in self._compulsoryAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is None:
                    logging.error(
                        _("Compulsory {!r} attribute is missing from {} element in record {}"
                          ).format(attributeName, element.tag, j))
                elif not attributeValue:
                    # elif: a missing attribute shouldn't also be reported as blank
                    logging.warning(
                        _("Compulsory {!r} attribute is blank on {} element in record {}"
                          ).format(attributeName, element.tag, j))

            # Check optional attributes on this main element
            for attributeName in self._optionalAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None:
                    if not attributeValue:
                        logging.warning(
                            _("Optional {!r} attribute is blank on {} element in record {}"
                              ).format(attributeName, element.tag, j))

            # Check for unexpected additional attributes on this main element
            for attributeName in element.keys():
                attributeValue = element.get(attributeName)
                if attributeName not in self._compulsoryAttributes and attributeName not in self._optionalAttributes:
                    logging.warning(
                        _("Additional {!r} attribute ({!r}) found on {} element in record {}"
                          ).format(attributeName, attributeValue, element.tag, j))

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self._uniqueAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None:
                    if attributeValue in uniqueDict["Attribute_" + attributeName]:
                        logging.error(
                            _("Found {!r} data repeated in {!r} field on {} element in record {}"
                              ).format(attributeValue, attributeName, element.tag, j))
                    uniqueDict["Attribute_" + attributeName].append(attributeValue)

            # Get the sourceComponent to use as a record ID
            #   (a validator mustn't crash on a record that lacks it,
            #       so fall back to None which then shows in the messages)
            sourceComponent = element.find("sourceComponent")
            ID = None if sourceComponent is None else sourceComponent.text

            # Check compulsory elements
            for elementName in self._compulsoryElements:
                foundElement = element.find(elementName)
                if foundElement is None:
                    logging.error(
                        _("Compulsory {!r} element is missing in record with ID {!r} (record {})"
                          ).format(elementName, ID, j))
                else:
                    BibleOrgSysGlobals.checkXMLNoTail(
                        foundElement, foundElement.tag + " in " + element.tag)
                    BibleOrgSysGlobals.checkXMLNoAttributes(
                        foundElement, foundElement.tag + " in " + element.tag)
                    if not foundElement.text:
                        logging.warning(
                            _("Compulsory {!r} element is blank in record with ID {!r} (record {})"
                              ).format(elementName, ID, j))

            # Check optional elements
            for elementName in self._optionalElements:
                foundElement = element.find(elementName)
                if foundElement is not None:
                    BibleOrgSysGlobals.checkXMLNoTail(
                        foundElement, foundElement.tag + " in " + element.tag)
                    BibleOrgSysGlobals.checkXMLNoAttributes(
                        foundElement, foundElement.tag + " in " + element.tag)
                    BibleOrgSysGlobals.checkXMLNoSubelements(
                        foundElement, foundElement.tag + " in " + element.tag)
                    if not foundElement.text:
                        logging.warning(
                            _("Optional {!r} element is blank in record with ID {!r} (record {})"
                              ).format(elementName, ID, j))

            # Check for unexpected additional elements
            for subelement in element:
                if subelement.tag not in self._compulsoryElements and subelement.tag not in self._optionalElements:
                    logging.warning(
                        _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})"
                          ).format(subelement.tag, subelement.text, ID, j))

            # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
            for elementName in self._uniqueElements:
                uniqueElement = element.find(elementName)
                if uniqueElement is not None:
                    text = uniqueElement.text
                    if text in uniqueDict["Element_" + elementName]:
                        logging.error(
                            _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})"
                              ).format(text, elementName, ID, j))
                    uniqueDict["Element_" + elementName].append(text)
        else:
            logging.warning(
                _("Unexpected element: {} in record {}").format(
                    element.tag, j))
        if element.tail is not None and element.tail.strip():
            logging.error(
                _("Unexpected {!r} tail data after {} element in record {}"
                  ).format(element.tail, element.tag, j))
    if self._XMLtree.tail is not None and self._XMLtree.tail.strip():
        logging.error(
            _("Unexpected {!r} tail data after {} element").format(
                self._XMLtree.tail, self._XMLtree.tag))
def load( self ):
    """
    Load the XML data file -- we should already know the filepath.

    Parses self.sourceFilepath into self.tree (logging a critical error and
        returning early if the XML can't be parsed).  If the root element
        is 'usfx', the schema location attribute and languageCode element
        are saved and each book element is passed to self.loadBook.

    If no books were loaded, the files in self.sourceFolder are scanned:
        any file whose first line starts with a backslash id marker is
        loaded as a USFXXMLBibleBook and the book name dictionaries are
        updated from its assumed book names.

    Finishes by calling self.doPostLoadProcessing().
    """
    if BibleOrgSysGlobals.verbosityLevel > 1: print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) )

    try: self.tree = ElementTree().parse( self.sourceFilepath )
    except ParseError as err:
        logging.critical( "USFXXMLBible.load: failed loading the xml file {}: {!r}.".format( self.sourceFilepath, err ) )
        return
    if BibleOrgSysGlobals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

    # Find the main (usfx) container
    if self.tree.tag == 'usfx':
        location = "USFX file"
        BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
        BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

        # Process the attributes first
        self.schemaLocation = None
        for attrib,value in self.tree.items():
            if attrib.endswith("SchemaLocation"):
                self.schemaLocation = value
            else:
                logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) )

        BBB = C = V = None # Only used in the warning message below
        for element in self.tree:
            sublocation = element.tag + " " + location
            if element.tag == 'languageCode':
                self.languageCode = element.text
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'cff3' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'des1' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'dwf2' )
            elif element.tag == 'book':
                self.loadBook( element )
            else:
                logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) )
    else: # Don't fail silently if this isn't a USFX file
        logging.error( "USFXXMLBible.load: expected to load {!r} but got {!r}".format( 'usfx', self.tree.tag ) )

    if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print( "USFXXMLBible.load: Didn't find any regularly named USFX files in {!r}".format( self.sourceFolder ) )
        # The list of candidate files was never built in this method (the name
        #   was undefined so this fallback raised NameError) -- build it here.
        foundFiles = []
        if self.sourceFolder and os.path.isdir( self.sourceFolder ):
            foundFiles = sorted( something for something in os.listdir( self.sourceFolder )
                                    if os.path.isfile( os.path.join( self.sourceFolder, something ) ) )
        for thisFilename in foundFiles:
            # Look for BBB in the ID line (which should be the first line in a USFX file)
            isUSFX = False
            thisPath = os.path.join( self.sourceFolder, thisFilename )
            try:
                with open( thisPath ) as possibleUSXFile: # Automatically closes the file when done
                    for line in possibleUSXFile:
                        if line.startswith( '\\id ' ):
                            USXId = line[4:].strip()[:3] # Take the first three non-blank characters after the space after id
                            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Have possible USFX ID {!r}".format( USXId ) )
                            BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( USXId )
                            if BibleOrgSysGlobals.verbosityLevel > 2: print( "BBB is {!r}".format( BBB ) )
                            isUSFX = True
                        break # We only look at the first line
            except (OSError, UnicodeDecodeError) as err:
                # We're scanning arbitrary files here so skip anything that can't be read as text
                logging.warning( "USFXXMLBible.load: unable to check {}: {!r}".format( thisPath, err ) )
                continue
            if isUSFX:
                UBB = USFXXMLBibleBook( self, BBB )
                UBB.load( self.sourceFolder, thisFilename, self.encoding )
                UBB.validateMarkers()
                print( UBB )
                self.books[BBB] = UBB
                # Make up our book name dictionaries while we're at it
                assumedBookNames = UBB.getAssumedBookNames()
                for assumedBookName in assumedBookNames:
                    self.BBBToNameDict[BBB] = assumedBookName
                    assumedBookNameLower = assumedBookName.lower()
                    self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces)
        if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) )
    self.doPostLoadProcessing()
def validateEntry( self, entry ):
    """
    Check/validate the given Strongs Greek lexicon entry and save what was found.

    Parameters:
        entry: the XML "entry" element. Its "strongs" attribute is used as the
            entry number in messages (in debug mode it is asserted to be
            exactly five digits).

    The results are saved in self.StrongsEntries, keyed by the text of the
    <strongs> child element, as a dict that may contain:
        'word': (greek, translit, beta) taken from the first <greek> element
        'pronunciation': taken from a <pronunciation> element that is seen
            before any second <greek> element
        'see': list of references such as 'G427' or 'H175' from <see> elements
        'Entry': the remaining entry text, with the cross-references
            rewritten as <StrongsRef>G123</StrongsRef> / <StrongsRef>H123</StrongsRef>

    Unexpected attributes and elements are logged rather than raised.
    The sanity asserts only run when BibleOrgSysGlobals.debugFlag is set.
    """
    if BibleOrgSysGlobals.debugFlag: assert entry.tag == "entry"

    BibleOrgSysGlobals.checkXMLNoText( entry, entry.tag, "na19" )
    BibleOrgSysGlobals.checkXMLNoTail( entry, entry.tag, "kaq9" )

    # Process the entry attributes first
    strongs5 = None
    for attrib,value in entry.items():
        if attrib == "strongs":
            strongs5 = value
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Validating {} entry…".format( strongs5 ) )
        else:
            logging.warning( "Unprocessed {!r} attribute ({}) in main entry element".format( attrib, value ) )
    if BibleOrgSysGlobals.debugFlag: assert len(strongs5)==5 and strongs5.isdigit()

    entryResults = {}
    entryString = ""
    gettingEssentials = True # Stays True until a second <greek> element is found
    for j, element in enumerate( entry ):
        if element.tag == "strongs":
            if BibleOrgSysGlobals.debugFlag: assert gettingEssentials and j==0 and element.text
            BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag, "md3d" )
            # Entry 02717 and entries 3203..3302 are exempt from the no-tail check
            #   (any tail text is collected into the entry text just below).
            # Note: the earlier form "3203 > n > 3302" could never be true,
            #   so this check was never run for any entry.
            if strongs5 != '02717' and not ( 3203 <= int(strongs5) <= 3302 ):
                BibleOrgSysGlobals.checkXMLNoTail( element, element.tag, "f3g7" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag, "m56g" )
            strongs = element.text # Used as the dictionary key when the entry is saved
            if BibleOrgSysGlobals.debugFlag: assert strongs5.endswith( strongs )
            if element.tail and element.tail.strip():
                entryString += element.tail.strip()

        elif element.tag == "greek":
            location = "greek in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "jke0" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "df35" )
            # Process the attributes
            translit = greek = beta = None
            for attrib,value in element.items():
                if attrib == "translit": translit = value
                elif attrib == "unicode": greek = value
                elif attrib == "BETA": beta = value
                else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
            if BibleOrgSysGlobals.debugFlag: assert greek and translit and beta
            if 'word' not in entryResults: # This is the first/main entry
                if BibleOrgSysGlobals.debugFlag: assert gettingEssentials and j==1
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                entryResults['word'] = (greek, translit, beta)
            else: # A further greek word inside the entry text -- keep it as flattened XML
                if BibleOrgSysGlobals.debugFlag: assert j > 2
                gettingEssentials = False
                entryString += ' ' + BibleOrgSysGlobals.getFlattenedXML( element, strongs5 )

        elif element.tag == "pronunciation":
            location = "pronunciation in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
            # Process the attributes
            pronunciation = None
            for attrib,value in element.items():
                if attrib == "strongs": pronunciation = value
                else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
            if gettingEssentials:
                if BibleOrgSysGlobals.debugFlag:
                    assert j == 2
                    assert pronunciation
                    assert 'pronunciation' not in entryResults
                entryResults['pronunciation'] = pronunciation
            else:
                if BibleOrgSysGlobals.debugFlag: assert j>2 and not gettingEssentials
            # Keep any text that trails the pronunciation element as part of the entry text
            if element.tail and element.tail.strip():
                entryString += element.tail.strip().replace( '\n', '' )

        elif element.tag == "strongs_derivation":
            location = "strongs_derivation in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
            derivation = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            if BibleOrgSysGlobals.debugFlag:
                assert derivation and '\t' not in derivation and '\n' not in derivation
            entryString += derivation

        elif element.tag == "strongs_def":
            location = "strongs_def in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, "jd28" )
            definition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            if BibleOrgSysGlobals.debugFlag:
                assert definition and '\t' not in definition and '\n' not in definition
            entryString += definition

        elif element.tag == "kjv_def":
            location = "kjv_def in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
            # No tail or subelement checks here (they are disabled for this element)
            KJVdefinition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            if BibleOrgSysGlobals.debugFlag:
                assert KJVdefinition and '\t' not in KJVdefinition and '\n' not in KJVdefinition
            entryString += KJVdefinition

        elif element.tag == "strongsref":
            location = "strongsref in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "kls2" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "ks24" )
            strongsRef = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
            if BibleOrgSysGlobals.debugFlag:
                assert strongsRef and '\t' not in strongsRef and '\n' not in strongsRef
            # Raw strings so that \d reaches the regex engine as written
            #   (only the GREEK forms are rewritten here; the attributes may come in either order)
            strongsRef = re.sub( r'<language="GREEK" strongs="(\d{1,5})">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
            strongsRef = re.sub( r'<strongs="(\d{1,5})" language="GREEK">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
            entryString += ' ' + strongsRef

        elif element.tag == "see":
            location = "see in Strongs " + strongs5
            BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, "kd02" )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
            # Process the attributes
            seeLanguage = seeStrongsNumber = None
            for attrib,value in element.items():
                if attrib == "language": seeLanguage = value
                elif attrib == "strongs": seeStrongsNumber = value # Note: No leading zeroes here
                else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
            if BibleOrgSysGlobals.debugFlag:
                assert seeLanguage and seeStrongsNumber and seeStrongsNumber.isdigit()
                assert seeLanguage in ('GREEK','HEBREW',)
            if 'see' not in entryResults: entryResults['see'] = []
            # Anything other than GREEK is saved with an 'H' prefix
            entryResults['see'].append( ('G' if seeLanguage=='GREEK' else 'H') + seeStrongsNumber )

        else:
            logging.error( "2d4f Unprocessed {!r} element ({}) in entry".format( element.tag, element.text ) )

    if entryString:
        if BibleOrgSysGlobals.debugFlag: assert '\t' not in entryString and '\n' not in entryString
        # Rewrite the flattened cross-references (attributes may come in either order)
        entryString = re.sub( r'<strongsref language="GREEK" strongs="(\d{1,5})"></strongsref>', r'<StrongsRef>G\1</StrongsRef>', entryString )
        entryString = re.sub( r'<strongsref strongs="(\d{1,5})" language="GREEK"></strongsref>', r'<StrongsRef>G\1</StrongsRef>', entryString )
        entryString = re.sub( r'<strongsref language="HEBREW" strongs="(\d{1,5})"></strongsref>', r'<StrongsRef>H\1</StrongsRef>', entryString )
        entryString = re.sub( r'<strongsref strongs="(\d{1,5})" language="HEBREW"></strongsref>', r'<StrongsRef>H\1</StrongsRef>', entryString )
        if BibleOrgSysGlobals.debugFlag: assert 'strongsref' not in entryString
        entryResults['Entry'] = entryString

    # NOTE(review): 'strongs' is only set by a <strongs> child element -- an entry
    #   without one will fail here with an unbound local name.
    self.StrongsEntries[strongs] = entryResults
def loadBook( self, bookElement ):
    """
    Load the book container from the XML data file.

    Parameters:
        bookElement: the XML 'book' element. Its 'id' attribute is passed to
            getBBBFromUSFM() to get the BBB book code.

    A new BibleBook is created in self.thisBook, each child element of
    bookElement is converted to one or more lines added to that book,
    and the book is then passed to self.saveBook().

    C and V track the current chapter and verse (as strings, starting at '0')
    and are only used here for location/log messages and for the helper calls.
    Unexpected attributes and elements are logged rather than raised.
    """
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) )
    assert( bookElement.tag == 'book' )
    mainLocation = self.name + " USFX book"

    # Process the attributes first
    bookCode = None
    for attrib,value in bookElement.items():
        if attrib == 'id':
            bookCode = value
        else:
            logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) )
    BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( bookCode )
    mainLocation = "{} USFX {} book".format( self.name, BBB )
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) )
    # NOTE(review): these two checks look at self.tree, not at bookElement -- confirm that is intended
    BibleOrgSysGlobals.checkXMLNoText( self.tree, mainLocation, '4f6h' )
    BibleOrgSysGlobals.checkXMLNoTail( self.tree, mainLocation, '1wk8' )

    # Now create our actual book
    self.thisBook = BibleBook( self, BBB )
    self.thisBook.objectNameString = "USFX XML Bible Book object"
    self.thisBook.objectTypeString = "USFX"

    C = V = '0'
    for element in bookElement:
        # The location is made BEFORE this element is processed,
        #   so it shows the chapter/verse reached by the previous elements
        location = "{} of {} {} {}:{}".format( element.tag, mainLocation, BBB, C, V )

        if element.tag == 'id':
            idText = clean( element.text )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'vsg3' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ksq2' )
            for attrib,value in element.items():
                if attrib == 'id':
                    assert( value == bookCode )
                else:
                    logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            self.thisBook.addLine( 'id', bookCode + ((' '+idText) if idText else '') )

        elif element.tag == 'ide':
            ideText = clean( element.text )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'jsa0' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ls01' )
            charset = None
            for attrib,value in element.items():
                if attrib == 'charset':
                    charset = value
                else:
                    logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            # Assumes the charset attribute is present (None + str fails here otherwise)
            self.thisBook.addLine( 'ide', charset + ((' '+ideText) if ideText else '') )

        elif element.tag == 'h':
            hText = element.text
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'dj35' )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'hs35' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'hs32' )
            self.thisBook.addLine( 'h', clean(hText) )

        elif element.tag == 'toc':
            tocText = element.text
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ss13' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js13' )
            level = None
            for attrib,value in element.items():
                if attrib == 'level': # Seems compulsory
                    level = value
                else:
                    logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            # Assumes the level attribute is present (str + None fails here otherwise)
            self.thisBook.addLine( 'toc'+level, clean(tocText) )

        elif element.tag == 'c':
            BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' )
            # This is a milestone -- a new chapter number also resets the verse number
            for attrib,value in element.items():
                if attrib == 'id':
                    C, V = value, '0'
                else:
                    logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            self.thisBook.addLine( 'c', C )

        elif element.tag == 's':
            sText = clean( element.text )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'wxg0' )
            level = None
            for attrib,value in element.items():
                if attrib == 'level': # Seems optional
                    level = value
                else:
                    logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            marker = 's'
            if level: marker += level
            self.thisBook.addLine( marker, sText )
            # Section headings can contain notes, figures, tables and character formatting
            for subelement in element:
                sublocation = subelement.tag + " of " + location
                if subelement.tag == 'f':
                    self.loadFootnote( subelement, sublocation, BBB, C, V )
                elif subelement.tag == 'x':
                    self.loadCrossreference( subelement, sublocation )
                elif subelement.tag == 'fig':
                    self.loadFigure( subelement, sublocation )
                elif subelement.tag == 'table':
                    self.loadTable( subelement, sublocation )
                elif subelement.tag in ('add','it','bd','bdit','sc',):
                    self.loadCharacterFormatting( subelement, sublocation, BBB, C, V )
                elif subelement.tag == 'optionalLineBreak':
                    print( "What is loadBook optionalLineBreak?" )
                else:
                    logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) )

        elif element.tag in ('p','q','d',):
            # The paragraph loader returns the verse number that it finished at
            V = self.loadParagraph( element, location, BBB, C )

        elif element.tag == 'b':
            BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'nd04' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' )
            self.thisBook.addLine( 'b', '' )

        elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers
            marker, text = element.tag, clean(element.text)
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'od01' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'gd92' )
            idField = None
            for attrib,value in element.items():
                if attrib == 'id':
                    idField = value
                else:
                    logging.warning( _("dv35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if idField and text is None:
                text = idField # Use the id attribute when the element has no text of its own
            elif idField is not None:
                # Only warn when there really was an id attribute that is being dropped
                #   (there is nothing to report if the element had no id at all)
                logging.warning( _("dve4 Unprocessed idField ({}) in {}").format( idField, location ) )
            if text is None:
                logging.critical( "Why is {} empty at {}".format( marker, location ) )
            assert( text is not None )
            self.thisBook.addLine( marker, text )

        elif element.tag == 'table':
            self.loadTable( element, location )

        elif element.tag == 've': # What's this in Psalms: <c id="4" /><ve /><d>For the Chief Musician; on stringed instruments. A Psalm of David.</d>
            BibleOrgSysGlobals.checkXMLNoText( element, location, 'kds3' )
            BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ks29' )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'kj24' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js91' )
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring 've' field", BBB, C, V )

        else:
            logging.critical( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) )
            # 'halt' is not defined in this method -- presumably a deliberate way
            #   to stop here (NameError) when debugging this module
            if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt

    self.saveBook( self.thisBook )